···11+# html5rw - Pure OCaml HTML5 Parser
22+33+A pure OCaml HTML5 parser implementing the WHATWG HTML5 parsing specification. This library passes the html5lib-tests suite and provides full support for tokenization, tree construction, encoding detection, and CSS selector queries.
44+55+## Key Features
66+77+- **WHATWG Compliant**: Implements the full HTML5 parsing algorithm with proper error recovery
88+- **CSS Selectors**: Query the DOM using standard CSS selector syntax
99+- **Streaming I/O**: Uses bytesrw for efficient streaming input/output
1010+- **Encoding Detection**: Automatic character encoding detection following the WHATWG algorithm
1111+- **Entity Decoding**: Complete HTML5 named character reference support
1212+1313+## Usage
1414+1515+```ocaml
1616+open Bytesrw
1717+1818+(* Parse HTML from a string *)
1919+let html = "<html><body><p>Hello, world!</p></body></html>"
2020+let reader = Bytes.Reader.of_string html
2121+let doc = Html5rw.parse reader
2222+2323+(* Query with CSS selectors *)
2424+let paragraphs = Html5rw.query doc "p"
2525+2626+(* Extract text content *)
2727+let text = Html5rw.to_text doc
2828+2929+(* Serialize back to HTML *)
3030+let output = Html5rw.to_string doc
3131+```
3232+3333+For fragment parsing (innerHTML):
3434+3535+```ocaml
3636+(* Parse as innerHTML of a <div> *)
3737+let ctx = Html5rw.make_fragment_context ~tag_name:"div" ()
3838+let reader = Bytes.Reader.of_string "<p>Fragment content</p>"
3939+let doc = Html5rw.parse ~fragment_context:ctx reader
4040+```
4141+4242+## Installation
4343+4444+```
4545+opam install html5rw
4646+```
4747+4848+## Documentation
4949+5050+API documentation is available via:
5151+5252+```
5353+opam install html5rw
5454+odig doc html5rw
5555+```
5656+5757+## License
5858+5959+MIT
+13-8
dune-project
···11-(lang dune 3.0)
11+(lang dune 3.20)
22+23(name html5rw)
33-(version 0.1.0)
4455(generate_opam_files true)
6677-(source (github username/html5rw))
87(license MIT)
99-(authors "Author")
1010-(maintainers "author@example.com")
88+(authors "Anil Madhavapeddy <anil@recoil.org>")
99+(homepage "https://tangled.org/@anil.recoil.org/ocaml-html5rw")
1010+(maintainers "Anil Madhavapeddy <anil@recoil.org>")
1111+(bug_reports "https://tangled.org/@anil.recoil.org/ocaml-html5rw/issues")
1212+(maintenance_intent "(latest)")
11131214(package
1315 (name html5rw)
1416 (synopsis "Pure OCaml HTML5 parser implementing the WHATWG specification")
1515- (description "A pure OCaml HTML5 parser that passes the html5lib-tests suite. Implements the WHATWG HTML5 parsing specification including tokenization, tree construction, encoding detection, and CSS selector queries.")
1717+ (description
1818+ "A pure OCaml HTML5 parser that passes the html5lib-tests suite. \
1919+ Implements the WHATWG HTML5 parsing specification including tokenization, \
2020+ tree construction, encoding detection, and CSS selector queries.")
1621 (depends
1717- (ocaml (>= 4.14.0))
2222+ (ocaml (>= 5.1.0))
1823 (bytesrw (>= 0.3.0))
1924 (uutf (>= 1.0.0))
2020- (re (>= 1.10.0))
2525+ (odoc :with-doc)
2126 (jsont (and :with-test (>= 0.2.0)))))
···11-(* html5rw.dom - HTML5 DOM types and operations *)
11+(*---------------------------------------------------------------------------
22+ Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
33+ SPDX-License-Identifier: MIT
44+ ---------------------------------------------------------------------------*)
55+66+(** HTML5 DOM Types and Operations
77+88+ This module provides DOM manipulation functions for HTML5 documents.
99+ It includes node creation, tree traversal, attribute manipulation,
1010+ and serialization.
1111+*)
212313include Node
414
+5
lib/dom/node.ml
···11+(*---------------------------------------------------------------------------
22+ Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
33+ SPDX-License-Identifier: MIT
44+ ---------------------------------------------------------------------------*)
55+16(* HTML5 DOM node types *)
2738type doctype_data = {
+5
lib/dom/node.mli
···11+(*---------------------------------------------------------------------------
22+ Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
33+ SPDX-License-Identifier: MIT
44+ ---------------------------------------------------------------------------*)
55+16(** HTML5 DOM Node Types and Operations
2738 This module provides the DOM node representation used by the HTML5 parser.
+5
lib/dom/serialize.ml
···11+(*---------------------------------------------------------------------------
22+ Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
33+ SPDX-License-Identifier: MIT
44+ ---------------------------------------------------------------------------*)
55+16(* HTML5 DOM serialization *)
2738open Bytesrw
+89-7
lib/encoding/html5rw_encoding.ml
···11-(* html5rw.encoding - HTML5 encoding detection and decoding *)
11+(*---------------------------------------------------------------------------
22+ Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
33+ SPDX-License-Identifier: MIT
44+ ---------------------------------------------------------------------------*)
55+66+(** HTML5 Encoding Detection and Decoding
77+88+ This module implements the WHATWG encoding sniffing and decoding
99+ algorithms for HTML5 documents. It handles automatic character
1010+ encoding detection from byte order marks (BOM), meta charset
1111+ declarations, and transport layer hints.
1212+1313+ {2 Encoding Detection Algorithm}
1414+1515+ The encoding detection follows the WHATWG specification:
1616+ 1. Check for a BOM (UTF-8, UTF-16LE, UTF-16BE)
1717+ 2. Prescan for [<meta charset>] or [<meta http-equiv="content-type">]
1818+ 3. Use transport layer encoding hint if provided
1919+ 4. Fall back to UTF-8 as the default
2020+2121+ @see <https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding>
2222+ WHATWG encoding sniffing algorithm
2323+*)
2242525+(** {1 Types} *)
2626+2727+(** Character encodings supported by the parser.
2828+2929+ The HTML5 specification requires support for a large number of
3030+ encodings, but this implementation focuses on the most common ones.
3131+ Other encodings are mapped to their closest equivalent.
3232+*)
333type encoding = Encoding.t =
44- | Utf8
55- | Utf16le
66- | Utf16be
77- | Windows_1252
88- | Iso_8859_2
99- | Euc_jp
3434+ | Utf8 (** UTF-8 encoding (default) *)
3535+ | Utf16le (** UTF-16 little-endian *)
3636+ | Utf16be (** UTF-16 big-endian *)
3737+ | Windows_1252 (** Windows-1252 (Latin-1 superset) *)
3838+ | Iso_8859_2 (** ISO-8859-2 (Central European) *)
3939+ | Euc_jp (** EUC-JP (Japanese) *)
4040+4141+(** {1 Encoding Utilities} *)
4242+4343+(** Convert an encoding to its canonical label string.
10444545+ Returns the WHATWG canonical name, e.g., ["utf-8"], ["utf-16le"].
4646+*)
1147let encoding_to_string = Encoding.to_string
12484949+(** Detect encoding from a byte order mark.
5050+5151+ Examines the first bytes of the input for a BOM and returns the
5252+ detected encoding with the number of bytes to skip.
5353+5454+ @return [(Some (encoding, skip_bytes))] if a BOM is found,
5555+ [None] otherwise.
5656+*)
1357let sniff_bom = Bom.sniff
14585959+(** Normalize an encoding label to its canonical form.
6060+6161+ Maps encoding labels (case-insensitive, with optional whitespace)
6262+ to the supported encoding types.
6363+6464+ @return [Some encoding] if the label is recognized, [None] otherwise.
6565+6666+ {[
6767+ normalize_label "UTF-8" (* Some Utf8 *)
6868+ normalize_label "utf8" (* Some Utf8 *)
6969+ normalize_label "latin1" (* Some Windows_1252 *)
7070+ ]}
7171+*)
1572let normalize_label = Labels.normalize_label
16737474+(** Prescan bytes to find a meta charset declaration.
7575+7676+ Implements the WHATWG prescan algorithm that looks for encoding
7777+ declarations in the first 1024 bytes of an HTML document.
7878+7979+ @return [Some encoding] if a meta charset is found, [None] otherwise.
8080+*)
1781let prescan_for_meta_charset = Prescan.prescan_for_meta_charset
18828383+(** {1 Decoding} *)
8484+8585+(** Decode raw bytes to a UTF-8 string with automatic encoding detection.
8686+8787+ This function implements the full encoding sniffing algorithm:
8888+ 1. Check for BOM
8989+ 2. Prescan for meta charset
9090+ 3. Use transport encoding hint if provided
9191+ 4. Fall back to UTF-8
9292+9393+ @param transport_encoding Encoding hint from HTTP Content-Type header
9494+ @return [(decoded_string, detected_encoding)]
9595+9696+ {[
9797+ let (html, enc) = decode raw_bytes ()
9898+ (* html is now a UTF-8 string, enc is the detected encoding *)
9999+ ]}
100100+*)
19101let decode = Decode.decode
+90-1
lib/entities/html5rw_entities.ml
···11-(* html5rw.entities - HTML5 entity decoding *)
11+(*---------------------------------------------------------------------------
22+ Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
33+ SPDX-License-Identifier: MIT
44+ ---------------------------------------------------------------------------*)
55+66+(** HTML5 Named Character Reference Decoding
77+88+ This module provides functions for decoding HTML5 named character
99+ references (entities) and numeric character references. It includes
1010+ the complete table of 2,231 named character references defined in
1111+ the WHATWG HTML5 specification.
1212+1313+ {2 Character Reference Types}
1414+1515+ HTML5 supports three types of character references:
1616+1717+ {3 Named References}
1818+ - Standard form: [&amp;], [&lt;], [&gt;], [&nbsp;]
1919+ - Some entities have multiple codepoint outputs: [&NotNestedLessLess;]
2020+2121+ {3 Decimal Numeric References}
2222+ - Form: [&#123;] (decimal codepoint)
2323+2424+ {3 Hexadecimal Numeric References}
2525+ - Form: [&#x7B;] or [&#X7B;] (hexadecimal codepoint)
2626+2727+ {2 Legacy Entity Handling}
2828+2929+ Some named entities are "legacy" - they were supported without a
3030+ trailing semicolon in older browsers (e.g., [&amp] instead of [&amp;]).
3131+ The parser handles these according to the WHATWG specification.
3232+3333+ @see <https://html.spec.whatwg.org/multipage/named-characters.html>
3434+ The complete list of named character references
3535+*)
3636+3737+(** {1 Decoding Functions} *)
3838+3939+(** Decode all character references in a text string.
4040+4141+ Processes the string and replaces all valid character references
4242+ (named and numeric) with their decoded UTF-8 equivalents.
4343+4444+ {[
4545+ decode "Hello &amp; goodbye"
4646+ (* Returns: "Hello & goodbye" *)
2474848+ decode "&lt;script&gt;"
4949+ (* Returns: "<script>" *)
5050+ ]}
5151+*)
352let decode = Decode.decode_entities_in_text
4535454+(** Decode a numeric character reference.
5555+5656+ @param codepoint The Unicode codepoint to decode
5757+ @return The UTF-8 string representation
5858+5959+ Note: Some codepoints are replaced according to the HTML5
6060+ specification (e.g., control characters in the 0x80-0x9F range
6161+ are mapped to Windows-1252 equivalents).
6262+*)
563let decode_numeric = Numeric_ref.decode
6646565+(** Look up a named character reference.
6666+6767+ @param name The entity name without [&] and [;] (e.g., ["amp"])
6868+ @return [Some codepoints] if the entity exists, [None] otherwise
6969+7070+ {[
7171+ lookup "amp" (* Some [0x26] *)
7272+ lookup "nbsp" (* Some [0xA0] *)
7373+ lookup "bogus" (* None *)
7474+ ]}
7575+*)
776let lookup = Entity_table.lookup
8777878+(** Check if an entity is a legacy entity.
7979+8080+ Legacy entities are those that were historically recognized without
8181+ a trailing semicolon. The parser handles these specially to maintain
8282+ browser compatibility.
8383+8484+ {[
8585+ is_legacy "amp" (* true - &amp works without ; *)
8686+ is_legacy "nbsp" (* true *)
8787+ is_legacy "hellip" (* false - requires semicolon *)
8888+ ]}
8989+*)
990let is_legacy = Entity_table.is_legacy
10919292+(** Convert a Unicode codepoint to its UTF-8 encoding.
9393+9494+ @param codepoint The Unicode codepoint (0 to 0x10FFFF)
9595+ @return The UTF-8 encoded string
9696+*)
1197let codepoint_to_utf8 = Numeric_ref.codepoint_to_utf8
12989999+(** {1 Sub-modules} *)
100100+101101+(** Numeric character reference handling. *)
13102module Numeric_ref = Numeric_ref
+5
lib/html5rw/html5rw.ml
···11+(*---------------------------------------------------------------------------
22+ Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
33+ SPDX-License-Identifier: MIT
44+ ---------------------------------------------------------------------------*)
55+16(** Html5rw - Pure OCaml HTML5 Parser
2738 This module provides a complete HTML5 parsing solution following the
+5
lib/html5rw/html5rw.mli
···11+(*---------------------------------------------------------------------------
22+ Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
33+ SPDX-License-Identifier: MIT
44+ ---------------------------------------------------------------------------*)
55+16(** Html5rw - Pure OCaml HTML5 Parser
2738 This module provides a complete HTML5 parsing solution following the
+5
lib/parser/html5rw_parser.ml
···11+(*---------------------------------------------------------------------------
22+ Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
33+ SPDX-License-Identifier: MIT
44+ ---------------------------------------------------------------------------*)
55+16(* html5rw.parser - HTML5 parser with bytesrw-only API *)
2738module Dom = Html5rw_dom
+6-1
lib/parser/html5rw_parser.mli
···11+(*---------------------------------------------------------------------------
22+ Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
33+ SPDX-License-Identifier: MIT
44+ ---------------------------------------------------------------------------*)
55+16(** HTML5 Parser
2738 This module provides the core HTML5 parsing functionality implementing
···172177173178 @raise Html5rw_selector.Selector_error if the selector is invalid
174179175175- @see {!Html5rw_selector} for supported selector syntax
180180+ See {!Html5rw_selector} for supported selector syntax.
176181*)
177182178183(** {1 Serialization} *)
+5
lib/parser/parser.ml
···11+(*---------------------------------------------------------------------------
22+ Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
33+ SPDX-License-Identifier: MIT
44+ ---------------------------------------------------------------------------*)
55+16(* Main parser entry point - bytesrw-only API *)
2738open Bytesrw
···11-(* html5rw.selector - CSS selector engine *)
11+(*---------------------------------------------------------------------------
22+ Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
33+ SPDX-License-Identifier: MIT
44+ ---------------------------------------------------------------------------*)
2566+(** CSS Selector Engine
77+88+ This module provides CSS selector parsing and matching for querying
99+ the HTML5 DOM. It supports a subset of CSS3 selectors suitable for
1010+ common web scraping and DOM manipulation tasks.
1111+1212+ {2 Supported Selectors}
1313+1414+ {3 Simple Selectors}
1515+ - Tag: [div], [p], [span]
1616+ - ID: [#myid]
1717+ - Class: [.myclass]
1818+ - Universal: [*]
1919+2020+ {3 Attribute Selectors}
2121+ - Presence: [[attr]]
2222+ - Exact match: [[attr="value"]]
2323+ - Contains word: [[attr~="value"]]
2424+ - Starts with: [[attr^="value"]]
2525+ - Ends with: [[attr$="value"]]
2626+ - Contains: [[attr*="value"]]
2727+ - Hyphen-separated: [[attr|="value"]]
2828+2929+ {3 Pseudo-classes}
3030+ - [:first-child], [:last-child]
3131+ - [:nth-child(n)], [:nth-last-child(n)]
3232+ - [:only-child]
3333+ - [:empty]
3434+ - [:not(selector)]
3535+3636+ {3 Combinators}
3737+ - Descendant: [div p] (p anywhere inside div)
3838+ - Child: [div > p] (p direct child of div)
3939+ - Adjacent sibling: [div + p] (p immediately after div)
4040+ - General sibling: [div ~ p] (p after div, same parent)
4141+4242+ {2 Usage}
4343+4444+ {[
4545+ let doc = Html5rw.parse reader in
4646+4747+ (* Find all paragraphs *)
4848+ let paragraphs = Html5rw.query doc "p" in
4949+5050+ (* Find links with specific class *)
5151+ let links = Html5rw.query doc "a.external" in
5252+5353+ (* Find table cells in rows *)
5454+ let cells = Html5rw.query doc "tr > td" in
5555+5656+ (* Check if a node matches *)
5757+ let is_active = Html5rw.matches node ".active"
5858+ ]}
5959+*)
6060+6161+(** {1 Exceptions} *)
6262+6363+(** Raised when a selector string is malformed.
6464+6565+ The exception contains an error message describing the parse error.
6666+*)
367exception Selector_error = Selector_lexer.Selector_error
4686969+(** {1 Sub-modules} *)
7070+7171+(** Abstract syntax tree for parsed selectors. *)
572module Ast = Selector_ast
7373+7474+(** Token types for the selector lexer. *)
675module Token = Selector_token
7767777+(** {1 Functions} *)
7878+7979+(** Parse a CSS selector string.
8080+8181+ @raise Selector_error if the selector is malformed.
8282+*)
883let parse = Selector_parser.parse_selector
9848585+(** Query the DOM tree with a CSS selector.
8686+8787+ Returns all nodes matching the selector in document order.
8888+8989+ @raise Selector_error if the selector is malformed.
9090+9191+ {[
9292+ let divs = query root_node "div.content > p"
9393+ ]}
9494+*)
1095let query = Selector_match.query
11969797+(** Check if a node matches a CSS selector.
9898+9999+ @raise Selector_error if the selector is malformed.
100100+101101+ {[
102102+ if matches node ".active" then
103103+ (* node has class "active" *)
104104+ ]}
105105+*)
12106let matches = Selector_match.matches
+15-1
lib/selector/selector_match.ml
···33module Dom = Html5rw_dom
44open Selector_ast
(* [string_contains ~haystack ~needle] is [true] iff [needle] occurs as a
   contiguous substring of [haystack]. The empty needle is contained in
   every haystack, including the empty one.

   Candidate positions are compared character by character in place, so no
   intermediate substring is allocated while scanning. *)
let string_contains ~haystack ~needle =
  let needle_len = String.length needle in
  (* Last index at which [needle] still fits inside [haystack]. It is
     negative when [needle] is the longer string, which makes [scan] stop
     immediately and return [false]. *)
  let last_start = String.length haystack - needle_len in
  (* [matches_at i j] checks that [needle] from offset [j] onwards agrees
     with [haystack] from offset [i + j] onwards. *)
  let rec matches_at i j =
    j = needle_len
    || (Char.equal haystack.[i + j] needle.[j] && matches_at i (j + 1))
  in
  (* Try each start position from left to right; both operators keep the
     recursive call in tail position. *)
  let rec scan i =
    i <= last_start && (matches_at i 0 || scan (i + 1))
  in
  scan 0
1919+620let is_element node =
721 let name = node.Dom.name in
822 name <> "#text" && name <> "#comment" && name <> "#document" &&
···177191 String.sub attr_value 0 (String.length value) = value
178192 | Some "$=" -> value <> "" && String.length attr_value >= String.length value &&
179193 String.sub attr_value (String.length attr_value - String.length value) (String.length value) = value
180180- | Some "*=" -> value <> "" && Re.execp (Re.compile (Re.str value)) attr_value
194194+ | Some "*=" -> value <> "" && string_contains ~haystack:attr_value ~needle:value
181195 | Some _ | None -> false))
182196 | None -> false)
183197 | Type_pseudo ->