···11+MIT License
22+33+Copyright (c) 2025 Emil Stenström
44+Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>
55+66+Permission is hereby granted, free of charge, to any person obtaining a copy
77+of this software and associated documentation files (the "Software"), to deal
88+in the Software without restriction, including without limitation the rights
99+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
1010+copies of the Software, and to permit persons to whom the Software is
1111+furnished to do so, subject to the following conditions:
1212+1313+The above copyright notice and this permission notice shall be included in all
1414+copies or substantial portions of the Software.
1515+1616+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
1717+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
1818+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
1919+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
2020+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
2121+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
2222+SOFTWARE.
···11+(lang dune 3.0)
22+(name html5rw)
33+(version 0.1.0)
44+55+(generate_opam_files true)
66+77+(source (github username/html5rw))
88+(license MIT)
99+(authors "Author")
1010+(maintainers "author@example.com")
1111+1212+(package
1313+ (name html5rw)
1414+ (synopsis "Pure OCaml HTML5 parser implementing the WHATWG specification")
1515+ (description "A pure OCaml HTML5 parser that passes the html5lib-tests suite. Implements the WHATWG HTML5 parsing specification including tokenization, tree construction, encoding detection, and CSS selector queries.")
1616+ (depends
1717+ (ocaml (>= 4.14.0))
1818+ (bytesrw (>= 0.3.0))
1919+ (uutf (>= 1.0.0))
2020+ (re (>= 1.10.0))
2121+ (yojson (and :build (>= 2.0.0)))))
+32
examples/basic_parsing.ml
···11+open Bytesrw
22+33+(* Basic HTML parsing example *)
44+55+let html = {|
66+<!DOCTYPE html>
77+<html>
88+<head>
99+ <title>Hello World</title>
1010+</head>
1111+<body>
1212+ <h1>Welcome</h1>
1313+ <p>This is a <strong>simple</strong> example.</p>
1414+</body>
1515+</html>
1616+|}
1717+1818+let () =
1919+ (* Parse HTML string *)
2020+ let result = Html5rw.parse (Bytes.Reader.of_string html) in
2121+2222+ (* Access the root document node *)
2323+ let doc = Html5rw.root result in
2424+ Printf.printf "Root node: %s\n" doc.Html5rw.Dom.name;
2525+2626+ (* Convert back to HTML *)
2727+ let output = Html5rw.to_string result in
2828+ Printf.printf "\nParsed and serialized:\n%s\n" output;
2929+3030+ (* Extract plain text *)
3131+ let text = Html5rw.to_text result in
3232+ Printf.printf "\nText content: %s\n" text
+122
examples/css_selectors.ml
···11+open Bytesrw
22+33+(* CSS selector query example *)
44+55+let html = {|
66+<!DOCTYPE html>
77+<html>
88+<head><title>Products</title></head>
99+<body>
1010+ <div class="container">
1111+ <h1 id="title">Product List</h1>
1212+ <ul class="products">
1313+ <li class="product" data-id="1">
1414+ <span class="name">Widget A</span>
1515+ <span class="price">$10.00</span>
1616+ </li>
1717+ <li class="product" data-id="2">
1818+ <span class="name">Widget B</span>
1919+ <span class="price">$15.00</span>
2020+ </li>
2121+ <li class="product featured" data-id="3">
2222+ <span class="name">Widget C</span>
2323+ <span class="price">$20.00</span>
2424+ </li>
2525+ </ul>
2626+ </div>
2727+</body>
2828+</html>
2929+|}
3030+3131+let () =
3232+ let result = Html5rw.parse (Bytes.Reader.of_string html) in
3333+3434+ (* Find element by ID *)
3535+ Printf.printf "=== ID Selector (#title) ===\n";
3636+ let titles = Html5rw.query result "#title" in
3737+ List.iter (fun node ->
3838+ Printf.printf "Found: %s\n" (Html5rw.get_text_content node)
3939+ ) titles;
4040+4141+ (* Find elements by class *)
4242+ Printf.printf "\n=== Class Selector (.product) ===\n";
4343+ let products = Html5rw.query result ".product" in
4444+ Printf.printf "Found %d products\n" (List.length products);
4545+4646+ (* Find elements by tag *)
4747+ Printf.printf "\n=== Tag Selector (span) ===\n";
4848+ let spans = Html5rw.query result "span" in
4949+ Printf.printf "Found %d span elements\n" (List.length spans);
5050+5151+ (* Find with attribute presence *)
5252+ Printf.printf "\n=== Attribute Presence ([data-id]) ===\n";
5353+ let with_data_id = Html5rw.query result "[data-id]" in
5454+ List.iter (fun node ->
5555+ match Html5rw.get_attr node "data-id" with
5656+ | Some id -> Printf.printf "Found element with data-id=%s\n" id
5757+ | None -> ()
5858+ ) with_data_id;
5959+6060+ (* Find with attribute value *)
6161+ Printf.printf "\n=== Attribute Value ([data-id=\"3\"]) ===\n";
6262+ let featured = Html5rw.query result "[data-id=\"3\"]" in
6363+ List.iter (fun node ->
6464+ Printf.printf "Found: %s\n" (Html5rw.get_text_content node)
6565+ ) featured;
6666+6767+ (* Find with multiple classes *)
6868+ Printf.printf "\n=== Multiple Classes (.product.featured) ===\n";
6969+ let featured_products = Html5rw.query result ".featured" in
7070+ List.iter (fun node ->
7171+ Printf.printf "Featured: %s\n" (Html5rw.get_text_content node)
7272+ ) featured_products;
7373+7474+ (* Check if a node matches a selector *)
7575+ Printf.printf "\n=== Match Check (.featured) ===\n";
7676+ List.iter (fun node ->
7777+ if Html5rw.matches node ".featured" then
7878+ Printf.printf "This product is featured!\n"
7979+ ) products;
8080+8181+ (* Pseudo-class: first-child *)
8282+ Printf.printf "\n=== Pseudo-class (:first-child) ===\n";
8383+ let first = Html5rw.query result "li:first-child" in
8484+ List.iter (fun node ->
8585+ Printf.printf "First li: %s\n" (String.trim (Html5rw.get_text_content node))
8686+ ) first;
8787+8888+ (* Pseudo-class: last-child *)
8989+ Printf.printf "\n=== Pseudo-class (:last-child) ===\n";
9090+ let last = Html5rw.query result "li:last-child" in
9191+ List.iter (fun node ->
9292+ Printf.printf "Last li: %s\n" (String.trim (Html5rw.get_text_content node))
9393+ ) last;
9494+9595+ (* Universal selector *)
9696+ Printf.printf "\n=== Universal Selector (*) ===\n";
9797+ let all = Html5rw.query result "*" in
9898+ Printf.printf "Total elements: %d\n" (List.length all);
9999+100100+ (* Combining queries: find products then filter *)
101101+ Printf.printf "\n=== Combined: Products with price > $15 ===\n";
102102+ List.iter (fun product ->
103103+ (* Find price span within this product *)
104104+ let price_spans = List.filter (fun node ->
105105+ Html5rw.matches node ".price"
106106+ ) (Html5rw.descendants product) in
107107+ List.iter (fun price_span ->
108108+ let price_text = Html5rw.get_text_content price_span in
109109+ (* Parse price - remove $ and convert *)
110110+ let price_str = String.sub price_text 1 (String.length price_text - 1) in
111111+ let price = float_of_string price_str in
112112+ if price > 15.0 then begin
113113+ let name_spans = List.filter (fun node ->
114114+ Html5rw.matches node ".name"
115115+ ) (Html5rw.descendants product) in
116116+ match name_spans with
117117+ | name :: _ ->
118118+ Printf.printf " %s: %s\n" (Html5rw.get_text_content name) price_text
119119+ | [] -> ()
120120+ end
121121+ ) price_spans
122122+ ) products
+57
examples/dom_manipulation.ml
···11+open Bytesrw
22+33+(* DOM manipulation example *)
44+55+let html = {|
66+<!DOCTYPE html>
77+<html>
88+<head><title>DOM Example</title></head>
99+<body>
1010+ <div id="content">
1111+ <p>Original content</p>
1212+ </div>
1313+</body>
1414+</html>
1515+|}
1616+1717+let () =
1818+ let result = Html5rw.parse (Bytes.Reader.of_string html) in
1919+2020+ (* Find the content div *)
2121+ match Html5rw.query result "#content" with
2222+ | content_div :: _ ->
2323+ Printf.printf "Original:\n%s\n\n" (Html5rw.Dom.to_html content_div);
2424+2525+ (* Create and append a new element *)
2626+ let new_para = Html5rw.create_element "p" () in
2727+ let text_node = Html5rw.create_text "This paragraph was added programmatically!" in
2828+ Html5rw.append_child new_para text_node;
2929+ Html5rw.set_attr new_para "class" "dynamic";
3030+ Html5rw.append_child content_div new_para;
3131+3232+ Printf.printf "After adding element:\n%s\n\n" (Html5rw.Dom.to_html content_div);
3333+3434+ (* Create an element with attributes *)
3535+ let link = Html5rw.create_element "a"
3636+ ~attrs:[("href", "https://example.com"); ("target", "_blank")] () in
3737+ Html5rw.append_child link (Html5rw.create_text "Click here");
3838+ Html5rw.append_child content_div link;
3939+4040+ Printf.printf "After adding link:\n%s\n\n" (Html5rw.Dom.to_html content_div);
4141+4242+ (* Check attributes *)
4343+ Printf.printf "Link has href: %b\n" (Html5rw.has_attr link "href");
4444+ Printf.printf "Link href value: %s\n"
4545+ (Option.value ~default:"(none)" (Html5rw.get_attr link "href"));
4646+4747+ (* Clone a node *)
4848+ let cloned = Html5rw.clone ~deep:true content_div in
4949+ Printf.printf "\nCloned node children: %d\n"
5050+ (List.length cloned.Html5rw.Dom.children);
5151+5252+ (* Get descendants *)
5353+ let all_descendants = Html5rw.descendants content_div in
5454+ Printf.printf "Total descendants: %d\n" (List.length all_descendants)
5555+5656+ | [] ->
5757+ Printf.printf "Content div not found\n"
···11+open Bytesrw
22+33+(* Encoding detection example *)
44+55+let () =
66+ Printf.printf "=== Encoding Detection ===\n\n";
77+88+ (* Parse UTF-8 bytes with BOM *)
99+ let utf8_bom = Bytes.of_string "\xEF\xBB\xBF<html><body>UTF-8 with BOM</body></html>" in
1010+ let result = Html5rw.parse_bytes utf8_bom in
1111+ (match Html5rw.encoding result with
1212+ | Some enc -> Printf.printf "Detected encoding: %s\n" (Html5rw.Encoding.encoding_to_string enc)
1313+ | None -> Printf.printf "No encoding detected\n");
1414+ Printf.printf "Text: %s\n\n" (Html5rw.to_text result);
1515+1616+ (* Parse with meta charset *)
1717+ let meta_charset = Bytes.of_string {|
1818+ <html>
1919+ <head><meta charset="utf-8"></head>
2020+ <body>Encoding from meta tag</body>
2121+ </html>
2222+ |} in
2323+ let result2 = Html5rw.parse_bytes meta_charset in
2424+ (match Html5rw.encoding result2 with
2525+ | Some enc -> Printf.printf "Detected encoding: %s\n" (Html5rw.Encoding.encoding_to_string enc)
2626+ | None -> Printf.printf "No encoding detected\n");
2727+ Printf.printf "Text: %s\n\n" (Html5rw.to_text result2);
2828+2929+ (* Using low-level encoding functions *)
3030+ Printf.printf "=== Low-level Encoding API ===\n\n";
3131+3232+ let bytes = Bytes.of_string "\xEF\xBB\xBFHello" in
3333+ (match Html5rw.Encoding.sniff_bom bytes with
3434+ | Some (enc, offset) ->
3535+ Printf.printf "BOM sniffing result: %s (skip %d bytes)\n"
3636+ (Html5rw.Encoding.encoding_to_string enc) offset
3737+ | None ->
3838+ Printf.printf "No BOM detected\n");
3939+4040+ let html_bytes = Bytes.of_string {|<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">|} in
4141+ (match Html5rw.Encoding.prescan_for_meta_charset html_bytes with
4242+ | Some enc -> Printf.printf "Prescan found: %s\n" (Html5rw.Encoding.encoding_to_string enc)
4343+ | None -> Printf.printf "No charset in prescan\n")
+52
examples/error_handling.ml
···11+open Bytesrw
22+33+(* Error handling and malformed HTML example *)
44+55+let malformed_html = {|
66+<html>
77+<head>
88+ <title>Unclosed title
99+ <meta charset="utf-8">
1010+</head>
1111+<body>
1212+ <div>
1313+ <p>Unclosed paragraph
1414+ <p>Another paragraph (implicitly closes the previous one)
1515+ <span><div>Misnested tags</span></div>
1616+ </div>
1717+ <table>
1818+ <tr><td>Cell 1<td>Cell 2</td>
1919+ </table>
2020+ <!-- Unclosed comment
2121+</body>
2222+</html>
2323+|}
2424+2525+let () =
2626+ Printf.printf "=== Parsing Malformed HTML ===\n\n";
2727+2828+ (* Parse with error collection enabled *)
2929+ let result = Html5rw.parse ~collect_errors:true (Bytes.Reader.of_string malformed_html) in
3030+3131+ (* Get parse errors *)
3232+ let errs = Html5rw.errors result in
3333+ Printf.printf "Parse errors: %d\n\n" (List.length errs);
3434+ List.iter (fun err ->
3535+ Printf.printf " Line %d, Col %d: %s\n"
3636+ (Html5rw.error_line err)
3737+ (Html5rw.error_column err)
3838+ (Html5rw.error_code err)
3939+ ) errs;
4040+4141+ (* The parser still produces a valid DOM tree *)
4242+ Printf.printf "\n=== Recovered DOM Tree ===\n";
4343+ let html = Html5rw.to_string ~pretty:true ~indent_size:2 result in
4444+ Printf.printf "%s\n" html;
4545+4646+ (* Query the recovered tree *)
4747+ Printf.printf "\n=== Query Results ===\n";
4848+ let paragraphs = Html5rw.query result "p" in
4949+ Printf.printf "Found %d paragraphs\n" (List.length paragraphs);
5050+5151+ let cells = Html5rw.query result "td" in
5252+ Printf.printf "Found %d table cells\n" (List.length cells)
+11
examples/fragment_parsing.ml
···11+open Bytesrw
22+33+(** Example: Parsing HTML fragments *)
44+55+let () =
66+ let fragment = "<li>Item 1</li><li>Item 2</li>" in
77+ let context = Html5rw.make_fragment_context ~tag_name:"ul" () in
88+ let reader = Bytes.Reader.of_string fragment in
99+ let result = Html5rw.parse ~fragment_context:context reader in
1010+1111+ Printf.printf "Fragment parsing result:\n%s\n" (Html5rw.to_string result)
+69
examples/text_extraction.ml
···11+open Bytesrw
22+33+(* Text extraction example *)
44+55+let html = {|
66+<!DOCTYPE html>
77+<html>
88+<head>
99+ <title>Article</title>
1010+ <style>body { font-family: sans-serif; }</style>
1111+ <script>console.log("Hello");</script>
1212+</head>
1313+<body>
1414+ <article>
1515+ <h1>The Great HTML5 Parser</h1>
1616+ <p class="intro">
1717+ This is the <em>introduction</em> to an article about
1818+ <strong>HTML parsing</strong> in OCaml.
1919+ </p>
2020+ <p class="content">
2121+ The parser follows the WHATWG specification and handles
2222+ all kinds of malformed HTML gracefully.
2323+ </p>
2424+ <ul>
2525+ <li>Feature 1: Fast parsing</li>
2626+ <li>Feature 2: CSS selectors</li>
2727+ <li>Feature 3: Encoding detection</li>
2828+ </ul>
2929+ </article>
3030+ <footer>
3131+ <p>Copyright 2024</p>
3232+ </footer>
3333+</body>
3434+</html>
3535+|}
3636+3737+let () =
3838+ let result = Html5rw.parse (Bytes.Reader.of_string html) in
3939+4040+ (* Extract all text *)
4141+ Printf.printf "=== All Text (default) ===\n";
4242+ let text = Html5rw.to_text result in
4343+ Printf.printf "%s\n\n" text;
4444+4545+ (* Extract text with custom separator *)
4646+ Printf.printf "=== Text with Newline Separator ===\n";
4747+ let text = Html5rw.to_text ~separator:"\n" result in
4848+ Printf.printf "%s\n\n" text;
4949+5050+ (* Extract text from specific element *)
5151+ Printf.printf "=== Article Text Only ===\n";
5252+ let articles = Html5rw.query result "article" in
5353+ List.iter (fun article ->
5454+ let text = Html5rw.get_text_content article in
5555+ Printf.printf "%s\n" text
5656+ ) articles;
5757+5858+ (* Extract structured data *)
5959+ Printf.printf "\n=== Structured Extraction ===\n";
6060+ let headings = Html5rw.query result "h1" in
6161+ List.iter (fun h ->
6262+ Printf.printf "Title: %s\n" (Html5rw.get_text_content h)
6363+ ) headings;
6464+6565+ let items = Html5rw.query result "li" in
6666+ Printf.printf "Features:\n";
6767+ List.iter (fun li ->
6868+ Printf.printf " - %s\n" (Html5rw.get_text_content li)
6969+ ) items
+170
examples/web_scraper.ml
···11+open Bytesrw
22+33+(* Practical web scraping example *)
44+55+let sample_page = {|
66+<!DOCTYPE html>
77+<html lang="en">
88+<head>
99+ <meta charset="UTF-8">
1010+ <title>Tech News - Latest Stories</title>
1111+</head>
1212+<body>
1313+ <header>
1414+ <nav>
1515+ <a href="/">Home</a>
1616+ <a href="/news">News</a>
1717+ <a href="/about">About</a>
1818+ </nav>
1919+ </header>
2020+2121+ <main>
2222+ <article class="story featured">
2323+ <h2><a href="/story/1">Revolutionary AI Breakthrough</a></h2>
2424+ <p class="summary">Scientists announce major advancement in machine learning...</p>
2525+ <span class="author">By Jane Smith</span>
2626+ <time datetime="2024-01-15">January 15, 2024</time>
2727+ </article>
2828+2929+ <article class="story">
3030+ <h2><a href="/story/2">New Programming Language Released</a></h2>
3131+ <p class="summary">The language promises 10x developer productivity...</p>
3232+ <span class="author">By John Doe</span>
3333+ <time datetime="2024-01-14">January 14, 2024</time>
3434+ </article>
3535+3636+ <article class="story">
3737+ <h2><a href="/story/3">Open Source Project Reaches Milestone</a></h2>
3838+ <p class="summary">Community celebrates 1 million downloads...</p>
3939+ <span class="author">By Alice Chen</span>
4040+ <time datetime="2024-01-13">January 13, 2024</time>
4141+ </article>
4242+ </main>
4343+4444+ <aside>
4545+ <h3>Popular Tags</h3>
4646+ <ul class="tags">
4747+ <li><a href="/tag/ai">AI</a></li>
4848+ <li><a href="/tag/programming">Programming</a></li>
4949+ <li><a href="/tag/opensource">Open Source</a></li>
5050+ </ul>
5151+ </aside>
5252+</body>
5353+</html>
5454+|}
5555+5656+type story = {
5757+ title: string;
5858+ url: string;
5959+ summary: string;
6060+ author: string;
6161+ date: string;
6262+ featured: bool;
6363+}
6464+6565+(* Helper to find first child element with given tag name *)
6666+let find_child_by_tag parent tag =
6767+ List.find_opt (fun n ->
6868+ Html5rw.is_element n && String.lowercase_ascii n.Html5rw.Dom.name = tag
6969+ ) parent.Html5rw.Dom.children
7070+7171+(* Helper to find first descendant element with given tag name *)
7272+let rec find_descendant_by_tag node tag =
7373+ let children = List.filter Html5rw.is_element node.Html5rw.Dom.children in
7474+ match List.find_opt (fun n -> String.lowercase_ascii n.Html5rw.Dom.name = tag) children with
7575+ | Some found -> Some found
7676+ | None ->
7777+ List.find_map (fun child -> find_descendant_by_tag child tag) children
7878+7979+(* Helper to find first descendant with given class *)
8080+let rec find_by_class node cls =
8181+ let children = List.filter Html5rw.is_element node.Html5rw.Dom.children in
8282+ let has_class n =
8383+ match Html5rw.get_attr n "class" with
8484+ | Some classes -> List.mem cls (String.split_on_char ' ' classes)
8585+ | None -> false
8686+ in
8787+ match List.find_opt has_class children with
8888+ | Some found -> Some found
8989+ | None ->
9090+ List.find_map (fun child -> find_by_class child cls) children
9191+9292+let extract_story article =
9393+ (* Find h2 > a for title and URL *)
9494+ let title, url =
9595+ match find_descendant_by_tag article "h2" with
9696+ | Some h2 ->
9797+ (match find_child_by_tag h2 "a" with
9898+ | Some a ->
9999+ (Html5rw.get_text_content a,
100100+ Option.value ~default:"#" (Html5rw.get_attr a "href"))
101101+ | None -> (Html5rw.get_text_content h2, "#"))
102102+ | None -> ("(no title)", "#")
103103+ in
104104+ let summary =
105105+ match find_by_class article "summary" with
106106+ | Some p -> Html5rw.get_text_content p
107107+ | None -> ""
108108+ in
109109+ let author =
110110+ match find_by_class article "author" with
111111+ | Some s -> Html5rw.get_text_content s
112112+ | None -> "Unknown"
113113+ in
114114+ let date =
115115+ match find_descendant_by_tag article "time" with
116116+ | Some t -> Option.value ~default:"" (Html5rw.get_attr t "datetime")
117117+ | None -> ""
118118+ in
119119+ let featured = Html5rw.matches article ".featured" in
120120+ { title; url; summary; author; date; featured }
121121+122122+let () =
123123+ Printf.printf "=== Web Scraping Example ===\n\n";
124124+125125+ let result = Html5rw.parse (Bytes.Reader.of_string sample_page) in
126126+127127+ (* Extract page title *)
128128+ let titles = Html5rw.query result "title" in
129129+ (match titles with
130130+ | t :: _ -> Printf.printf "Page title: %s\n\n" (Html5rw.get_text_content t)
131131+ | [] -> ());
132132+133133+ (* Extract navigation links using descendant query *)
134134+ Printf.printf "Navigation:\n";
135135+ let nav_links = Html5rw.query result "a" in
136136+ let nav = List.filter (fun a ->
137137+ (* Check if this link is in nav by looking at ancestors *)
138138+ List.exists (fun n -> n.Html5rw.Dom.name = "nav") (Html5rw.ancestors a)
139139+ ) nav_links in
140140+ List.iter (fun a ->
141141+ let text = Html5rw.get_text_content a in
142142+ let href = Option.value ~default:"#" (Html5rw.get_attr a "href") in
143143+ Printf.printf " %s -> %s\n" text href
144144+ ) nav;
145145+146146+ (* Extract stories *)
147147+ Printf.printf "\nStories:\n";
148148+ let articles = Html5rw.query result "article" in
149149+ List.iter (fun article ->
150150+ let story = extract_story article in
151151+ Printf.printf "\n %s%s\n"
152152+ (if story.featured then "[FEATURED] " else "")
153153+ story.title;
154154+ Printf.printf " URL: %s\n" story.url;
155155+ Printf.printf " Summary: %s\n" story.summary;
156156+ Printf.printf " %s | %s\n" story.author story.date
157157+ ) articles;
158158+159159+ (* Extract tags *)
160160+ Printf.printf "\nPopular Tags:\n";
161161+ let all_links = Html5rw.query result "a" in
162162+ let tag_links = List.filter (fun a ->
163163+ let href = Option.value ~default:"" (Html5rw.get_attr a "href") in
164164+ String.length href > 5 && String.sub href 0 5 = "/tag/"
165165+ ) all_links in
166166+ List.iter (fun a ->
167167+ let tag = Html5rw.get_text_content a in
168168+ let href = Option.value ~default:"#" (Html5rw.get_attr a "href") in
169169+ Printf.printf " #%s (%s)\n" tag href
170170+ ) tag_links
···11+(* HTML5 DOM node types *)
22+33+type doctype_data = {
44+ name : string option;
55+ public_id : string option;
66+ system_id : string option;
77+}
88+99+type quirks_mode = No_quirks | Quirks | Limited_quirks
1010+1111+type node = {
1212+ mutable name : string;
1313+ mutable namespace : string option; (* None = html, Some "svg", Some "mathml" *)
1414+ mutable attrs : (string * string) list;
1515+ mutable children : node list;
1616+ mutable parent : node option;
1717+ mutable data : string; (* For text, comment nodes *)
1818+ mutable template_content : node option; (* For <template> elements *)
1919+ mutable doctype : doctype_data option; (* For doctype nodes *)
2020+}
2121+2222+(* Node name constants *)
2323+let document_name = "#document"
2424+let document_fragment_name = "#document-fragment"
2525+let text_name = "#text"
2626+let comment_name = "#comment"
2727+let doctype_name = "!doctype"
2828+2929+(* Base node constructor - all nodes share this structure *)
3030+let make_node ~name ?(namespace=None) ?(attrs=[]) ?(data="") ?template_content ?doctype () = {
3131+ name;
3232+ namespace;
3333+ attrs;
3434+ children = [];
3535+ parent = None;
3636+ data;
3737+ template_content;
3838+ doctype;
3939+}
4040+4141+(* Constructors *)
4242+let create_element name ?(namespace=None) ?(attrs=[]) () =
4343+ make_node ~name ~namespace ~attrs ()
4444+4545+let create_text data =
4646+ make_node ~name:text_name ~data ()
4747+4848+let create_comment data =
4949+ make_node ~name:comment_name ~data ()
5050+5151+let create_document () =
5252+ make_node ~name:document_name ()
5353+5454+let create_document_fragment () =
5555+ make_node ~name:document_fragment_name ()
5656+5757+let create_doctype ?name ?public_id ?system_id () =
5858+ make_node ~name:doctype_name ~doctype:{ name; public_id; system_id } ()
5959+6060+let create_template ?(namespace=None) ?(attrs=[]) () =
6161+ let node = create_element "template" ~namespace ~attrs () in
6262+ node.template_content <- Some (create_document_fragment ());
6363+ node
6464+6565+(* Predicates *)
6666+let is_element node =
6767+ not (List.mem node.name [text_name; comment_name; document_name; document_fragment_name; doctype_name])
6868+6969+let is_text node = node.name = text_name
7070+let is_comment node = node.name = comment_name
7171+let is_document node = node.name = document_name
7272+let is_document_fragment node = node.name = document_fragment_name
7373+let is_doctype node = node.name = doctype_name
7474+let has_children node = node.children <> []
7575+7676+(* DOM manipulation *)
7777+let append_child parent child =
7878+ child.parent <- Some parent;
7979+ parent.children <- parent.children @ [child]
8080+8181+let insert_before parent new_child ref_child =
8282+ new_child.parent <- Some parent;
8383+ let rec insert acc = function
8484+ | [] -> List.rev acc @ [new_child]
8585+ | x :: xs when x == ref_child -> List.rev acc @ [new_child; x] @ xs
8686+ | x :: xs -> insert (x :: acc) xs
8787+ in
8888+ parent.children <- insert [] parent.children
8989+9090+let remove_child parent child =
9191+ child.parent <- None;
9292+ parent.children <- List.filter (fun c -> c != child) parent.children
9393+9494+(* Find the last text node before a reference point *)
9595+let last_child_text parent =
9696+ match List.rev parent.children with
9797+ | last :: _ when is_text last -> Some last
9898+ | _ -> None
9999+100100+let insert_text_at parent text before_node =
101101+ match before_node with
102102+ | None ->
103103+ (* Append - merge with last child if it's text *)
104104+ (match last_child_text parent with
105105+ | Some txt -> txt.data <- txt.data ^ text
106106+ | None -> append_child parent (create_text text))
107107+ | Some ref ->
108108+ (* Find last text node before ref_child *)
109109+ let rec find_prev_text = function
110110+ | [] | [_] -> None
111111+ | prev :: curr :: _ when curr == ref && is_text prev -> Some prev
112112+ | _ :: rest -> find_prev_text rest
113113+ in
114114+ match find_prev_text parent.children with
115115+ | Some txt -> txt.data <- txt.data ^ text
116116+ | None -> insert_before parent (create_text text) ref
117117+118118+(* Attribute helpers *)
119119+let get_attr node name = List.assoc_opt name node.attrs
120120+121121+let set_attr node name value =
122122+ node.attrs <- List.filter (fun (n, _) -> n <> name) node.attrs @ [(name, value)]
123123+124124+let has_attr node name = List.mem_assoc name node.attrs
125125+126126+(* Tree traversal *)
127127+let rec descendants node =
128128+ List.concat_map (fun n -> n :: descendants n) node.children
129129+130130+let ancestors node =
131131+ let rec collect acc n =
132132+ match n.parent with
133133+ | None -> List.rev acc
134134+ | Some p -> collect (p :: acc) p
135135+ in
136136+ collect [] node
137137+138138+let rec get_text_content node =
139139+ if is_text node then node.data
140140+ else String.concat "" (List.map get_text_content node.children)
141141+142142+(* Clone *)
143143+let rec clone ?(deep=false) node =
144144+ let new_node = make_node
145145+ ~name:node.name
146146+ ~namespace:node.namespace
147147+ ~attrs:node.attrs
148148+ ~data:node.data
149149+ ?doctype:node.doctype
150150+ ()
151151+ in
152152+ if deep then begin
153153+ new_node.children <- List.map (clone ~deep:true) node.children;
154154+ List.iter (fun c -> c.parent <- Some new_node) new_node.children;
155155+ Option.iter (fun tc ->
156156+ new_node.template_content <- Some (clone ~deep:true tc)
157157+ ) node.template_content
158158+ end;
159159+ new_node
+333
lib/dom/node.mli
···11+(** HTML5 DOM Node Types and Operations
22+33+ This module provides the DOM node representation used by the HTML5 parser.
44+ Nodes form a tree structure representing HTML documents. The type follows
55+ the WHATWG HTML5 specification for document structure.
66+77+ {2 Node Types}
88+99+ The HTML5 DOM includes several node types, all represented by the same
1010+ record type with different field usage:
1111+1212+ - {b Element nodes}: Regular HTML elements like [<div>], [<p>], [<span>]
1313+ - {b Text nodes}: Text content within elements
1414+ - {b Comment nodes}: HTML comments [<!-- comment -->]
1515+ - {b Document nodes}: The root node representing the entire document
1616+ - {b Document fragment nodes}: A lightweight container (used for templates)
1717+ - {b Doctype nodes}: The [<!DOCTYPE html>] declaration
1818+1919+ {2 Namespaces}
2020+2121+ Elements can belong to different namespaces:
2222+ - [None] or [Some "html"]: HTML namespace (default)
2323+ - [Some "svg"]: SVG namespace for embedded SVG content
2424+ - [Some "mathml"]: MathML namespace for mathematical notation
2525+2626+ The parser automatically switches namespaces when encountering [<svg>]
2727+ or [<math>] elements, as specified by the HTML5 algorithm.
2828+2929+ {2 Tree Structure}
3030+3131+ Nodes form a bidirectional tree: each node has a list of children and
3232+ an optional parent reference. Modification functions maintain these
3333+ references automatically.
3434+*)
3535+3636+(** {1 Types} *)
3737+3838+(** Information associated with a DOCTYPE node.
3939+4040+ In HTML5, the DOCTYPE is primarily used for quirks mode detection.
4141+ Most modern HTML5 documents use [<!DOCTYPE html>] which results in
4242+ all fields being [None] or the name being [Some "html"].
4343+4444+ @see <https://html.spec.whatwg.org/multipage/parsing.html#the-initial-insertion-mode>
4545+ The WHATWG specification for DOCTYPE handling
4646+*)
4747+type doctype_data = {
4848+ name : string option; (** The DOCTYPE name, e.g., "html" *)
4949+ public_id : string option; (** Public identifier (legacy, rarely used) *)
5050+ system_id : string option; (** System identifier (legacy, rarely used) *)
5151+}
5252+5353+(** Quirks mode setting for the document.
5454+5555+ Quirks mode affects CSS layout behavior for backwards compatibility with
5656+ old web content. The HTML5 parser determines quirks mode based on the
5757+ DOCTYPE declaration.
5858+5959+ - [No_quirks]: Standards mode - full HTML5/CSS3 behavior
6060+ - [Quirks]: Full quirks mode - emulates legacy browser behavior
6161+ - [Limited_quirks]: Almost standards mode - limited quirks for specific cases
6262+6363+ @see <https://quirks.spec.whatwg.org/> The Quirks Mode specification
6464+*)
6565+type quirks_mode = No_quirks | Quirks | Limited_quirks
6666+6767+(** A DOM node in the parsed document tree.
6868+6969+ All node types use the same record structure. The [name] field determines
7070+ the node type:
7171+ - Element: the tag name (e.g., "div", "p")
7272+ - Text: "#text"
7373+ - Comment: "#comment"
7474+ - Document: "#document"
7575+ - Document fragment: "#document-fragment"
7676+ - Doctype: "!doctype"
7777+7878+ {3 Field Usage by Node Type}
7979+8080+ {v
8181+ Node Type | name | namespace | attrs | data | template_content | doctype
8282+ ------------------|------------------|-----------|-------|------|------------------|--------
8383+ Element | tag name | Yes | Yes | No | If <template> | No
8484+ Text | "#text" | No | No | Yes | No | No
8585+ Comment | "#comment" | No | No | Yes | No | No
8686+ Document | "#document" | No | No | No | No | No
8787+ Document Fragment | "#document-frag" | No | No | No | No | No
8888+ Doctype | "!doctype" | No | No | No | No | Yes
8989+ v}
9090+*)
9191+type node = {
9292+ mutable name : string;
9393+ (** Tag name for elements, or special name for other node types *)
9494+9595+ mutable namespace : string option;
9696+ (** Element namespace: [None] for HTML, [Some "svg"], [Some "mathml"] *)
9797+9898+ mutable attrs : (string * string) list;
9999+ (** Element attributes as (name, value) pairs *)
100100+101101+ mutable children : node list;
102102+ (** Child nodes in document order *)
103103+104104+ mutable parent : node option;
105105+ (** Parent node, [None] for root nodes *)
106106+107107+ mutable data : string;
108108+ (** Text content for text and comment nodes *)
109109+110110+ mutable template_content : node option;
111111+ (** Document fragment for [<template>] element contents *)
112112+113113+ mutable doctype : doctype_data option;
114114+ (** DOCTYPE information for doctype nodes *)
115115+}
116116+117117+(** {1 Node Name Constants}
118118+119119+ These constants identify special node types. Compare with [node.name]
120120+ to determine the node type.
121121+*)
122122+123123+val document_name : string
124124+(** ["#document"] - name for document nodes *)
125125+126126+val document_fragment_name : string
127127+(** ["#document-fragment"] - name for document fragment nodes *)
128128+129129+val text_name : string
130130+(** ["#text"] - name for text nodes *)
131131+132132+val comment_name : string
133133+(** ["#comment"] - name for comment nodes *)
134134+135135+val doctype_name : string
136136+(** ["!doctype"] - name for doctype nodes *)
137137+138138+(** {1 Constructors}
139139+140140+ Functions to create new DOM nodes. All nodes start with no parent and
141141+ no children.
142142+*)
143143+144144+val create_element : string -> ?namespace:string option ->
145145+ ?attrs:(string * string) list -> unit -> node
146146+(** Create an element node.
147147+148148+ @param name The tag name (e.g., "div", "p", "span")
149149+ @param namespace Element namespace: [None] for HTML, [Some "svg"], [Some "mathml"]
150150+ @param attrs Initial attributes as (name, value) pairs
151151+152152+ {[
153153+ let div = create_element "div" ()
154154+ let svg = create_element "rect" ~namespace:(Some "svg") ()
155155+ let link = create_element "a" ~attrs:[("href", "/")] ()
156156+ ]}
157157+*)
158158+159159+val create_text : string -> node
160160+(** Create a text node with the given content.
161161+162162+ {[
163163+ let text = create_text "Hello, world!"
164164+ ]}
165165+*)
166166+167167+val create_comment : string -> node
168168+(** Create a comment node with the given content.
169169+170170+ The content should not include the comment delimiters.
171171+172172+ {[
173173+ let comment = create_comment " This is a comment "
174174+ (* Represents: <!-- This is a comment --> *)
175175+ ]}
176176+*)
177177+178178+val create_document : unit -> node
179179+(** Create an empty document node.
180180+181181+ Document nodes are the root of a complete HTML document tree.
182182+*)
183183+184184+val create_document_fragment : unit -> node
185185+(** Create an empty document fragment.
186186+187187+ Document fragments are lightweight containers used for:
188188+ - Template contents
189189+ - Fragment parsing results
190190+ - Efficient batch DOM operations
191191+*)
192192+193193+val create_doctype : ?name:string -> ?public_id:string ->
194194+ ?system_id:string -> unit -> node
195195+(** Create a DOCTYPE node.
196196+197197+ For HTML5, use [create_doctype ~name:"html" ()] which produces
198198+ [<!DOCTYPE html>].
199199+200200+ @param name DOCTYPE name (usually "html")
201201+ @param public_id Public identifier (legacy)
202202+ @param system_id System identifier (legacy)
203203+*)
204204+205205+val create_template : ?namespace:string option ->
206206+ ?attrs:(string * string) list -> unit -> node
207207+(** Create a [<template>] element with its content document fragment.
208208+209209+ Template elements have special semantics: their children are not rendered
210210+ directly but stored in a separate document fragment accessible via
211211+ [template_content].
212212+213213+ @see <https://html.spec.whatwg.org/multipage/scripting.html#the-template-element>
214214+ The HTML5 template element specification
215215+*)
216216+217217+(** {1 Node Type Predicates}
218218+219219+ Functions to test what type of node you have.
220220+*)
221221+222222+val is_element : node -> bool
223223+(** [is_element node] returns [true] if the node is an element node.
224224+225225+ Elements are nodes with HTML tags like [<div>], [<p>], etc.
226226+*)
227227+228228+val is_text : node -> bool
229229+(** [is_text node] returns [true] if the node is a text node. *)
230230+231231+val is_comment : node -> bool
232232+(** [is_comment node] returns [true] if the node is a comment node. *)
233233+234234+val is_document : node -> bool
235235+(** [is_document node] returns [true] if the node is a document node. *)
236236+237237+val is_document_fragment : node -> bool
238238+(** [is_document_fragment node] returns [true] if the node is a document fragment. *)
239239+240240+val is_doctype : node -> bool
241241+(** [is_doctype node] returns [true] if the node is a DOCTYPE node. *)
242242+243243+val has_children : node -> bool
244244+(** [has_children node] returns [true] if the node has any children. *)
245245+246246+(** {1 Tree Manipulation}
247247+248248+ Functions to modify the DOM tree structure. These functions automatically
249249+ maintain parent/child references.
250250+*)
251251+252252+val append_child : node -> node -> unit
253253+(** [append_child parent child] adds [child] as the last child of [parent].
254254+255255+ The child's parent reference is updated to point to [parent].
256256+*)
257257+258258+val insert_before : node -> node -> node -> unit
259259+(** [insert_before parent new_child ref_child] inserts [new_child] before
260260+ [ref_child] in [parent]'s children.
261261+262262+ @raise Not_found if [ref_child] is not a child of [parent]
263263+*)
264264+265265+val remove_child : node -> node -> unit
266266+(** [remove_child parent child] removes [child] from [parent]'s children.
267267+268268+ The child's parent reference is set to [None].
269269+*)
270270+271271+val insert_text_at : node -> string -> node option -> unit
272272+(** [insert_text_at parent text before_node] inserts text content.
273273+274274+ If [before_node] is [None], appends at the end. If the previous sibling
275275+ is a text node, the text is merged into it. Otherwise, a new text node
276276+ is created.
277277+278278+ This implements the HTML5 parser's text insertion algorithm which
279279+ coalesces adjacent text nodes.
280280+*)
281281+282282+(** {1 Attribute Operations}
283283+284284+ Functions to read and modify element attributes.
285285+*)
286286+287287+val get_attr : node -> string -> string option
288288+(** [get_attr node name] returns the value of attribute [name], or [None]. *)
289289+290290+val set_attr : node -> string -> string -> unit
291291+(** [set_attr node name value] sets attribute [name] to [value].
292292+293293+ If the attribute already exists, it is replaced.
294294+*)
295295+296296+val has_attr : node -> string -> bool
297297+(** [has_attr node name] returns [true] if the node has attribute [name]. *)
298298+299299+(** {1 Tree Traversal}
300300+301301+ Functions to navigate the DOM tree.
302302+*)
303303+304304+val descendants : node -> node list
305305+(** [descendants node] returns all descendant nodes in document order.
306306+307307+ This performs a depth-first traversal, returning children before
308308+ siblings at each level.
309309+*)
310310+311311+val ancestors : node -> node list
312312+(** [ancestors node] returns all ancestor nodes from parent to root.
313313+314314+ The first element is the immediate parent, the last is the root.
315315+*)
316316+317317+val get_text_content : node -> string
318318+(** [get_text_content node] returns the concatenated text content.
319319+320320+ For text nodes, returns the text data. For elements, recursively
321321+ concatenates all descendant text content.
322322+*)
323323+324324+(** {1 Cloning} *)
325325+326326+val clone : ?deep:bool -> node -> node
327327+(** [clone ?deep node] creates a copy of the node.
328328+329329+ @param deep If [true], recursively clone all descendants (default: [false])
330330+331331+ The cloned node has no parent. Attribute lists are copied by reference
332332+ (the list itself is new, but attribute strings are shared).
333333+*)
+301
lib/dom/serialize.ml
···11+(* HTML5 DOM serialization *)
22+33+open Bytesrw
44+open Node
55+66+(* Void elements that don't have end tags *)
77+let void_elements = [
88+ "area"; "base"; "br"; "col"; "embed"; "hr"; "img"; "input";
99+ "link"; "meta"; "source"; "track"; "wbr"
1010+]
1111+1212+let is_void name = List.mem name void_elements
1313+1414+(* Foreign attribute adjustments for test output *)
1515+let foreign_attr_adjustments = [
1616+ "xlink:actuate"; "xlink:arcrole"; "xlink:href"; "xlink:role";
1717+ "xlink:show"; "xlink:title"; "xlink:type"; "xml:lang"; "xml:space";
1818+ "xmlns:xlink"
1919+]
2020+2121+(* Escape text content *)
2222+let escape_text text =
2323+ let buf = Buffer.create (String.length text) in
2424+ String.iter (fun c ->
2525+ match c with
2626+ | '&' -> Buffer.add_string buf "&"
2727+ | '<' -> Buffer.add_string buf "<"
2828+ | '>' -> Buffer.add_string buf ">"
2929+ | c -> Buffer.add_char buf c
3030+ ) text;
3131+ Buffer.contents buf
3232+3333+(* Choose quote character for attribute value *)
3434+let choose_attr_quote value =
3535+ if String.contains value '"' && not (String.contains value '\'') then '\''
3636+ else '"'
3737+3838+(* Escape attribute value *)
3939+let escape_attr_value value quote_char =
4040+ let buf = Buffer.create (String.length value) in
4141+ String.iter (fun c ->
4242+ match c with
4343+ | '&' -> Buffer.add_string buf "&"
4444+ | '"' when quote_char = '"' -> Buffer.add_string buf """
4545+ | '\'' when quote_char = '\'' -> Buffer.add_string buf "'"
4646+ | c -> Buffer.add_char buf c
4747+ ) value;
4848+ Buffer.contents buf
4949+5050+(* Check if attribute value can be unquoted *)
5151+let can_unquote_attr_value value =
5252+ if String.length value = 0 then false
5353+ else
5454+ let invalid = ref false in
5555+ String.iter (fun c ->
5656+ if c = '>' || c = '"' || c = '\'' || c = '=' || c = '`' ||
5757+ c = ' ' || c = '\t' || c = '\n' || c = '\x0C' || c = '\r' then
5858+ invalid := true
5959+ ) value;
6060+ not !invalid
6161+6262+(* Serialize start tag *)
6363+let serialize_start_tag name attrs =
6464+ let buf = Buffer.create 64 in
6565+ Buffer.add_char buf '<';
6666+ Buffer.add_string buf name;
6767+ List.iter (fun (key, value) ->
6868+ Buffer.add_char buf ' ';
6969+ Buffer.add_string buf key;
7070+ if value <> "" then begin
7171+ if can_unquote_attr_value value then begin
7272+ Buffer.add_char buf '=';
7373+ Buffer.add_string buf (escape_attr_value value '"')
7474+ end else begin
7575+ let quote = choose_attr_quote value in
7676+ Buffer.add_char buf '=';
7777+ Buffer.add_char buf quote;
7878+ Buffer.add_string buf (escape_attr_value value quote);
7979+ Buffer.add_char buf quote
8080+ end
8181+ end
8282+ ) attrs;
8383+ Buffer.add_char buf '>';
8484+ Buffer.contents buf
8585+8686+(* Serialize end tag *)
8787+let serialize_end_tag name =
8888+ "</" ^ name ^ ">"
8989+9090+(* Convert node to HTML string *)
9191+let rec to_html ?(pretty=true) ?(indent_size=2) ?(indent=0) node =
9292+ let prefix = if pretty then String.make (indent * indent_size) ' ' else "" in
9393+ let newline = if pretty then "\n" else "" in
9494+9595+ match node.name with
9696+ | "#document" ->
9797+ let parts = List.map (to_html ~pretty ~indent_size ~indent:0) node.children in
9898+ String.concat newline (List.filter (fun s -> s <> "") parts)
9999+100100+ | "#document-fragment" ->
101101+ let parts = List.map (to_html ~pretty ~indent_size ~indent) node.children in
102102+ String.concat newline (List.filter (fun s -> s <> "") parts)
103103+104104+ | "#text" ->
105105+ let text = node.data in
106106+ if pretty then
107107+ let trimmed = String.trim text in
108108+ if trimmed = "" then ""
109109+ else prefix ^ escape_text trimmed
110110+ else escape_text text
111111+112112+ | "#comment" ->
113113+ prefix ^ "<!--" ^ node.data ^ "-->"
114114+115115+ | "!doctype" ->
116116+ prefix ^ "<!DOCTYPE html>"
117117+118118+ | name ->
119119+ let open_tag = serialize_start_tag name node.attrs in
120120+121121+ if is_void name then
122122+ prefix ^ open_tag
123123+ else if node.children = [] then
124124+ prefix ^ open_tag ^ serialize_end_tag name
125125+ else begin
126126+ (* Check if all children are text *)
127127+ let all_text = List.for_all is_text node.children in
128128+ if all_text && pretty then
129129+ let text = String.concat "" (List.map (fun c -> c.data) node.children) in
130130+ prefix ^ open_tag ^ escape_text text ^ serialize_end_tag name
131131+ else begin
132132+ let parts = [prefix ^ open_tag] in
133133+ let child_parts = List.filter_map (fun child ->
134134+ let html = to_html ~pretty ~indent_size ~indent:(indent + 1) child in
135135+ if html = "" then None else Some html
136136+ ) node.children in
137137+ let parts = parts @ child_parts @ [prefix ^ serialize_end_tag name] in
138138+ String.concat newline parts
139139+ end
140140+ end
141141+142142+(* Get qualified name for test format *)
143143+let qualified_name node =
144144+ match node.namespace with
145145+ | Some "svg" -> "svg " ^ node.name
146146+ | Some "mathml" -> "math " ^ node.name
147147+ | Some ns when ns <> "html" -> ns ^ " " ^ node.name
148148+ | _ -> node.name
149149+150150+(* Format attributes for test output *)
151151+let attrs_to_test_format node indent =
152152+ if node.attrs = [] then []
153153+ else begin
154154+ let padding = String.make (indent + 2) ' ' in
155155+ (* Compute display names first, then sort by display name for canonical output *)
156156+ let with_display_names = List.map (fun (name, value) ->
157157+ let display_name =
158158+ match node.namespace with
159159+ | Some ns when ns <> "html" && List.mem (String.lowercase_ascii name) foreign_attr_adjustments ->
160160+ String.map (fun c -> if c = ':' then ' ' else c) name
161161+ | _ -> name
162162+ in
163163+ (display_name, value)
164164+ ) node.attrs in
165165+ let sorted = List.sort (fun (a, _) (b, _) -> String.compare a b) with_display_names in
166166+ List.map (fun (display_name, value) ->
167167+ Printf.sprintf "| %s%s=\"%s\"" padding display_name value
168168+ ) sorted
169169+ end
170170+171171+(* Convert node to html5lib test format *)
172172+let rec to_test_format ?(indent=0) node =
173173+ match node.name with
174174+ | "#document" | "#document-fragment" ->
175175+ let parts = List.map (to_test_format ~indent:0) node.children in
176176+ String.concat "\n" parts
177177+178178+ | "#comment" ->
179179+ Printf.sprintf "| %s<!-- %s -->" (String.make indent ' ') node.data
180180+181181+ | "!doctype" ->
182182+ let dt = match node.doctype with Some d -> d | None -> { name = None; public_id = None; system_id = None } in
183183+ let name_str = match dt.name with Some n -> " " ^ n | None -> " " in
184184+ let ids_str =
185185+ match dt.public_id, dt.system_id with
186186+ | None, None -> ""
187187+ | pub, sys ->
188188+ let pub_str = match pub with Some p -> p | None -> "" in
189189+ let sys_str = match sys with Some s -> s | None -> "" in
190190+ Printf.sprintf " \"%s\" \"%s\"" pub_str sys_str
191191+ in
192192+ Printf.sprintf "| <!DOCTYPE%s%s>" name_str ids_str
193193+194194+ | "#text" ->
195195+ Printf.sprintf "| %s\"%s\"" (String.make indent ' ') node.data
196196+197197+ | "template" when node.namespace = None || node.namespace = Some "html" ->
198198+ let line = Printf.sprintf "| %s<%s>" (String.make indent ' ') (qualified_name node) in
199199+ let attr_lines = attrs_to_test_format node indent in
200200+ let content_line = Printf.sprintf "| %scontent" (String.make (indent + 2) ' ') in
201201+ let content_children =
202202+ match node.template_content with
203203+ | Some tc -> List.map (to_test_format ~indent:(indent + 4)) tc.children
204204+ | None -> []
205205+ in
206206+ String.concat "\n" ([line] @ attr_lines @ [content_line] @ content_children)
207207+208208+ | _ ->
209209+ let line = Printf.sprintf "| %s<%s>" (String.make indent ' ') (qualified_name node) in
210210+ let attr_lines = attrs_to_test_format node indent in
211211+ let child_lines = List.map (to_test_format ~indent:(indent + 2)) node.children in
212212+ String.concat "\n" ([line] @ attr_lines @ child_lines)
213213+214214+(* Extract text content *)
215215+let to_text ?(separator=" ") ?(strip=true) node =
216216+ let rec collect_text n =
217217+ if is_text n then [n.data]
218218+ else List.concat_map collect_text n.children
219219+ in
220220+ let texts = collect_text node in
221221+ let combined = String.concat separator texts in
222222+ if strip then String.trim combined else combined
223223+224224+(* Streaming serialization to a Bytes.Writer.t
225225+ Writes HTML directly to the writer without building intermediate strings *)
226226+let rec to_writer ?(pretty=true) ?(indent_size=2) ?(indent=0) (w : Bytes.Writer.t) node =
227227+ let write s = Bytes.Writer.write_string w s in
228228+ let write_prefix () = if pretty then write (String.make (indent * indent_size) ' ') in
229229+ let write_newline () = if pretty then write "\n" in
230230+231231+ match node.name with
232232+ | "#document" ->
233233+ let rec write_children first = function
234234+ | [] -> ()
235235+ | child :: rest ->
236236+ if not first && pretty then write_newline ();
237237+ to_writer ~pretty ~indent_size ~indent:0 w child;
238238+ write_children false rest
239239+ in
240240+ write_children true node.children
241241+242242+ | "#document-fragment" ->
243243+ let rec write_children first = function
244244+ | [] -> ()
245245+ | child :: rest ->
246246+ if not first && pretty then write_newline ();
247247+ to_writer ~pretty ~indent_size ~indent w child;
248248+ write_children false rest
249249+ in
250250+ write_children true node.children
251251+252252+ | "#text" ->
253253+ let text = node.data in
254254+ if pretty then begin
255255+ let trimmed = String.trim text in
256256+ if trimmed <> "" then begin
257257+ write_prefix ();
258258+ write (escape_text trimmed)
259259+ end
260260+ end else
261261+ write (escape_text text)
262262+263263+ | "#comment" ->
264264+ write_prefix ();
265265+ write "<!--";
266266+ write node.data;
267267+ write "-->"
268268+269269+ | "!doctype" ->
270270+ write_prefix ();
271271+ write "<!DOCTYPE html>"
272272+273273+ | name ->
274274+ write_prefix ();
275275+ write (serialize_start_tag name node.attrs);
276276+277277+ if not (is_void name) then begin
278278+ if node.children = [] then
279279+ write (serialize_end_tag name)
280280+ else begin
281281+ (* Check if all children are text *)
282282+ let all_text = List.for_all is_text node.children in
283283+ if all_text && pretty then begin
284284+ let text = String.concat "" (List.map (fun c -> c.data) node.children) in
285285+ write (escape_text text);
286286+ write (serialize_end_tag name)
287287+ end else begin
288288+ let rec write_children = function
289289+ | [] -> ()
290290+ | child :: rest ->
291291+ write_newline ();
292292+ to_writer ~pretty ~indent_size ~indent:(indent + 1) w child;
293293+ write_children rest
294294+ in
295295+ write_children node.children;
296296+ write_newline ();
297297+ write_prefix ();
298298+ write (serialize_end_tag name)
299299+ end
300300+ end
301301+ end
+19
lib/encoding/bom.ml
···11+(* BOM (Byte Order Mark) sniffing *)
22+33+let sniff data =
44+ let len = Bytes.length data in
55+ if len >= 3 &&
66+ Bytes.get data 0 = '\xEF' &&
77+ Bytes.get data 1 = '\xBB' &&
88+ Bytes.get data 2 = '\xBF' then
99+ Some (Encoding.Utf8, 3)
1010+ else if len >= 2 &&
1111+ Bytes.get data 0 = '\xFF' &&
1212+ Bytes.get data 1 = '\xFE' then
1313+ Some (Encoding.Utf16le, 2)
1414+ else if len >= 2 &&
1515+ Bytes.get data 0 = '\xFE' &&
1616+ Bytes.get data 1 = '\xFF' then
1717+ Some (Encoding.Utf16be, 2)
1818+ else
1919+ None
+190
lib/encoding/decode.ml
···11+(* HTML5 encoding detection and decoding *)
22+33+let decode_utf16 data ~is_le ~bom_len =
44+ let len = Bytes.length data in
55+ let buf = Buffer.create len in
66+ let i = ref bom_len in
77+88+ while !i + 1 < len do
99+ let b0 = Char.code (Bytes.get data !i) in
1010+ let b1 = Char.code (Bytes.get data (!i + 1)) in
1111+ let code_unit =
1212+ if is_le then b0 lor (b1 lsl 8)
1313+ else (b0 lsl 8) lor b1
1414+ in
1515+ i := !i + 2;
1616+1717+ (* Handle surrogate pairs *)
1818+ if code_unit >= 0xD800 && code_unit <= 0xDBFF && !i + 1 < len then begin
1919+ (* High surrogate, look for low surrogate *)
2020+ let b2 = Char.code (Bytes.get data !i) in
2121+ let b3 = Char.code (Bytes.get data (!i + 1)) in
2222+ let code_unit2 =
2323+ if is_le then b2 lor (b3 lsl 8)
2424+ else (b2 lsl 8) lor b3
2525+ in
2626+ if code_unit2 >= 0xDC00 && code_unit2 <= 0xDFFF then begin
2727+ i := !i + 2;
2828+ let high = code_unit - 0xD800 in
2929+ let low = code_unit2 - 0xDC00 in
3030+ let cp = 0x10000 + (high lsl 10) lor low in
3131+ Buffer.add_char buf (Char.chr (0xF0 lor (cp lsr 18)));
3232+ Buffer.add_char buf (Char.chr (0x80 lor ((cp lsr 12) land 0x3F)));
3333+ Buffer.add_char buf (Char.chr (0x80 lor ((cp lsr 6) land 0x3F)));
3434+ Buffer.add_char buf (Char.chr (0x80 lor (cp land 0x3F)))
3535+ end else begin
3636+ (* Invalid surrogate, output replacement *)
3737+ Buffer.add_string buf "\xEF\xBF\xBD"
3838+ end
3939+ end else if code_unit >= 0xD800 && code_unit <= 0xDFFF then begin
4040+ (* Lone surrogate *)
4141+ Buffer.add_string buf "\xEF\xBF\xBD"
4242+ end else if code_unit <= 0x7F then begin
4343+ Buffer.add_char buf (Char.chr code_unit)
4444+ end else if code_unit <= 0x7FF then begin
4545+ Buffer.add_char buf (Char.chr (0xC0 lor (code_unit lsr 6)));
4646+ Buffer.add_char buf (Char.chr (0x80 lor (code_unit land 0x3F)))
4747+ end else begin
4848+ Buffer.add_char buf (Char.chr (0xE0 lor (code_unit lsr 12)));
4949+ Buffer.add_char buf (Char.chr (0x80 lor ((code_unit lsr 6) land 0x3F)));
5050+ Buffer.add_char buf (Char.chr (0x80 lor (code_unit land 0x3F)))
5151+ end
5252+ done;
5353+5454+ (* Odd trailing byte *)
5555+ if !i < len then Buffer.add_string buf "\xEF\xBF\xBD";
5656+5757+ Buffer.contents buf
5858+5959+let decode_with_encoding data enc ~bom_len =
6060+ match enc with
6161+ | Encoding.Utf8 ->
6262+ (* UTF-8: Just validate and replace errors with replacement character *)
6363+ let len = Bytes.length data in
6464+ let buf = Buffer.create len in
6565+ let decoder = Uutf.decoder ~encoding:`UTF_8 (`String (Bytes.to_string data)) in
6666+ (* Skip BOM if present *)
6767+ let _ =
6868+ if bom_len > 0 then begin
6969+ for _ = 1 to bom_len do
7070+ ignore (Uutf.decode decoder)
7171+ done
7272+ end
7373+ in
7474+ let rec loop () =
7575+ match Uutf.decode decoder with
7676+ | `Uchar u -> Uutf.Buffer.add_utf_8 buf u; loop ()
7777+ | `Malformed _ -> Buffer.add_string buf "\xEF\xBF\xBD"; loop ()
7878+ | `End -> ()
7979+ | `Await -> assert false
8080+ in
8181+ loop ();
8282+ Buffer.contents buf
8383+8484+ | Encoding.Utf16le -> decode_utf16 data ~is_le:true ~bom_len
8585+ | Encoding.Utf16be -> decode_utf16 data ~is_le:false ~bom_len
8686+8787+ | Encoding.Windows_1252 ->
8888+ let len = Bytes.length data in
8989+ let buf = Buffer.create len in
9090+ let table = [|
9191+ (* 0x80-0x9F *)
9292+ 0x20AC; 0x0081; 0x201A; 0x0192; 0x201E; 0x2026; 0x2020; 0x2021;
9393+ 0x02C6; 0x2030; 0x0160; 0x2039; 0x0152; 0x008D; 0x017D; 0x008F;
9494+ 0x0090; 0x2018; 0x2019; 0x201C; 0x201D; 0x2022; 0x2013; 0x2014;
9595+ 0x02DC; 0x2122; 0x0161; 0x203A; 0x0153; 0x009D; 0x017E; 0x0178;
9696+ |] in
9797+ for i = bom_len to len - 1 do
9898+ let b = Char.code (Bytes.get data i) in
9999+ let cp =
100100+ if b >= 0x80 && b <= 0x9F then table.(b - 0x80)
101101+ else b
102102+ in
103103+ if cp <= 0x7F then
104104+ Buffer.add_char buf (Char.chr cp)
105105+ else if cp <= 0x7FF then begin
106106+ Buffer.add_char buf (Char.chr (0xC0 lor (cp lsr 6)));
107107+ Buffer.add_char buf (Char.chr (0x80 lor (cp land 0x3F)))
108108+ end else begin
109109+ Buffer.add_char buf (Char.chr (0xE0 lor (cp lsr 12)));
110110+ Buffer.add_char buf (Char.chr (0x80 lor ((cp lsr 6) land 0x3F)));
111111+ Buffer.add_char buf (Char.chr (0x80 lor (cp land 0x3F)))
112112+ end
113113+ done;
114114+ Buffer.contents buf
115115+116116+ | Encoding.Iso_8859_2 ->
117117+ let len = Bytes.length data in
118118+ let buf = Buffer.create len in
119119+ let table = [|
120120+ (* 0xA0-0xBF *)
121121+ 0x00A0; 0x0104; 0x02D8; 0x0141; 0x00A4; 0x013D; 0x015A; 0x00A7;
122122+ 0x00A8; 0x0160; 0x015E; 0x0164; 0x0179; 0x00AD; 0x017D; 0x017B;
123123+ 0x00B0; 0x0105; 0x02DB; 0x0142; 0x00B4; 0x013E; 0x015B; 0x02C7;
124124+ 0x00B8; 0x0161; 0x015F; 0x0165; 0x017A; 0x02DD; 0x017E; 0x017C;
125125+ (* 0xC0-0xFF *)
126126+ 0x0154; 0x00C1; 0x00C2; 0x0102; 0x00C4; 0x0139; 0x0106; 0x00C7;
127127+ 0x010C; 0x00C9; 0x0118; 0x00CB; 0x011A; 0x00CD; 0x00CE; 0x010E;
128128+ 0x0110; 0x0143; 0x0147; 0x00D3; 0x00D4; 0x0150; 0x00D6; 0x00D7;
129129+ 0x0158; 0x016E; 0x00DA; 0x0170; 0x00DC; 0x00DD; 0x0162; 0x00DF;
130130+ 0x0155; 0x00E1; 0x00E2; 0x0103; 0x00E4; 0x013A; 0x0107; 0x00E7;
131131+ 0x010D; 0x00E9; 0x0119; 0x00EB; 0x011B; 0x00ED; 0x00EE; 0x010F;
132132+ 0x0111; 0x0144; 0x0148; 0x00F3; 0x00F4; 0x0151; 0x00F6; 0x00F7;
133133+ 0x0159; 0x016F; 0x00FA; 0x0171; 0x00FC; 0x00FD; 0x0163; 0x02D9;
134134+ |] in
135135+ for i = bom_len to len - 1 do
136136+ let b = Char.code (Bytes.get data i) in
137137+ let cp =
138138+ if b >= 0xA0 then table.(b - 0xA0)
139139+ else b
140140+ in
141141+ if cp <= 0x7F then
142142+ Buffer.add_char buf (Char.chr cp)
143143+ else if cp <= 0x7FF then begin
144144+ Buffer.add_char buf (Char.chr (0xC0 lor (cp lsr 6)));
145145+ Buffer.add_char buf (Char.chr (0x80 lor (cp land 0x3F)))
146146+ end else begin
147147+ Buffer.add_char buf (Char.chr (0xE0 lor (cp lsr 12)));
148148+ Buffer.add_char buf (Char.chr (0x80 lor ((cp lsr 6) land 0x3F)));
149149+ Buffer.add_char buf (Char.chr (0x80 lor (cp land 0x3F)))
150150+ end
151151+ done;
152152+ Buffer.contents buf
153153+154154+ | Encoding.Euc_jp ->
155155+ (* For EUC-JP, use uutf with best effort *)
156156+ let len = Bytes.length data in
157157+ let buf = Buffer.create len in
158158+ let s = Bytes.sub_string data bom_len (len - bom_len) in
159159+ (* EUC-JP not directly supported by uutf, fall back to treating high bytes as replacement *)
160160+ (* This is a simplification - full EUC-JP would need a separate decoder *)
161161+ String.iter (fun c ->
162162+ if Char.code c <= 0x7F then
163163+ Buffer.add_char buf c
164164+ else
165165+ Buffer.add_string buf "\xEF\xBF\xBD"
166166+ ) s;
167167+ Buffer.contents buf
168168+169169+let decode data ?transport_encoding () =
170170+ (* Step 1: Check for BOM *)
171171+ let bom_result = Bom.sniff data in
172172+ match bom_result with
173173+ | Some (enc, bom_len) ->
174174+ (decode_with_encoding data enc ~bom_len, enc)
175175+ | None ->
176176+ (* Step 2: Check transport encoding (e.g., HTTP Content-Type) *)
177177+ let enc_from_transport =
178178+ match transport_encoding with
179179+ | Some te -> Labels.normalize_label te
180180+ | None -> None
181181+ in
182182+ match enc_from_transport with
183183+ | Some enc -> (decode_with_encoding data enc ~bom_len:0, enc)
184184+ | None ->
185185+ (* Step 3: Prescan for meta charset *)
186186+ match Prescan.prescan_for_meta_charset data with
187187+ | Some enc -> (decode_with_encoding data enc ~bom_len:0, enc)
188188+ | None ->
189189+ (* Default to UTF-8 *)
190190+ (decode_with_encoding data Encoding.Utf8 ~bom_len:0, Encoding.Utf8)
···11+(* Encoding label normalization per WHATWG Encoding Standard *)
22+33+let normalize_label label =
44+ if String.length label = 0 then None
55+ else
66+ let s = String.lowercase_ascii (String.trim label) in
77+ if String.length s = 0 then None
88+ else
99+ (* Security: never allow utf-7 *)
1010+ if s = "utf-7" || s = "utf7" || s = "x-utf-7" then
1111+ Some Encoding.Windows_1252
1212+ else if s = "utf-8" || s = "utf8" then
1313+ Some Encoding.Utf8
1414+ (* HTML treats latin-1 labels as windows-1252 *)
1515+ else if s = "iso-8859-1" || s = "iso8859-1" || s = "latin1" ||
1616+ s = "latin-1" || s = "l1" || s = "cp819" || s = "ibm819" then
1717+ Some Encoding.Windows_1252
1818+ else if s = "windows-1252" || s = "windows1252" || s = "cp1252" || s = "x-cp1252" then
1919+ Some Encoding.Windows_1252
2020+ else if s = "iso-8859-2" || s = "iso8859-2" || s = "latin2" || s = "latin-2" then
2121+ Some Encoding.Iso_8859_2
2222+ else if s = "euc-jp" || s = "eucjp" then
2323+ Some Encoding.Euc_jp
2424+ else if s = "utf-16" || s = "utf16" then
2525+ Some Encoding.Utf16le (* Default to LE for ambiguous utf-16 *)
2626+ else if s = "utf-16le" || s = "utf16le" then
2727+ Some Encoding.Utf16le
2828+ else if s = "utf-16be" || s = "utf16be" then
2929+ Some Encoding.Utf16be
3030+ else
3131+ None
3232+3333+let normalize_meta_declared label =
3434+ match normalize_label label with
3535+ | None -> None
3636+ | Some enc ->
3737+ (* Per HTML meta charset handling: ignore UTF-16/UTF-32 declarations and
3838+ treat them as UTF-8 *)
3939+ match enc with
4040+ | Encoding.Utf16le | Encoding.Utf16be -> Some Encoding.Utf8
4141+ | other -> Some other
+268
lib/encoding/prescan.ml
···11+(* HTML meta charset prescan per WHATWG spec *)
22+33+let ascii_whitespace = ['\x09'; '\x0A'; '\x0C'; '\x0D'; '\x20']
44+55+let is_ascii_whitespace c = List.mem c ascii_whitespace
66+77+let is_ascii_alpha c =
88+ (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
99+1010+let ascii_lower c =
1111+ if c >= 'A' && c <= 'Z' then Char.chr (Char.code c + 32)
1212+ else c
1313+1414+let skip_whitespace data i len =
1515+ let j = ref i in
1616+ while !j < len && is_ascii_whitespace (Bytes.get data !j) do
1717+ incr j
1818+ done;
1919+ !j
2020+2121+let strip_whitespace data start len =
2222+ let s = ref start in
2323+ let e = ref (start + len) in
2424+ while !s < !e && is_ascii_whitespace (Bytes.get data !s) do incr s done;
2525+ while !e > !s && is_ascii_whitespace (Bytes.get data (!e - 1)) do decr e done;
2626+ Bytes.sub_string data !s (!e - !s)
2727+2828+let extract_charset_from_content content =
2929+ let len = String.length content in
3030+ (* Find "charset" *)
3131+ let rec find_charset i =
3232+ if i + 7 > len then None
3333+ else
3434+ let sub = String.lowercase_ascii (String.sub content i 7) in
3535+ if sub = "charset" then
3636+ let j = ref (i + 7) in
3737+ (* Skip whitespace *)
3838+ while !j < len && is_ascii_whitespace content.[!j] do incr j done;
3939+ if !j >= len || content.[!j] <> '=' then find_charset (i + 1)
4040+ else begin
4141+ incr j;
4242+ (* Skip whitespace after = *)
4343+ while !j < len && is_ascii_whitespace content.[!j] do incr j done;
4444+ if !j >= len then None
4545+ else
4646+ let quote =
4747+ if content.[!j] = '"' || content.[!j] = '\'' then begin
4848+ let q = content.[!j] in
4949+ incr j;
5050+ Some q
5151+ end else None
5252+ in
5353+ let start = !j in
5454+ (match quote with
5555+ | Some q ->
5656+ while !j < len && content.[!j] <> q do incr j done;
5757+ if !j >= len then None
5858+ else Some (String.sub content start (!j - start))
5959+ | None ->
6060+ while !j < len &&
6161+ not (is_ascii_whitespace content.[!j]) &&
6262+ content.[!j] <> ';' do
6363+ incr j
6464+ done;
6565+ Some (String.sub content start (!j - start)))
6666+ end
6767+ else find_charset (i + 1)
6868+ in
6969+ find_charset 0
7070+7171+let prescan_for_meta_charset data =
7272+ let len = Bytes.length data in
7373+ let max_non_comment = 1024 in
7474+ let max_total = 65536 in
7575+ let i = ref 0 in
7676+ let non_comment = ref 0 in
7777+7878+ let result = ref None in
7979+8080+ while !result = None && !i < len && !i < max_total && !non_comment < max_non_comment do
8181+ if Bytes.get data !i <> '<' then begin
8282+ incr i;
8383+ incr non_comment
8484+ end else begin
8585+ (* Check for comment *)
8686+ if !i + 3 < len &&
8787+ Bytes.get data (!i + 1) = '!' &&
8888+ Bytes.get data (!i + 2) = '-' &&
8989+ Bytes.get data (!i + 3) = '-' then begin
9090+ (* Skip comment *)
9191+ let j = ref (!i + 4) in
9292+ while !j + 2 < len && not (
9393+ Bytes.get data !j = '-' &&
9494+ Bytes.get data (!j + 1) = '-' &&
9595+ Bytes.get data (!j + 2) = '>'
9696+ ) do incr j done;
9797+ if !j + 2 < len then
9898+ i := !j + 3
9999+ else
100100+ result := None (* Unclosed comment, stop scanning *)
101101+ end
102102+ (* Check for end tag - skip it *)
103103+ else if !i + 1 < len && Bytes.get data (!i + 1) = '/' then begin
104104+ let j = ref (!i + 2) in
105105+ let in_quote = ref None in
106106+ while !j < len && !j < max_total && !non_comment < max_non_comment do
107107+ let c = Bytes.get data !j in
108108+ match !in_quote with
109109+ | None ->
110110+ if c = '"' || c = '\'' then begin
111111+ in_quote := Some c;
112112+ incr j;
113113+ incr non_comment
114114+ end else if c = '>' then begin
115115+ incr j;
116116+ incr non_comment;
117117+ j := len (* Exit loop *)
118118+ end else begin
119119+ incr j;
120120+ incr non_comment
121121+ end
122122+ | Some q ->
123123+ if c = q then in_quote := None;
124124+ incr j;
125125+ incr non_comment
126126+ done;
127127+ i := !j
128128+ end
129129+ (* Check for tag *)
130130+ else if !i + 1 < len && is_ascii_alpha (Bytes.get data (!i + 1)) then begin
131131+ let j = ref (!i + 1) in
132132+ while !j < len && is_ascii_alpha (Bytes.get data !j) do incr j done;
133133+ let tag_name =
134134+ let name_bytes = Bytes.sub data (!i + 1) (!j - !i - 1) in
135135+ String.lowercase_ascii (Bytes.to_string name_bytes)
136136+ in
137137+138138+ if tag_name <> "meta" then begin
139139+ (* Skip non-meta tag *)
140140+ let in_quote = ref None in
141141+ while !j < len && !j < max_total && !non_comment < max_non_comment do
142142+ let c = Bytes.get data !j in
143143+ match !in_quote with
144144+ | None ->
145145+ if c = '"' || c = '\'' then begin
146146+ in_quote := Some c;
147147+ incr j;
148148+ incr non_comment
149149+ end else if c = '>' then begin
150150+ incr j;
151151+ incr non_comment;
152152+ j := len
153153+ end else begin
154154+ incr j;
155155+ incr non_comment
156156+ end
157157+ | Some q ->
158158+ if c = q then in_quote := None;
159159+ incr j;
160160+ incr non_comment
161161+ done;
162162+ i := !j
163163+ end else begin
164164+ (* Parse meta tag attributes *)
165165+ let charset = ref None in
166166+ let http_equiv = ref None in
167167+ let content = ref None in
168168+ let k = ref !j in
169169+ let saw_gt = ref false in
170170+171171+ while not !saw_gt && !k < len && !k < max_total do
172172+ let c = Bytes.get data !k in
173173+ if c = '>' then begin
174174+ saw_gt := true;
175175+ incr k
176176+ end else if c = '<' then begin
177177+ (* Restart scanning from here *)
178178+ k := len
179179+ end else if is_ascii_whitespace c || c = '/' then begin
180180+ incr k
181181+ end else begin
182182+ (* Attribute name *)
183183+ let attr_start = !k in
184184+ while !k < len &&
185185+ not (is_ascii_whitespace (Bytes.get data !k)) &&
186186+ Bytes.get data !k <> '=' &&
187187+ Bytes.get data !k <> '>' &&
188188+ Bytes.get data !k <> '/' &&
189189+ Bytes.get data !k <> '<' do
190190+ incr k
191191+ done;
192192+ let attr_name =
193193+ String.lowercase_ascii (Bytes.sub_string data attr_start (!k - attr_start))
194194+ in
195195+ k := skip_whitespace data !k len;
196196+197197+ let value = ref None in
198198+ if !k < len && Bytes.get data !k = '=' then begin
199199+ incr k;
200200+ k := skip_whitespace data !k len;
201201+ if !k < len then begin
202202+ let qc = Bytes.get data !k in
203203+ if qc = '"' || qc = '\'' then begin
204204+ incr k;
205205+ let val_start = !k in
206206+ while !k < len && Bytes.get data !k <> qc do incr k done;
207207+ if !k < len then begin
208208+ value := Some (Bytes.sub_string data val_start (!k - val_start));
209209+ incr k
210210+ end
211211+ end else begin
212212+ let val_start = !k in
213213+ while !k < len &&
214214+ not (is_ascii_whitespace (Bytes.get data !k)) &&
215215+ Bytes.get data !k <> '>' &&
216216+ Bytes.get data !k <> '<' do
217217+ incr k
218218+ done;
219219+ value := Some (Bytes.sub_string data val_start (!k - val_start))
220220+ end
221221+ end
222222+ end;
223223+224224+ if attr_name = "charset" then
225225+ charset := !value
226226+ else if attr_name = "http-equiv" then
227227+ http_equiv := !value
228228+ else if attr_name = "content" then
229229+ content := !value
230230+ end
231231+ done;
232232+233233+ if !saw_gt then begin
234234+ (* Check for charset *)
235235+ (match !charset with
236236+ | Some cs ->
237237+ (match Labels.normalize_meta_declared cs with
238238+ | Some enc -> result := Some enc
239239+ | None -> ())
240240+ | None -> ());
241241+242242+ (* Check for http-equiv="content-type" with content *)
243243+ if !result = None then
244244+ (match !http_equiv, !content with
245245+ | Some he, Some ct when String.lowercase_ascii he = "content-type" ->
246246+ (match extract_charset_from_content ct with
247247+ | Some extracted ->
248248+ (match Labels.normalize_meta_declared extracted with
249249+ | Some enc -> result := Some enc
250250+ | None -> ())
251251+ | None -> ())
252252+ | _ -> ());
253253+254254+ i := !k;
255255+ non_comment := !non_comment + (!k - !j)
256256+ end else begin
257257+ incr i;
258258+ incr non_comment
259259+ end
260260+ end
261261+ end else begin
262262+ incr i;
263263+ incr non_comment
264264+ end
265265+ end
266266+ done;
267267+268268+ !result
+192
lib/entities/decode.ml
···11+(* HTML5 entity decoding *)
22+33+let is_alpha c =
44+ (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
55+66+let is_alnum c =
77+ is_alpha c || (c >= '0' && c <= '9')
88+99+let is_hex_digit c =
1010+ (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F')
1111+1212+let is_digit c =
1313+ c >= '0' && c <= '9'
1414+1515+let decode_entities_in_text text ~in_attribute =
1616+ let len = String.length text in
1717+ let buf = Buffer.create len in
1818+ let i = ref 0 in
1919+2020+ while !i < len do
2121+ (* Find next ampersand *)
2222+ let amp_pos =
2323+ try Some (String.index_from text !i '&')
2424+ with Not_found -> None
2525+ in
2626+2727+ match amp_pos with
2828+ | None ->
2929+ (* No more ampersands, append rest *)
3030+ Buffer.add_substring buf text !i (len - !i);
3131+ i := len
3232+ | Some amp ->
3333+ (* Append text before ampersand *)
3434+ if amp > !i then
3535+ Buffer.add_substring buf text !i (amp - !i);
3636+3737+ i := amp;
3838+ let j = ref (amp + 1) in
3939+4040+ if !j >= len then begin
4141+ (* Ampersand at end *)
4242+ Buffer.add_char buf '&';
4343+ i := len
4444+ end else if text.[!j] = '#' then begin
4545+ (* Numeric entity *)
4646+ incr j;
4747+ let is_hex =
4848+ if !j < len && (text.[!j] = 'x' || text.[!j] = 'X') then begin
4949+ incr j;
5050+ true
5151+ end else false
5252+ in
5353+5454+ let digit_start = !j in
5555+ if is_hex then
5656+ while !j < len && is_hex_digit text.[!j] do incr j done
5757+ else
5858+ while !j < len && is_digit text.[!j] do incr j done;
5959+6060+ let has_semicolon = !j < len && text.[!j] = ';' in
6161+ let digit_text = String.sub text digit_start (!j - digit_start) in
6262+6363+ if String.length digit_text > 0 then begin
6464+ match Numeric_ref.decode digit_text ~is_hex with
6565+ | Some decoded ->
6666+ Buffer.add_string buf decoded;
6767+ i := if has_semicolon then !j + 1 else !j
6868+ | None ->
6969+ (* Invalid numeric entity, keep as-is *)
7070+ let end_pos = if has_semicolon then !j + 1 else !j in
7171+ Buffer.add_substring buf text amp (end_pos - amp);
7272+ i := end_pos
7373+ end else begin
7474+ (* No digits, keep as-is *)
7575+ let end_pos = if has_semicolon then !j + 1 else !j in
7676+ Buffer.add_substring buf text amp (end_pos - amp);
7777+ i := end_pos
7878+ end
7979+ end else begin
8080+ (* Named entity *)
8181+ (* Collect alphanumeric characters *)
8282+ while !j < len && is_alnum text.[!j] do incr j done;
8383+8484+ let entity_name = String.sub text (amp + 1) (!j - amp - 1) in
8585+ let has_semicolon = !j < len && text.[!j] = ';' in
8686+8787+ if String.length entity_name = 0 then begin
8888+ Buffer.add_char buf '&';
8989+ i := amp + 1
9090+ end else begin
9191+ (* Try exact match first (with semicolon expected) *)
9292+ let decoded =
9393+ if has_semicolon then
9494+ Entity_table.lookup entity_name
9595+ else
9696+ None
9797+ in
9898+9999+ match decoded with
100100+ | Some value ->
101101+ Buffer.add_string buf value;
102102+ i := !j + 1
103103+ | None ->
104104+ (* If semicolon present but no exact match, try legacy prefix match in text *)
105105+ if has_semicolon && not in_attribute then begin
106106+ (* Try progressively shorter prefixes *)
107107+ let rec try_prefix k =
108108+ if k <= 0 then None
109109+ else
110110+ let prefix = String.sub entity_name 0 k in
111111+ if Entity_table.is_legacy prefix then
112112+ match Entity_table.lookup prefix with
113113+ | Some value -> Some (value, k)
114114+ | None -> try_prefix (k - 1)
115115+ else
116116+ try_prefix (k - 1)
117117+ in
118118+ match try_prefix (String.length entity_name) with
119119+ | Some (value, matched_len) ->
120120+ Buffer.add_string buf value;
121121+ i := amp + 1 + matched_len
122122+ | None ->
123123+ (* No match, keep as-is including semicolon *)
124124+ Buffer.add_substring buf text amp (!j + 1 - amp);
125125+ i := !j + 1
126126+ end else if not has_semicolon then begin
127127+ (* Try without semicolon for legacy compatibility *)
128128+ if Entity_table.is_legacy entity_name then
129129+ match Entity_table.lookup entity_name with
130130+ | Some value ->
131131+ (* Legacy entities without semicolon have strict rules in attributes *)
132132+ let next_char = if !j < len then Some text.[!j] else None in
133133+ let blocked =
134134+ in_attribute &&
135135+ match next_char with
136136+ | Some c -> is_alnum c || c = '='
137137+ | None -> false
138138+ in
139139+ if blocked then begin
140140+ Buffer.add_char buf '&';
141141+ i := amp + 1
142142+ end else begin
143143+ Buffer.add_string buf value;
144144+ i := !j
145145+ end
146146+ | None ->
147147+ Buffer.add_char buf '&';
148148+ i := amp + 1
149149+ else begin
150150+ (* Try longest prefix match for legacy entities *)
151151+ let rec try_prefix k =
152152+ if k <= 0 then None
153153+ else
154154+ let prefix = String.sub entity_name 0 k in
155155+ if Entity_table.is_legacy prefix then
156156+ match Entity_table.lookup prefix with
157157+ | Some value -> Some (value, k)
158158+ | None -> try_prefix (k - 1)
159159+ else
160160+ try_prefix (k - 1)
161161+ in
162162+ match try_prefix (String.length entity_name) with
163163+ | Some (value, matched_len) ->
164164+ let end_pos = amp + 1 + matched_len in
165165+ let next_char = if end_pos < len then Some text.[end_pos] else None in
166166+ let blocked =
167167+ in_attribute &&
168168+ match next_char with
169169+ | Some c -> is_alnum c || c = '='
170170+ | None -> false
171171+ in
172172+ if blocked then begin
173173+ Buffer.add_char buf '&';
174174+ i := amp + 1
175175+ end else begin
176176+ Buffer.add_string buf value;
177177+ i := end_pos
178178+ end
179179+ | None ->
180180+ Buffer.add_char buf '&';
181181+ i := amp + 1
182182+ end
183183+ end else begin
184184+ (* Has semicolon but no match *)
185185+ Buffer.add_substring buf text amp (!j + 1 - amp);
186186+ i := !j + 1
187187+ end
188188+ end
189189+ end
190190+ done;
191191+192192+ Buffer.contents buf
···11+(* HTML5 numeric character reference decoding *)
22+33+(* HTML5 spec: numeric character reference replacements (§13.2.5.73) *)
44+let numeric_replacements = [|
55+ (0x00, 0xFFFD); (* NULL -> REPLACEMENT CHARACTER *)
66+ (0x80, 0x20AC); (* -> EURO SIGN *)
77+ (0x82, 0x201A); (* -> SINGLE LOW-9 QUOTATION MARK *)
88+ (0x83, 0x0192); (* -> LATIN SMALL LETTER F WITH HOOK *)
99+ (0x84, 0x201E); (* -> DOUBLE LOW-9 QUOTATION MARK *)
1010+ (0x85, 0x2026); (* -> HORIZONTAL ELLIPSIS *)
1111+ (0x86, 0x2020); (* -> DAGGER *)
1212+ (0x87, 0x2021); (* -> DOUBLE DAGGER *)
1313+ (0x88, 0x02C6); (* -> MODIFIER LETTER CIRCUMFLEX ACCENT *)
1414+ (0x89, 0x2030); (* -> PER MILLE SIGN *)
1515+ (0x8A, 0x0160); (* -> LATIN CAPITAL LETTER S WITH CARON *)
1616+ (0x8B, 0x2039); (* -> SINGLE LEFT-POINTING ANGLE QUOTATION MARK *)
1717+ (0x8C, 0x0152); (* -> LATIN CAPITAL LIGATURE OE *)
1818+ (0x8E, 0x017D); (* -> LATIN CAPITAL LETTER Z WITH CARON *)
1919+ (0x91, 0x2018); (* -> LEFT SINGLE QUOTATION MARK *)
2020+ (0x92, 0x2019); (* -> RIGHT SINGLE QUOTATION MARK *)
2121+ (0x93, 0x201C); (* -> LEFT DOUBLE QUOTATION MARK *)
2222+ (0x94, 0x201D); (* -> RIGHT DOUBLE QUOTATION MARK *)
2323+ (0x95, 0x2022); (* -> BULLET *)
2424+ (0x96, 0x2013); (* -> EN DASH *)
2525+ (0x97, 0x2014); (* -> EM DASH *)
2626+ (0x98, 0x02DC); (* -> SMALL TILDE *)
2727+ (0x99, 0x2122); (* -> TRADE MARK SIGN *)
2828+ (0x9A, 0x0161); (* -> LATIN SMALL LETTER S WITH CARON *)
2929+ (0x9B, 0x203A); (* -> SINGLE RIGHT-POINTING ANGLE QUOTATION MARK *)
3030+ (0x9C, 0x0153); (* -> LATIN SMALL LIGATURE OE *)
3131+ (0x9E, 0x017E); (* -> LATIN SMALL LETTER Z WITH CARON *)
3232+ (0x9F, 0x0178); (* -> LATIN CAPITAL LETTER Y WITH DIAERESIS *)
3333+|]
3434+3535+let find_replacement cp =
3636+ let rec search i =
3737+ if i >= Array.length numeric_replacements then None
3838+ else
3939+ let (k, v) = numeric_replacements.(i) in
4040+ if k = cp then Some v
4141+ else if k > cp then None
4242+ else search (i + 1)
4343+ in
4444+ search 0
4545+4646+let codepoint_to_utf8 cp =
4747+ let buf = Buffer.create 4 in
4848+ if cp <= 0x7F then
4949+ Buffer.add_char buf (Char.chr cp)
5050+ else if cp <= 0x7FF then begin
5151+ Buffer.add_char buf (Char.chr (0xC0 lor (cp lsr 6)));
5252+ Buffer.add_char buf (Char.chr (0x80 lor (cp land 0x3F)))
5353+ end else if cp <= 0xFFFF then begin
5454+ Buffer.add_char buf (Char.chr (0xE0 lor (cp lsr 12)));
5555+ Buffer.add_char buf (Char.chr (0x80 lor ((cp lsr 6) land 0x3F)));
5656+ Buffer.add_char buf (Char.chr (0x80 lor (cp land 0x3F)))
5757+ end else begin
5858+ Buffer.add_char buf (Char.chr (0xF0 lor (cp lsr 18)));
5959+ Buffer.add_char buf (Char.chr (0x80 lor ((cp lsr 12) land 0x3F)));
6060+ Buffer.add_char buf (Char.chr (0x80 lor ((cp lsr 6) land 0x3F)));
6161+ Buffer.add_char buf (Char.chr (0x80 lor (cp land 0x3F)))
6262+ end;
6363+ Buffer.contents buf
6464+6565+let replacement_char = "\xEF\xBF\xBD" (* U+FFFD in UTF-8 *)
6666+6767+let decode text ~is_hex =
6868+ match int_of_string_opt ((if is_hex then "0x" else "") ^ text) with
6969+ | None -> None
7070+ | Some cp ->
7171+ (* Apply HTML5 replacements *)
7272+ let cp = match find_replacement cp with
7373+ | Some replacement -> replacement
7474+ | None -> cp
7575+ in
7676+ (* Invalid ranges per HTML5 spec *)
7777+ if cp > 0x10FFFF then
7878+ Some replacement_char
7979+ else if cp >= 0xD800 && cp <= 0xDFFF then
8080+ (* Surrogate range *)
8181+ Some replacement_char
8282+ else if cp = 0 then
8383+ Some replacement_char
8484+ else
8585+ Some (codepoint_to_utf8 cp)
···11+(** Html5rw - Pure OCaml HTML5 Parser
22+33+ This module provides a complete HTML5 parsing solution following the
44+ WHATWG specification. It uses bytesrw for streaming input/output.
55+66+ {2 Quick Start}
77+88+ Parse HTML from a reader:
99+ {[
1010+ open Bytesrw
1111+ let reader = Bytes.Reader.of_string "<p>Hello, world!</p>" in
1212+ let result = Html5rw.parse reader in
1313+ let html = Html5rw.to_string result
1414+ ]}
1515+1616+ Parse from a file:
1717+ {[
1818+ open Bytesrw
1919+ let ic = open_in "page.html" in
2020+ let reader = Bytes.Reader.of_in_channel ic in
2121+ let result = Html5rw.parse reader in
2222+ close_in ic
2323+ ]}
2424+2525+ Query with CSS selectors:
2626+ {[
2727+ let result = Html5rw.parse reader in
2828+ let divs = Html5rw.query result "div.content"
2929+ ]}
3030+*)
3131+3232+(** {1 Sub-modules} *)
3333+3434+(** DOM types and manipulation functions *)
3535+module Dom = Html5rw_dom
3636+3737+(** HTML5 tokenizer *)
3838+module Tokenizer = Html5rw_tokenizer
3939+4040+(** Encoding detection and decoding *)
4141+module Encoding = Html5rw_encoding
4242+4343+(** CSS selector engine *)
4444+module Selector = Html5rw_selector
4545+4646+(** HTML entity decoding *)
4747+module Entities = Html5rw_entities
4848+4949+(** Low-level parser access *)
5050+module Parser = Html5rw_parser
5151+5252+(** {1 Core Types} *)
5353+5454+(** DOM node type. See {!Dom} for manipulation functions. *)
5555+type node = Dom.node
5656+5757+(** Doctype information *)
5858+type doctype_data = Dom.doctype_data = {
5959+ name : string option;
6060+ public_id : string option;
6161+ system_id : string option;
6262+}
6363+6464+(** Quirks mode as determined during parsing *)
6565+type quirks_mode = Dom.quirks_mode = No_quirks | Quirks | Limited_quirks
6666+6767+(** Character encoding detected or specified *)
6868+type encoding = Encoding.encoding =
6969+ | Utf8
7070+ | Utf16le
7171+ | Utf16be
7272+ | Windows_1252
7373+ | Iso_8859_2
7474+ | Euc_jp
7575+7676+(** Parse error record *)
7777+type parse_error = Parser.parse_error
7878+7979+(** Fragment parsing context *)
8080+type fragment_context = Parser.fragment_context
8181+8282+(** Create a fragment parsing context.
8383+ @param tag_name Tag name of the context element
8484+ @param namespace Namespace (None for HTML, Some "svg", Some "mathml")
8585+*)
8686+let make_fragment_context = Parser.make_fragment_context
8787+8888+(** Get the tag name from a fragment context *)
8989+let fragment_context_tag = Parser.fragment_context_tag
9090+9191+(** Get the namespace from a fragment context *)
9292+let fragment_context_namespace = Parser.fragment_context_namespace
9393+9494+(** Get the error code string *)
9595+let error_code = Parser.error_code
9696+9797+(** Get the line number of an error (1-indexed) *)
9898+let error_line = Parser.error_line
9999+100100+(** Get the column number of an error (1-indexed) *)
101101+let error_column = Parser.error_column
102102+103103+(** Result of parsing an HTML document *)
104104+type t = {
105105+ root : node;
106106+ errors : parse_error list;
107107+ encoding : encoding option;
108108+}
109109+110110+(* Internal: convert Parser.t to our t *)
111111+let of_parser_result (p : Parser.t) : t =
112112+ { root = Parser.root p; errors = Parser.errors p; encoding = Parser.encoding p }
113113+114114+(** {1 Parsing Functions} *)
115115+116116+(** Parse HTML from a [Bytes.Reader.t].
117117+118118+ This is the primary parsing function. Create a reader from any source:
119119+ - [Bytes.Reader.of_string s] for strings
120120+ - [Bytes.Reader.of_in_channel ic] for files
121121+ - [Bytes.Reader.of_bytes b] for byte buffers
122122+123123+ {[
124124+ open Bytesrw
125125+ let reader = Bytes.Reader.of_string "<html><body>Hello</body></html>" in
126126+ let result = Html5rw.parse reader
127127+ ]}
128128+129129+ @param collect_errors If true, collect parse errors (default: false)
130130+ @param fragment_context Context element for fragment parsing
131131+*)
132132+let parse ?collect_errors ?fragment_context reader =
133133+ of_parser_result (Parser.parse ?collect_errors ?fragment_context reader)
134134+135135+(** Parse raw bytes with automatic encoding detection.
136136+137137+ This function implements the WHATWG encoding sniffing algorithm:
138138+ 1. Check for BOM (Byte Order Mark)
139139+ 2. Prescan for <meta charset>
140140+ 3. Fall back to UTF-8
141141+142142+ @param collect_errors If true, collect parse errors (default: false)
143143+ @param transport_encoding Encoding from HTTP Content-Type header
144144+ @param fragment_context Context element for fragment parsing
145145+*)
146146+let parse_bytes ?collect_errors ?transport_encoding ?fragment_context bytes =
147147+ of_parser_result (Parser.parse_bytes ?collect_errors ?transport_encoding ?fragment_context bytes)
148148+149149+(** {1 Querying} *)
150150+151151+(** Query the DOM tree with a CSS selector.
152152+153153+ Supported selectors:
154154+ - Tag: [div], [p], [span]
155155+ - ID: [#myid]
156156+ - Class: [.myclass]
157157+ - Universal: [*]
158158+ - Attribute: [[attr]], [[attr="value"]], [[attr~="value"]], [[attr|="value"]]
159159+ - Pseudo-classes: [:first-child], [:last-child], [:nth-child(n)]
160160+ - Combinators: descendant (space), child (>), adjacent sibling (+), general sibling (~)
161161+162162+ {[
163163+ let divs = Html5rw.query result "div.content > p"
164164+ ]}
165165+166166+ @raise Selector.Selector_error if the selector is invalid
167167+*)
168168+let query t selector = Selector.query t.root selector
169169+170170+(** Check if a node matches a CSS selector. *)
171171+let matches node selector = Selector.matches node selector
172172+173173+(** {1 Serialization} *)
174174+175175+(** Write the DOM tree to a [Bytes.Writer.t].
176176+177177+ {[
178178+ open Bytesrw
179179+ let buf = Buffer.create 1024 in
180180+ let writer = Bytes.Writer.of_buffer buf in
181181+ Html5rw.to_writer result writer;
182182+ Bytes.Writer.write_eod writer;
183183+ let html = Buffer.contents buf
184184+ ]}
185185+186186+ @param pretty If true, format with indentation (default: true)
187187+ @param indent_size Number of spaces per indent level (default: 2)
188188+*)
189189+let to_writer ?pretty ?indent_size t writer =
190190+ Dom.to_writer ?pretty ?indent_size writer t.root
191191+192192+(** Serialize the DOM tree to a string.
193193+194194+ Convenience function when the output fits in memory.
195195+196196+ @param pretty If true, format with indentation (default: true)
197197+ @param indent_size Number of spaces per indent level (default: 2)
198198+*)
199199+let to_string ?pretty ?indent_size t = Dom.to_html ?pretty ?indent_size t.root
200200+201201+(** Extract text content from the DOM tree.
202202+203203+ @param separator String to insert between text nodes (default: " ")
204204+ @param strip If true, trim whitespace (default: true)
205205+*)
206206+let to_text ?separator ?strip t = Dom.to_text ?separator ?strip t.root
207207+208208+(** Serialize to html5lib test format (for testing). *)
209209+let to_test_format t = Dom.to_test_format t.root
210210+211211+(** {1 Result Accessors} *)
212212+213213+(** Get the root node of the parsed document. *)
214214+let root t = t.root
215215+216216+(** Get parse errors (if error collection was enabled). *)
217217+let errors t = t.errors
218218+219219+(** Get the detected encoding (if parsed from bytes). *)
220220+let encoding t = t.encoding
221221+222222+(** {1 DOM Utilities}
223223+224224+ Common DOM operations are available directly. For the full API,
225225+ see the {!Dom} module.
226226+*)
227227+228228+(** Create an element node.
229229+ @param namespace None for HTML, Some "svg" or Some "mathml" for foreign content
230230+ @param attrs List of (name, value) attribute pairs
231231+*)
232232+let create_element = Dom.create_element
233233+234234+(** Create a text node. *)
235235+let create_text = Dom.create_text
236236+237237+(** Create a comment node. *)
238238+let create_comment = Dom.create_comment
239239+240240+(** Create an empty document node. *)
241241+let create_document = Dom.create_document
242242+243243+(** Create a document fragment node. *)
244244+let create_document_fragment = Dom.create_document_fragment
245245+246246+(** Create a doctype node. *)
247247+let create_doctype = Dom.create_doctype
248248+249249+(** Append a child node to a parent. *)
250250+let append_child = Dom.append_child
251251+252252+(** Insert a node before a reference node. *)
253253+let insert_before = Dom.insert_before
254254+255255+(** Remove a child node from its parent. *)
256256+let remove_child = Dom.remove_child
257257+258258+(** Get an attribute value. *)
259259+let get_attr = Dom.get_attr
260260+261261+(** Set an attribute value. *)
262262+let set_attr = Dom.set_attr
263263+264264+(** Check if a node has an attribute. *)
265265+let has_attr = Dom.has_attr
266266+267267+(** Get all descendant nodes. *)
268268+let descendants = Dom.descendants
269269+270270+(** Get all ancestor nodes (from parent to root). *)
271271+let ancestors = Dom.ancestors
272272+273273+(** Get text content of a node and its descendants. *)
274274+let get_text_content = Dom.get_text_content
275275+276276+(** Clone a node.
277277+ @param deep If true, also clone descendants (default: false)
278278+*)
279279+let clone = Dom.clone
280280+281281+(** {1 Node Predicates} *)
282282+283283+(** Test if a node is an element. *)
284284+let is_element = Dom.is_element
285285+286286+(** Test if a node is a text node. *)
287287+let is_text = Dom.is_text
288288+289289+(** Test if a node is a comment node. *)
290290+let is_comment = Dom.is_comment
291291+292292+(** Test if a node is a document node. *)
293293+let is_document = Dom.is_document
294294+295295+(** Test if a node is a document fragment. *)
296296+let is_document_fragment = Dom.is_document_fragment
297297+298298+(** Test if a node is a doctype node. *)
299299+let is_doctype = Dom.is_doctype
300300+301301+(** Test if a node has children. *)
302302+let has_children = Dom.has_children
+324
lib/html5rw/html5rw.mli
···11+(** Html5rw - Pure OCaml HTML5 Parser
22+33+ This module provides a complete HTML5 parsing solution following the
44+ WHATWG specification. It uses bytesrw for streaming input/output.
55+66+ {2 Quick Start}
77+88+ Parse HTML from a reader:
99+ {[
1010+ open Bytesrw
1111+ let reader = Bytes.Reader.of_string "<p>Hello, world!</p>" in
1212+ let result = Html5rw.parse reader in
1313+ let html = Html5rw.to_string result
1414+ ]}
1515+1616+ Parse from a file:
1717+ {[
1818+ open Bytesrw
1919+ let ic = open_in "page.html" in
2020+ let reader = Bytes.Reader.of_in_channel ic in
2121+ let result = Html5rw.parse reader in
2222+ close_in ic
2323+ ]}
2424+2525+ Query with CSS selectors:
2626+ {[
2727+ let result = Html5rw.parse reader in
2828+ let divs = Html5rw.query result "div.content"
2929+ ]}
3030+*)
3131+3232+(** {1 Sub-modules} *)
3333+3434+(** DOM types and manipulation functions *)
3535+module Dom = Html5rw_dom
3636+3737+(** HTML5 tokenizer *)
3838+module Tokenizer = Html5rw_tokenizer
3939+4040+(** Encoding detection and decoding *)
4141+module Encoding = Html5rw_encoding
4242+4343+(** CSS selector engine *)
4444+module Selector = Html5rw_selector
4545+4646+(** HTML entity decoding *)
4747+module Entities = Html5rw_entities
4848+4949+(** Low-level parser access *)
5050+module Parser = Html5rw_parser
5151+5252+(** {1 Core Types} *)
5353+5454+(** DOM node type. See {!Dom} for manipulation functions. *)
5555+type node = Dom.node
5656+5757+(** Doctype information *)
5858+type doctype_data = Dom.doctype_data = {
5959+ name : string option;
6060+ public_id : string option;
6161+ system_id : string option;
6262+}
6363+6464+(** Quirks mode as determined during parsing *)
6565+type quirks_mode = Dom.quirks_mode = No_quirks | Quirks | Limited_quirks
6666+6767+(** Character encoding detected or specified *)
6868+type encoding = Encoding.encoding =
6969+ | Utf8
7070+ | Utf16le
7171+ | Utf16be
7272+ | Windows_1252
7373+ | Iso_8859_2
7474+ | Euc_jp
7575+7676+(** A parse error encountered during HTML5 parsing.
7777+7878+ HTML5 parsing never fails - the specification defines error recovery
7979+ for all malformed input. However, conformance checkers can report
8080+ these errors. Enable error collection with [~collect_errors:true].
8181+8282+ @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-errors>
8383+ WHATWG parse error definitions
8484+*)
8585+type parse_error = Parser.parse_error
8686+8787+(** Get the error code (e.g., "unexpected-null-character"). *)
8888+val error_code : parse_error -> string
8989+9090+(** Get the line number where the error occurred (1-indexed). *)
9191+val error_line : parse_error -> int
9292+9393+(** Get the column number where the error occurred (1-indexed). *)
9494+val error_column : parse_error -> int
9595+9696+(** Context element for HTML fragment parsing (innerHTML).
9797+9898+ When parsing HTML fragments, you must specify what element would
9999+ contain the fragment. This affects how certain elements are handled.
100100+101101+ @see <https://html.spec.whatwg.org/multipage/parsing.html#parsing-html-fragments>
102102+ The fragment parsing algorithm
103103+*)
104104+type fragment_context = Parser.fragment_context
105105+106106+(** Create a fragment parsing context.
107107+108108+ @param tag_name Tag name of the context element (e.g., "div", "tr")
109109+ @param namespace Namespace: [None] for HTML, [Some "svg"], [Some "mathml"]
110110+111111+ {[
112112+ (* Parse as innerHTML of a <ul> *)
113113+ let ctx = Html5rw.make_fragment_context ~tag_name:"ul" ()
114114+115115+ (* Parse as innerHTML of an SVG <g> element *)
116116+ let ctx = Html5rw.make_fragment_context ~tag_name:"g" ~namespace:(Some "svg") ()
117117+ ]}
118118+*)
119119+val make_fragment_context : tag_name:string -> ?namespace:string option ->
120120+ unit -> fragment_context
121121+122122+(** Get the tag name of a fragment context. *)
123123+val fragment_context_tag : fragment_context -> string
124124+125125+(** Get the namespace of a fragment context. *)
126126+val fragment_context_namespace : fragment_context -> string option
127127+128128+(** Result of parsing an HTML document.
129129+130130+ Contains the parsed DOM tree, any errors encountered, and the
131131+ detected encoding (when parsing from bytes).
132132+*)
133133+type t = {
134134+ root : node;
135135+ errors : parse_error list;
136136+ encoding : encoding option;
137137+}
138138+139139+(** {1 Parsing Functions} *)
140140+141141+(** Parse HTML from a [Bytes.Reader.t].
142142+143143+ This is the primary parsing function. Create a reader from any source:
144144+ - [Bytes.Reader.of_string s] for strings
145145+ - [Bytes.Reader.of_in_channel ic] for files
146146+ - [Bytes.Reader.of_bytes b] for byte buffers
147147+148148+ {[
149149+ open Bytesrw
150150+ let reader = Bytes.Reader.of_string "<html><body>Hello</body></html>" in
151151+ let result = Html5rw.parse reader
152152+ ]}
153153+154154+ @param collect_errors If true, collect parse errors (default: false)
155155+ @param fragment_context Context element for fragment parsing
156156+*)
157157+val parse : ?collect_errors:bool -> ?fragment_context:fragment_context -> Bytesrw.Bytes.Reader.t -> t
158158+159159+(** Parse raw bytes with automatic encoding detection.
160160+161161+ This function implements the WHATWG encoding sniffing algorithm:
162162+ 1. Check for BOM (Byte Order Mark)
163163+ 2. Prescan for <meta charset>
164164+ 3. Fall back to UTF-8
165165+166166+ @param collect_errors If true, collect parse errors (default: false)
167167+ @param transport_encoding Encoding from HTTP Content-Type header
168168+ @param fragment_context Context element for fragment parsing
169169+*)
170170+val parse_bytes : ?collect_errors:bool -> ?transport_encoding:string -> ?fragment_context:fragment_context -> bytes -> t
171171+172172+(** {1 Querying} *)
173173+174174+(** Query the DOM tree with a CSS selector.
175175+176176+ Supported selectors:
177177+ - Tag: [div], [p], [span]
178178+ - ID: [#myid]
179179+ - Class: [.myclass]
180180+ - Universal: [*]
181181+ - Attribute: [[attr]], [[attr="value"]], [[attr~="value"]], [[attr|="value"]]
182182+ - Pseudo-classes: [:first-child], [:last-child], [:nth-child(n)]
183183+ - Combinators: descendant (space), child (>), adjacent sibling (+), general sibling (~)
184184+185185+ {[
186186+ let divs = Html5rw.query result "div.content > p"
187187+ ]}
188188+189189+ @raise Selector.Selector_error if the selector is invalid
190190+*)
191191+val query : t -> string -> node list
192192+193193+(** Check if a node matches a CSS selector. *)
194194+val matches : node -> string -> bool
195195+196196+(** {1 Serialization} *)
197197+198198+(** Write the DOM tree to a [Bytes.Writer.t].
199199+200200+ {[
201201+ open Bytesrw
202202+ let buf = Buffer.create 1024 in
203203+ let writer = Bytes.Writer.of_buffer buf in
204204+ Html5rw.to_writer result writer;
205205+ Bytes.Writer.write_eod writer;
206206+ let html = Buffer.contents buf
207207+ ]}
208208+209209+ @param pretty If true, format with indentation (default: true)
210210+ @param indent_size Number of spaces per indent level (default: 2)
211211+*)
212212+val to_writer : ?pretty:bool -> ?indent_size:int -> t -> Bytesrw.Bytes.Writer.t -> unit
213213+214214+(** Serialize the DOM tree to a string.
215215+216216+ Convenience function when the output fits in memory.
217217+218218+ @param pretty If true, format with indentation (default: true)
219219+ @param indent_size Number of spaces per indent level (default: 2)
220220+*)
221221+val to_string : ?pretty:bool -> ?indent_size:int -> t -> string
222222+223223+(** Extract text content from the DOM tree.
224224+225225+ @param separator String to insert between text nodes (default: " ")
226226+ @param strip If true, trim whitespace (default: true)
227227+*)
228228+val to_text : ?separator:string -> ?strip:bool -> t -> string
229229+230230+(** Serialize to html5lib test format (for testing). *)
231231+val to_test_format : t -> string
232232+233233+(** {1 Result Accessors} *)
234234+235235+(** Get the root node of the parsed document. *)
236236+val root : t -> node
237237+238238+(** Get parse errors (if error collection was enabled). *)
239239+val errors : t -> parse_error list
240240+241241+(** Get the detected encoding (if parsed from bytes). *)
242242+val encoding : t -> encoding option
243243+244244+(** {1 DOM Utilities}
245245+246246+ Common DOM operations are available directly. For the full API,
247247+ see the {!Dom} module.
248248+*)
249249+250250+(** Create an element node.
251251+ @param namespace None for HTML, Some "svg" or Some "mathml" for foreign content
252252+ @param attrs List of (name, value) attribute pairs
253253+*)
254254+val create_element : string -> ?namespace:string option -> ?attrs:(string * string) list -> unit -> node
255255+256256+(** Create a text node. *)
257257+val create_text : string -> node
258258+259259+(** Create a comment node. *)
260260+val create_comment : string -> node
261261+262262+(** Create an empty document node. *)
263263+val create_document : unit -> node
264264+265265+(** Create a document fragment node. *)
266266+val create_document_fragment : unit -> node
267267+268268+(** Create a doctype node. *)
269269+val create_doctype : ?name:string -> ?public_id:string -> ?system_id:string -> unit -> node
270270+271271+(** Append a child node to a parent. *)
272272+val append_child : node -> node -> unit
273273+274274+(** Insert a node before a reference node. *)
275275+val insert_before : node -> node -> node -> unit
276276+277277+(** Remove a child node from its parent. *)
278278+val remove_child : node -> node -> unit
279279+280280+(** Get an attribute value. *)
281281+val get_attr : node -> string -> string option
282282+283283+(** Set an attribute value. *)
284284+val set_attr : node -> string -> string -> unit
285285+286286+(** Check if a node has an attribute. *)
287287+val has_attr : node -> string -> bool
288288+289289+(** Get all descendant nodes. *)
290290+val descendants : node -> node list
291291+292292+(** Get all ancestor nodes (from parent to root). *)
293293+val ancestors : node -> node list
294294+295295+(** Get text content of a node and its descendants. *)
296296+val get_text_content : node -> string
297297+298298+(** Clone a node.
299299+ @param deep If true, also clone descendants (default: false)
300300+*)
301301+val clone : ?deep:bool -> node -> node
302302+303303+(** {1 Node Predicates} *)
304304+305305+(** Test if a node is an element. *)
306306+val is_element : node -> bool
307307+308308+(** Test if a node is a text node. *)
309309+val is_text : node -> bool
310310+311311+(** Test if a node is a comment node. *)
312312+val is_comment : node -> bool
313313+314314+(** Test if a node is a document node. *)
315315+val is_document : node -> bool
316316+317317+(** Test if a node is a document fragment. *)
318318+val is_document_fragment : node -> bool
319319+320320+(** Test if a node is a doctype node. *)
321321+val is_doctype : node -> bool
322322+323323+(** Test if a node has children. *)
324324+val has_children : node -> bool
···11+(** HTML5 Parser
22+33+ This module provides the core HTML5 parsing functionality implementing
44+ the WHATWG parsing specification. It handles tokenization, tree construction,
55+ error recovery, and produces a DOM tree.
66+77+ For most uses, prefer the top-level {!Html5rw} module which re-exports
88+ these functions with a simpler interface.
99+1010+ {2 Parsing Algorithm}
1111+1212+ The HTML5 parsing algorithm is defined by the WHATWG specification and
1313+ consists of several phases:
1414+1515+ 1. {b Encoding sniffing}: Detect character encoding from BOM, meta tags,
1616+ or transport layer hints
1717+ 2. {b Tokenization}: Convert the input stream into a sequence of tokens
1818+ (start tags, end tags, character data, comments, etc.)
1919+ 3. {b Tree construction}: Build the DOM tree using a state machine with
2020+ multiple insertion modes
2121+2222+ The algorithm includes extensive error recovery to handle malformed HTML
2323+ in a consistent way across browsers.
2424+2525+ @see <https://html.spec.whatwg.org/multipage/parsing.html>
2626+ The WHATWG HTML Parsing specification
2727+*)
2828+2929+(** {1 Sub-modules} *)
3030+3131+module Dom = Html5rw_dom
3232+module Tokenizer = Html5rw_tokenizer
3333+module Encoding = Html5rw_encoding
3434+module Constants : sig
3535+ val void_elements : string list
3636+ val formatting_elements : string list
3737+ val special_elements : string list
3838+end
3939+module Insertion_mode : sig
4040+ type t
4141+end
4242+module Tree_builder : sig
4343+ type t
4444+end
4545+4646+(** {1 Types} *)
4747+4848+(** A parse error encountered during parsing.
4949+5050+ HTML5 parsing never fails - it always produces a DOM tree. However,
5151+ the specification defines many error conditions that conformance
5252+ checkers should report. Error collection is optional and disabled
5353+ by default for performance.
5454+5555+ Error codes follow the WHATWG specification naming convention,
5656+ e.g., "unexpected-null-character", "eof-in-tag".
5757+5858+ @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-errors>
5959+ The list of HTML5 parse errors
6060+*)
6161+type parse_error
6262+6363+(** Get the error code string.
6464+6565+ Error codes are lowercase with hyphens, matching the WHATWG spec names
6666+ like "unexpected-null-character" or "eof-before-tag-name".
6767+*)
6868+val error_code : parse_error -> string
6969+7070+(** Get the line number where the error occurred (1-indexed). *)
7171+val error_line : parse_error -> int
7272+7373+(** Get the column number where the error occurred (1-indexed). *)
7474+val error_column : parse_error -> int
7575+7676+(** Context element for HTML fragment parsing.
7777+7878+ When parsing an HTML fragment (innerHTML), you need to specify the
7979+ context element that would contain the fragment. This affects how
8080+ the parser handles certain elements.
8181+8282+ For example, parsing [<td>] as a fragment of a [<tr>] works differently
8383+ than parsing it as a fragment of a [<div>].
8484+8585+ @see <https://html.spec.whatwg.org/multipage/parsing.html#parsing-html-fragments>
8686+ The HTML fragment parsing algorithm
8787+*)
8888+type fragment_context
8989+9090+(** Create a fragment parsing context.
9191+9292+ @param tag_name The tag name of the context element (e.g., "div", "tr")
9393+ @param namespace Namespace: [None] for HTML, [Some "svg"], [Some "mathml"]
9494+9595+ {[
9696+ (* Parse as innerHTML of a table row *)
9797+ let ctx = make_fragment_context ~tag_name:"tr" ()
9898+9999+ (* Parse as innerHTML of an SVG element *)
100100+ let ctx = make_fragment_context ~tag_name:"g" ~namespace:(Some "svg") ()
101101+ ]}
102102+*)
103103+val make_fragment_context : tag_name:string -> ?namespace:string option ->
104104+ unit -> fragment_context
105105+106106+(** Get the tag name of a fragment context. *)
107107+val fragment_context_tag : fragment_context -> string
108108+109109+(** Get the namespace of a fragment context. *)
110110+val fragment_context_namespace : fragment_context -> string option
111111+112112+(** Result of parsing an HTML document or fragment.
113113+114114+ Contains the parsed DOM tree, any errors encountered (if error
115115+ collection was enabled), and the detected encoding (for byte input).
116116+*)
117117+type t
118118+119119+(** {1 Parsing Functions} *)
120120+121121+val parse : ?collect_errors:bool -> ?fragment_context:fragment_context ->
122122+ Bytesrw.Bytes.Reader.t -> t
123123+(** Parse HTML from a byte stream reader.
124124+125125+ This is the primary parsing function. The input must be valid UTF-8
126126+ (or will be converted from detected encoding when using {!parse_bytes}).
127127+128128+ @param collect_errors If [true], collect parse errors (default: [false])
129129+ @param fragment_context Context for fragment parsing (innerHTML)
130130+131131+ {[
132132+ open Bytesrw
133133+ let reader = Bytes.Reader.of_string "<p>Hello</p>" in
134134+ let result = parse reader
135135+ ]}
136136+*)
137137+138138+val parse_bytes : ?collect_errors:bool -> ?transport_encoding:string ->
139139+ ?fragment_context:fragment_context -> bytes -> t
140140+(** Parse HTML bytes with automatic encoding detection.
141141+142142+ Implements the WHATWG encoding sniffing algorithm:
143143+ 1. Check for BOM (UTF-8, UTF-16LE, UTF-16BE)
144144+ 2. Prescan for [<meta charset>] declaration
145145+ 3. Use transport encoding hint if provided
146146+ 4. Fall back to UTF-8
147147+148148+ @param collect_errors If [true], collect parse errors (default: [false])
149149+ @param transport_encoding Encoding from HTTP Content-Type header
150150+ @param fragment_context Context for fragment parsing (innerHTML)
151151+*)
152152+153153+(** {1 Result Accessors} *)
154154+155155+val root : t -> Dom.node
156156+(** Get the root node of the parsed document.
157157+158158+ For full document parsing, this is a document node.
159159+ For fragment parsing, this is a document fragment node.
160160+*)
161161+162162+val errors : t -> parse_error list
163163+(** Get parse errors (empty if error collection was disabled). *)
164164+165165+val encoding : t -> Encoding.encoding option
166166+(** Get the detected encoding (only set when using {!parse_bytes}). *)
167167+168168+(** {1 Querying} *)
169169+170170+val query : t -> string -> Dom.node list
171171+(** Query the DOM with a CSS selector.
172172+173173+ @raise Html5rw_selector.Selector_error if the selector is invalid
174174+175175+ @see {!Html5rw_selector} for supported selector syntax
176176+*)
177177+178178+(** {1 Serialization} *)
179179+180180+val to_writer : ?pretty:bool -> ?indent_size:int -> t ->
181181+ Bytesrw.Bytes.Writer.t -> unit
182182+(** Serialize the DOM tree to a byte stream writer.
183183+184184+ @param pretty If [true], format with indentation (default: [true])
185185+ @param indent_size Spaces per indent level (default: [2])
186186+*)
187187+188188+val to_string : ?pretty:bool -> ?indent_size:int -> t -> string
189189+(** Serialize the DOM tree to a string.
190190+191191+ @param pretty If [true], format with indentation (default: [true])
192192+ @param indent_size Spaces per indent level (default: [2])
193193+*)
194194+195195+val to_text : ?separator:string -> ?strip:bool -> t -> string
196196+(** Extract text content from the DOM tree.
197197+198198+ @param separator String between text nodes (default: [" "])
199199+ @param strip If [true], trim whitespace (default: [true])
200200+*)
201201+202202+val to_test_format : t -> string
203203+(** Serialize to html5lib test format.
204204+205205+ This format is used by the html5lib test suite and shows the tree
206206+ structure with indentation and node type prefixes.
207207+*)
+51
lib/parser/insertion_mode.ml
···11+(* HTML5 tree builder insertion modes *)
22+33+type t =
44+ | Initial
55+ | Before_html
66+ | Before_head
77+ | In_head
88+ | In_head_noscript
99+ | After_head
1010+ | In_body
1111+ | Text
1212+ | In_table
1313+ | In_table_text
1414+ | In_caption
1515+ | In_column_group
1616+ | In_table_body
1717+ | In_row
1818+ | In_cell
1919+ | In_select
2020+ | In_select_in_table
2121+ | In_template
2222+ | After_body
2323+ | In_frameset
2424+ | After_frameset
2525+ | After_after_body
2626+ | After_after_frameset
2727+2828+let to_string = function
2929+ | Initial -> "initial"
3030+ | Before_html -> "before html"
3131+ | Before_head -> "before head"
3232+ | In_head -> "in head"
3333+ | In_head_noscript -> "in head noscript"
3434+ | After_head -> "after head"
3535+ | In_body -> "in body"
3636+ | Text -> "text"
3737+ | In_table -> "in table"
3838+ | In_table_text -> "in table text"
3939+ | In_caption -> "in caption"
4040+ | In_column_group -> "in column group"
4141+ | In_table_body -> "in table body"
4242+ | In_row -> "in row"
4343+ | In_cell -> "in cell"
4444+ | In_select -> "in select"
4545+ | In_select_in_table -> "in select in table"
4646+ | In_template -> "in template"
4747+ | After_body -> "after body"
4848+ | In_frameset -> "in frameset"
4949+ | After_frameset -> "after frameset"
5050+ | After_after_body -> "after after body"
5151+ | After_after_frameset -> "after after frameset"
+107
lib/parser/parser.ml
···11+(* Main parser entry point - bytesrw-only API *)
22+33+open Bytesrw
44+55+module Dom = Html5rw_dom
66+module Tokenizer = Html5rw_tokenizer
77+module Encoding = Html5rw_encoding
88+99+type parse_error = Tree_builder.parse_error
1010+1111+type fragment_context = Tree_builder.fragment_context
1212+1313+type t = {
1414+ root : Dom.node;
1515+ errors : parse_error list;
1616+ encoding : Encoding.encoding option;
1717+}
1818+1919+(* Token sink that feeds tokens to tree builder *)
2020+module TreeBuilderSink = struct
2121+ type t = Tree_builder.t
2222+2323+ let process tb token =
2424+ Tree_builder.process_token tb token;
2525+ (* Check if we need to switch tokenizer state based on current element *)
2626+ (* Only switch for HTML namespace elements - SVG/MathML use different rules *)
2727+ match Tree_builder.current_node tb with
2828+ | Some node when node.Dom.namespace = None || node.Dom.namespace = Some "html" ->
2929+ let name = node.Dom.name in
3030+ if List.mem name ["textarea"; "title"] then
3131+ `SwitchTo Tokenizer.State.Rcdata
3232+ else if List.mem name ["style"; "xmp"; "iframe"; "noembed"; "noframes"] then
3333+ `SwitchTo Tokenizer.State.Rawtext
3434+ else if name = "script" then
3535+ `SwitchTo Tokenizer.State.Script_data
3636+ else if name = "plaintext" then
3737+ `SwitchTo Tokenizer.State.Plaintext
3838+ else
3939+ `Continue
4040+ | _ -> `Continue
4141+4242+ let adjusted_current_node_in_html_namespace tb =
4343+ Tree_builder.adjusted_current_node_in_html_namespace tb
4444+end
4545+4646+(* Core parsing function that takes a Bytes.Reader.t *)
4747+let parse ?(collect_errors=false) ?fragment_context (reader : Bytes.Reader.t) =
4848+ let tb = Tree_builder.create ~collect_errors ?fragment_context () in
4949+ let tokenizer = Tokenizer.create (module TreeBuilderSink) tb ~collect_errors () in
5050+5151+ (* Set tokenizer state for fragment parsing *)
5252+ (* Note: We do NOT set last_start_tag because in fragment parsing, no start tag has been
5353+ emitted. This means end tags won't match as "appropriate end tags" and will be treated
5454+ as raw text in RCDATA/RAWTEXT/Script modes. *)
5555+ (* Only change tokenizer state for HTML namespace contexts - foreign contexts use Data state *)
5656+ (match fragment_context with
5757+ | Some ctx when ctx.namespace = None || ctx.namespace = Some "html" ->
5858+ let name = String.lowercase_ascii ctx.tag_name in
5959+ if List.mem name ["title"; "textarea"] then
6060+ Tokenizer.set_state tokenizer Tokenizer.State.Rcdata
6161+ else if List.mem name ["style"; "xmp"; "iframe"; "noembed"; "noframes"] then
6262+ Tokenizer.set_state tokenizer Tokenizer.State.Rawtext
6363+ else if name = "script" then
6464+ Tokenizer.set_state tokenizer Tokenizer.State.Script_data
6565+ else if name = "plaintext" then
6666+ Tokenizer.set_state tokenizer Tokenizer.State.Plaintext
6767+ | _ -> ());
6868+6969+ Tokenizer.run tokenizer (module TreeBuilderSink) reader;
7070+7171+ let root = Tree_builder.finish tb in
7272+ let tokenizer_errors = Tokenizer.get_errors tokenizer in
7373+ let tree_errors = Tree_builder.get_errors tb in
7474+ let all_errors = List.map (fun e ->
7575+ { Tree_builder.code = e.Tokenizer.Errors.code;
7676+ line = e.Tokenizer.Errors.line;
7777+ column = e.Tokenizer.Errors.column }
7878+ ) tokenizer_errors @ tree_errors in
7979+8080+ { root; errors = all_errors; encoding = None }
8181+8282+(* Parse raw bytes with automatic encoding detection *)
8383+let parse_bytes ?(collect_errors=false) ?transport_encoding ?fragment_context data =
8484+ let (html, enc) = Encoding.decode data ?transport_encoding () in
8585+ let reader = Bytes.Reader.of_string html in
8686+ let result = parse ~collect_errors ?fragment_context reader in
8787+ { result with encoding = Some enc }
8888+8989+let query t selector =
9090+ Html5rw_selector.query t.root selector
9191+9292+(* Serialize to a Bytes.Writer.t *)
9393+let to_writer ?(pretty=true) ?(indent_size=2) t (writer : Bytes.Writer.t) =
9494+ let html = Dom.to_html ~pretty ~indent_size t.root in
9595+ Bytes.Writer.write_string writer html
9696+9797+(* Serialize to string (convenience for when result fits in memory) *)
9898+let to_string ?(pretty=true) ?(indent_size=2) t =
9999+ Dom.to_html ~pretty ~indent_size t.root
100100+101101+(* Extract text content *)
102102+let to_text ?(separator=" ") ?(strip=true) t =
103103+ Dom.to_text ~separator ~strip t.root
104104+105105+(* For testing *)
106106+let to_test_format t =
107107+ Dom.to_test_format t.root
+2520
lib/parser/tree_builder.ml
···11+(* HTML5 Tree Builder *)
22+33+module Dom = Html5rw_dom
44+module Token = Html5rw_tokenizer.Token
55+module State = Html5rw_tokenizer.State
66+77+type fragment_context = {
88+ tag_name : string;
99+ namespace : string option;
1010+}
1111+1212+type formatting_entry =
1313+ | Marker
1414+ | Entry of {
1515+ name : string;
1616+ attrs : (string * string) list;
1717+ node : Dom.node;
1818+ }
1919+2020+type parse_error = {
2121+ code : string;
2222+ line : int;
2323+ column : int;
2424+}
2525+2626+type t = {
2727+ mutable document : Dom.node;
2828+ mutable mode : Insertion_mode.t;
2929+ mutable original_mode : Insertion_mode.t option;
3030+ mutable open_elements : Dom.node list;
3131+ mutable active_formatting : formatting_entry list;
3232+ mutable head_element : Dom.node option;
3333+ mutable form_element : Dom.node option;
3434+ mutable frameset_ok : bool;
3535+ mutable ignore_lf : bool;
3636+ mutable foster_parenting : bool;
3737+ mutable pending_table_chars : string list;
3838+ mutable template_modes : Insertion_mode.t list;
3939+ mutable quirks_mode : Dom.quirks_mode;
4040+ mutable errors : parse_error list;
4141+ collect_errors : bool;
4242+ fragment_context : fragment_context option;
4343+ mutable fragment_context_element : Dom.node option;
4444+ iframe_srcdoc : bool;
4545+}
4646+4747+let create ?(collect_errors=false) ?fragment_context ?(iframe_srcdoc=false) () =
4848+ let is_fragment = fragment_context <> None in
4949+ let doc = if is_fragment then Dom.create_document_fragment () else Dom.create_document () in
5050+ let t = {
5151+ document = doc;
5252+ mode = Insertion_mode.Initial;
5353+ original_mode = None;
5454+ open_elements = [];
5555+ active_formatting = [];
5656+ head_element = None;
5757+ form_element = None;
5858+ frameset_ok = true;
5959+ ignore_lf = false;
6060+ foster_parenting = false;
6161+ pending_table_chars = [];
6262+ template_modes = [];
6363+ quirks_mode = Dom.No_quirks;
6464+ errors = [];
6565+ collect_errors;
6666+ fragment_context;
6767+ fragment_context_element = None;
6868+ iframe_srcdoc;
6969+ } in
7070+ (* Initialize fragment parsing *)
7171+ (match fragment_context with
7272+ | Some ctx ->
7373+ let name = String.lowercase_ascii ctx.tag_name in
7474+ let ns = ctx.namespace in
7575+ (* Create html root *)
7676+ let root = Dom.create_element "html" () in
7777+ Dom.append_child doc root;
7878+ t.open_elements <- [root];
7979+ (* For foreign content contexts, create context element *)
8080+ (match ns with
8181+ | Some namespace when namespace <> "html" ->
8282+ let context_elem = Dom.create_element ctx.tag_name ~namespace:ns () in
8383+ Dom.append_child root context_elem;
8484+ t.open_elements <- [context_elem; root];
8585+ t.fragment_context_element <- Some context_elem
8686+ | _ -> ());
8787+ (* Set initial mode based on context *)
8888+ t.mode <- (
8989+ if name = "html" then Insertion_mode.Before_head
9090+ else if List.mem name ["tbody"; "thead"; "tfoot"] && (ns = None || ns = Some "html") then
9191+ Insertion_mode.In_table_body
9292+ else if name = "tr" && (ns = None || ns = Some "html") then
9393+ Insertion_mode.In_row
9494+ else if List.mem name ["td"; "th"] && (ns = None || ns = Some "html") then
9595+ Insertion_mode.In_cell
9696+ else if name = "caption" && (ns = None || ns = Some "html") then
9797+ Insertion_mode.In_caption
9898+ else if name = "colgroup" && (ns = None || ns = Some "html") then
9999+ Insertion_mode.In_column_group
100100+ else if name = "table" && (ns = None || ns = Some "html") then
101101+ Insertion_mode.In_table
102102+ else if name = "template" && (ns = None || ns = Some "html") then begin
103103+ t.template_modes <- [Insertion_mode.In_template];
104104+ Insertion_mode.In_template
105105+ end
106106+ else
107107+ Insertion_mode.In_body
108108+ );
109109+ t.frameset_ok <- false
110110+ | None -> ());
111111+ t
112112+113113+(* Error handling *)
114114+let parse_error t code =
115115+ if t.collect_errors then
116116+ t.errors <- { code; line = 0; column = 0 } :: t.errors
117117+118118+(* Stack helpers *)
119119+let current_node t =
120120+ match t.open_elements with
121121+ | [] -> None
122122+ | x :: _ -> Some x
123123+124124+let adjusted_current_node t =
125125+ match t.fragment_context, t.open_elements with
126126+ | Some ctx, [_] ->
127127+ (* Fragment case: use context element info *)
128128+ Some (Dom.create_element ctx.tag_name ~namespace:ctx.namespace ())
129129+ | _, x :: _ -> Some x
130130+ | _, [] -> None
131131+132132+let is_in_html_namespace node =
133133+ node.Dom.namespace = None || node.Dom.namespace = Some "html"
134134+135135+(* Namespace-aware check for "special" elements per WHATWG spec *)
136136+let is_special_element node =
137137+ let name = String.lowercase_ascii node.Dom.name in
138138+ match node.Dom.namespace with
139139+ | None | Some "html" -> Constants.is_special name
140140+ | Some "mathml" -> List.mem name ["mi"; "mo"; "mn"; "ms"; "mtext"; "annotation-xml"]
141141+ | Some "svg" -> List.mem name ["foreignobject"; "desc"; "title"]
142142+ | _ -> false
143143+144144+let adjusted_current_node_in_html_namespace t =
145145+ match adjusted_current_node t with
146146+ | Some node -> is_in_html_namespace node
147147+ | None -> true
148148+149149+(* Insertion helpers *)
150150+let appropriate_insertion_place t =
151151+ match current_node t with
152152+ | None -> (t.document, None)
153153+ | Some target ->
154154+ if t.foster_parenting && List.mem target.Dom.name ["table"; "tbody"; "tfoot"; "thead"; "tr"] then begin
155155+ (* Foster parenting per WHATWG spec *)
156156+ (* Step 1: Find last (most recent) template and table in stack *)
157157+ (* Note: index 0 = top of stack = most recently added *)
158158+ let last_template_idx = ref None in
159159+ let last_table_idx = ref None in
160160+ List.iteri (fun i n ->
161161+ (* Take first match (most recent = lowest index) *)
162162+ if n.Dom.name = "template" && !last_template_idx = None then last_template_idx := Some i;
163163+ if n.Dom.name = "table" && !last_table_idx = None then last_table_idx := Some i
164164+ ) t.open_elements;
165165+166166+ (* Step 2-3: If last template is more recent than last table (lower index = more recent) *)
167167+ match !last_template_idx, !last_table_idx with
168168+ | Some ti, None ->
169169+ (* No table, use template content *)
170170+ let template = List.nth t.open_elements ti in
171171+ (match template.Dom.template_content with
172172+ | Some tc -> (tc, None)
173173+ | None -> (template, None))
174174+ | Some ti, Some tbi when ti < tbi ->
175175+ (* Template is more recent than table, use template content *)
176176+ let template = List.nth t.open_elements ti in
177177+ (match template.Dom.template_content with
178178+ | Some tc -> (tc, None)
179179+ | None -> (template, None))
180180+ | _, Some tbi ->
181181+ (* Use table's parent as foster parent *)
182182+ let table = List.nth t.open_elements tbi in
183183+ (match table.Dom.parent with
184184+ | Some parent -> (parent, Some table)
185185+ | None ->
186186+ (* Step 6: element above table in stack (index + 1 since 0 is top) *)
187187+ if tbi + 1 < List.length t.open_elements then
188188+ (List.nth t.open_elements (tbi + 1), None)
189189+ else
190190+ (t.document, None))
191191+ | None, None ->
192192+ (* No table or template, use document *)
193193+ (t.document, None)
194194+ end else begin
195195+ (* If target is a template, insert into its content document fragment *)
196196+ match target.Dom.template_content with
197197+ | Some tc -> (tc, None)
198198+ | None -> (target, None)
199199+ end
200200+201201+let insert_element t name ?(namespace=None) ?(push=false) attrs =
202202+ let node = Dom.create_element name ~namespace ~attrs () in
203203+ let (parent, before) = appropriate_insertion_place t in
204204+ (match before with
205205+ | None -> Dom.append_child parent node
206206+ | Some ref -> Dom.insert_before parent node ref);
207207+ if push then t.open_elements <- node :: t.open_elements;
208208+ node
209209+210210+let insert_element_for_token t (tag : Token.tag) =
211211+ insert_element t tag.name ~push:true tag.attrs
212212+213213+let insert_foreign_element t (tag : Token.tag) namespace =
214214+ let attrs =
215215+ if namespace = Some "svg" then
216216+ Constants.adjust_svg_attrs (Constants.adjust_foreign_attrs tag.attrs)
217217+ else
218218+ Constants.adjust_foreign_attrs tag.attrs
219219+ in
220220+ let name =
221221+ if namespace = Some "svg" then Constants.adjust_svg_tag_name tag.name
222222+ else tag.name
223223+ in
224224+ let node = insert_element t name ~namespace attrs in
225225+ t.open_elements <- node :: t.open_elements;
226226+ node
227227+228228+let insert_character t data =
229229+ if t.ignore_lf && String.length data > 0 && data.[0] = '\n' then begin
230230+ t.ignore_lf <- false;
231231+ if String.length data > 1 then begin
232232+ let rest = String.sub data 1 (String.length data - 1) in
233233+ let (parent, before) = appropriate_insertion_place t in
234234+ Dom.insert_text_at parent rest before
235235+ end
236236+ end else begin
237237+ t.ignore_lf <- false;
238238+ let (parent, before) = appropriate_insertion_place t in
239239+ Dom.insert_text_at parent data before
240240+ end
241241+242242+let insert_comment t data =
243243+ let node = Dom.create_comment data in
244244+ let (parent, _) = appropriate_insertion_place t in
245245+ Dom.append_child parent node
246246+247247+let insert_comment_to_document t data =
248248+ let node = Dom.create_comment data in
249249+ Dom.append_child t.document node
250250+251251+(* Stack manipulation *)
252252+let pop_current t =
253253+ match t.open_elements with
254254+ | [] -> ()
255255+ | _ :: rest -> t.open_elements <- rest
256256+257257+let pop_until t pred =
258258+ let rec loop () =
259259+ match t.open_elements with
260260+ | [] -> ()
261261+ | x :: rest ->
262262+ t.open_elements <- rest;
263263+ if not (pred x) then loop ()
264264+ in
265265+ loop ()
266266+267267+let pop_until_tag t name =
268268+ pop_until t (fun n -> n.Dom.name = name)
269269+270270+(* Pop until HTML namespace element with given name *)
271271+let pop_until_html_tag t name =
272272+ pop_until t (fun n -> n.Dom.name = name && is_in_html_namespace n)
273273+274274+let pop_until_one_of t names =
275275+ pop_until t (fun n -> List.mem n.Dom.name names)
276276+277277+(* Pop until HTML namespace element with one of given names *)
278278+let pop_until_html_one_of t names =
279279+ pop_until t (fun n -> List.mem n.Dom.name names && is_in_html_namespace n)
280280+281281+(* Check if element is an HTML integration point *)
282282+let is_html_integration_point node =
283283+ (* SVG foreignObject, desc, and title are always HTML integration points *)
284284+ if node.Dom.namespace = Some "svg" &&
285285+ List.mem node.Dom.name Constants.svg_html_integration then true
286286+ (* annotation-xml is an HTML integration point only with specific encoding values *)
287287+ else if node.Dom.namespace = Some "mathml" && node.Dom.name = "annotation-xml" then
288288+ match List.assoc_opt "encoding" node.Dom.attrs with
289289+ | Some enc ->
290290+ let enc_lower = String.lowercase_ascii enc in
291291+ enc_lower = "text/html" || enc_lower = "application/xhtml+xml"
292292+ | None -> false
293293+ else false
294294+295295+(* Check if element is a MathML text integration point *)
296296+let is_mathml_text_integration_point node =
297297+ node.Dom.namespace = Some "mathml" &&
298298+ List.mem node.Dom.name ["mi"; "mo"; "mn"; "ms"; "mtext"]
299299+300300+(* Scope checks - integration points also terminate scope (except for table scope) *)
301301+(* Per WHATWG spec, scope checks only consider HTML namespace elements for the target names *)
302302+let has_element_in_scope_impl t names exclude_list ~check_integration_points =
303303+ let rec check = function
304304+ | [] -> false
305305+ | n :: rest ->
306306+ (* Target elements must be in HTML namespace *)
307307+ if is_in_html_namespace n && List.mem n.Dom.name names then true
308308+ else if is_in_html_namespace n && List.mem n.Dom.name exclude_list then false
309309+ (* Integration points terminate scope (unless we're checking table scope) *)
310310+ else if check_integration_points && (is_html_integration_point n || is_mathml_text_integration_point n) then false
311311+ else check rest
312312+ in
313313+ check t.open_elements
314314+315315+let has_element_in_scope t name =
316316+ has_element_in_scope_impl t [name] Constants.default_scope ~check_integration_points:true
317317+318318+let has_element_in_button_scope t name =
319319+ has_element_in_scope_impl t [name] Constants.button_scope ~check_integration_points:true
320320+321321+let has_element_in_list_item_scope t name =
322322+ has_element_in_scope_impl t [name] Constants.list_item_scope ~check_integration_points:true
323323+324324+let has_element_in_table_scope t name =
325325+ has_element_in_scope_impl t [name] Constants.table_scope ~check_integration_points:false
326326+327327+let has_element_in_select_scope t name =
328328+ let rec check = function
329329+ | [] -> false
330330+ | n :: rest ->
331331+ if n.Dom.name = name then true
332332+ else if not (List.mem n.Dom.name Constants.select_scope_exclude) then false
333333+ else check rest
334334+ in
335335+ check t.open_elements
336336+337337+(* Implied end tags *)
338338+let generate_implied_end_tags t ?except () =
339339+ let rec loop () =
340340+ match current_node t with
341341+ | Some n when List.mem n.Dom.name Constants.implied_end_tags ->
342342+ (match except with
343343+ | Some ex when n.Dom.name = ex -> ()
344344+ | _ -> pop_current t; loop ())
345345+ | _ -> ()
346346+ in
347347+ loop ()
348348+349349+let generate_all_implied_end_tags t =
350350+ let rec loop () =
351351+ match current_node t with
352352+ | Some n when List.mem n.Dom.name Constants.thoroughly_implied_end_tags ->
353353+ pop_current t; loop ()
354354+ | _ -> ()
355355+ in
356356+ loop ()
357357+358358+(* Active formatting elements *)
359359+let push_formatting_marker t =
360360+ t.active_formatting <- Marker :: t.active_formatting
361361+362362+let push_formatting_element t node name attrs =
363363+ (* Noah's Ark: remove earlier identical elements (up to 3) *)
364364+ let rec count_and_remove same acc = function
365365+ | [] -> List.rev acc
366366+ | Marker :: rest -> List.rev acc @ (Marker :: rest)
367367+ | Entry e :: rest when e.name = name && e.attrs = attrs ->
368368+ if same >= 2 then
369369+ count_and_remove same acc rest (* Remove this one *)
370370+ else
371371+ count_and_remove (same + 1) (Entry e :: acc) rest
372372+ | x :: rest -> count_and_remove same (x :: acc) rest
373373+ in
374374+ t.active_formatting <- count_and_remove 0 [] t.active_formatting;
375375+ t.active_formatting <- Entry { name; attrs; node } :: t.active_formatting
376376+377377+let clear_active_formatting_to_marker t =
378378+ let rec loop = function
379379+ | [] -> []
380380+ | Marker :: rest -> rest
381381+ | _ :: rest -> loop rest
382382+ in
383383+ t.active_formatting <- loop t.active_formatting
384384+385385+let reconstruct_active_formatting t =
386386+ let rec find_to_reconstruct acc = function
387387+ | [] -> acc
388388+ | Marker :: _ -> acc
389389+ | Entry e :: rest ->
390390+ if List.exists (fun n -> n == e.node) t.open_elements then acc
391391+ else find_to_reconstruct (Entry e :: acc) rest
392392+ in
393393+ let to_reconstruct = find_to_reconstruct [] t.active_formatting in
394394+ List.iter (fun entry ->
395395+ match entry with
396396+ | Entry e ->
397397+ let node = insert_element t e.name e.attrs in
398398+ t.open_elements <- node :: t.open_elements;
399399+ (* Update the entry to point to new node *)
400400+ t.active_formatting <- List.map (fun x ->
401401+ if x == entry then Entry { e with node }
402402+ else x
403403+ ) t.active_formatting
404404+ | Marker -> ()
405405+ ) to_reconstruct
406406+407407+(* Adoption agency algorithm - follows WHATWG spec *)
408408+let adoption_agency t tag_name =
409409+ (* Step 1: If current node is subject and not in active formatting list, just pop *)
410410+ (match current_node t with
411411+ | Some n when n.Dom.name = tag_name ->
412412+ let in_active = List.exists (function
413413+ | Entry e -> e.name = tag_name
414414+ | Marker -> false
415415+ ) t.active_formatting in
416416+ if not in_active then begin
417417+ pop_current t;
418418+ () (* Return early - this case is handled *)
419419+ end
420420+ | _ -> ());
421421+422422+ (* Step 2: Outer loop *)
423423+ let outer_loop_counter = ref 0 in
424424+ let done_flag = ref false in
425425+426426+ while !outer_loop_counter < 8 && not !done_flag do
427427+ incr outer_loop_counter;
428428+429429+ (* Step 3: Find formatting element in active formatting list *)
430430+ let rec find_formatting_index idx = function
431431+ | [] -> None
432432+ | Marker :: _ -> None
433433+ | Entry e :: rest ->
434434+ if e.name = tag_name then Some (idx, e.node, e.attrs)
435435+ else find_formatting_index (idx + 1) rest
436436+ in
437437+ let formatting_entry = find_formatting_index 0 t.active_formatting in
438438+439439+ match formatting_entry with
440440+ | None ->
441441+ (* No formatting element found - done *)
442442+ done_flag := true
443443+ | Some (fmt_idx, fmt_node, fmt_attrs) ->
444444+445445+ (* Step 4: Check if formatting element is in open elements *)
446446+ if not (List.exists (fun n -> n == fmt_node) t.open_elements) then begin
447447+ parse_error t "adoption-agency-1.2";
448448+ t.active_formatting <- List.filteri (fun i _ -> i <> fmt_idx) t.active_formatting;
449449+ done_flag := true
450450+ end
451451+ (* Step 5: Check if formatting element is in scope *)
452452+ else if not (has_element_in_scope t tag_name) then begin
453453+ parse_error t "adoption-agency-1.3";
454454+ done_flag := true
455455+ end else begin
456456+ (* Step 6: Parse error if not current node *)
457457+ (match current_node t with
458458+ | Some n when n != fmt_node -> parse_error t "adoption-agency-1.3"
459459+ | _ -> ());
460460+461461+ (* Step 7: Find furthest block - first special element BELOW formatting element *)
462462+ (* open_elements is [current(top)...html(bottom)], formatting element is somewhere in the middle *)
463463+ (* We need the first special element going from formatting element toward current *)
464464+ (* This is the "topmost" (closest to formatting element) special element that is "lower" (closer to current) *)
465465+ let fmt_stack_idx = ref (-1) in
466466+ List.iteri (fun i n -> if n == fmt_node then fmt_stack_idx := i) t.open_elements;
467467+ let furthest_block =
468468+ if !fmt_stack_idx <= 0 then None
469469+ else begin
470470+ (* Look from fmt_stack_idx-1 down to 0, find first special element *)
471471+ let rec find_from_idx idx =
472472+ if idx < 0 then None
473473+ else
474474+ let n = List.nth t.open_elements idx in
475475+ if is_special_element n then Some n
476476+ else find_from_idx (idx - 1)
477477+ in
478478+ find_from_idx (!fmt_stack_idx - 1)
479479+ end
480480+ in
481481+482482+ match furthest_block with
483483+ | None ->
484484+ (* Step 8: No furthest block - pop elements including formatting element *)
485485+ pop_until t (fun n -> n == fmt_node);
486486+ t.active_formatting <- List.filteri (fun i _ -> i <> fmt_idx) t.active_formatting;
487487+ done_flag := true
488488+489489+ | Some fb ->
490490+ (* Step 9: Let common ancestor be element immediately above formatting element *)
491491+ let rec find_common_ancestor = function
492492+ | [] -> None
493493+ | n :: rest when n == fmt_node ->
494494+ (match rest with x :: _ -> Some x | [] -> None)
495495+ | _ :: rest -> find_common_ancestor rest
496496+ in
497497+ let common_ancestor = find_common_ancestor t.open_elements in
498498+499499+ (* Step 10: Bookmark starts after formatting element *)
500500+ let bookmark = ref (fmt_idx + 1) in
501501+502502+ (* Step 11: Let last_node = furthest block *)
503503+ let last_node = ref fb in
504504+505505+ (* Step 12: Inner loop *)
506506+ (* The inner loop processes elements between furthest_block and formatting_element,
507507+ removing non-formatting elements and reparenting formatting elements *)
508508+ let inner_loop_counter = ref 0 in
509509+510510+ (* Get index of furthest block in open elements *)
511511+ let fb_idx = ref 0 in
512512+ List.iteri (fun i n -> if n == fb then fb_idx := i) t.open_elements;
513513+514514+ (* Start from element after furthest block (toward formatting element) *)
515515+ let node_idx = ref (!fb_idx + 1) in
516516+517517+ while !node_idx < List.length t.open_elements &&
518518+ (List.nth t.open_elements !node_idx) != fmt_node do
519519+ incr inner_loop_counter;
520520+ let current_node = List.nth t.open_elements !node_idx in
521521+522522+ (* Step 12.3: Find node in active formatting list *)
523523+ let rec find_node_in_formatting idx = function
524524+ | [] -> None
525525+ | Entry e :: _rest when e.node == current_node -> Some idx
526526+ | _ :: rest -> find_node_in_formatting (idx + 1) rest
527527+ in
528528+ let node_fmt_idx = find_node_in_formatting 0 t.active_formatting in
529529+530530+ (* Step 12.4: If inner loop counter > 3 and node in active formatting, remove it *)
531531+ let node_fmt_idx =
532532+ match node_fmt_idx with
533533+ | Some idx when !inner_loop_counter > 3 ->
534534+ t.active_formatting <- List.filteri (fun i _ -> i <> idx) t.active_formatting;
535535+ if idx < !bookmark then decr bookmark;
536536+ None
537537+ | x -> x
538538+ in
539539+540540+ (* Step 12.5: If node not in active formatting, remove from stack and continue *)
541541+ match node_fmt_idx with
542542+ | None ->
543543+ (* Remove from stack - this shifts indices *)
544544+ t.open_elements <- List.filteri (fun i _ -> i <> !node_idx) t.open_elements
545545+ (* Don't increment node_idx since we removed an element *)
546546+547547+ | Some af_idx ->
548548+ (* Step 12.6: Create new element for node *)
549549+ let (node_name, node_attrs) = match List.nth t.active_formatting af_idx with
550550+ | Entry e -> (e.name, e.attrs)
551551+ | Marker -> failwith "unexpected marker"
552552+ in
553553+ let new_node_elem = Dom.create_element node_name ~attrs:node_attrs () in
554554+555555+ (* Update active formatting with new node *)
556556+ t.active_formatting <- List.mapi (fun i entry ->
557557+ if i = af_idx then Entry { name = node_name; node = new_node_elem; attrs = node_attrs }
558558+ else entry
559559+ ) t.active_formatting;
560560+561561+ (* Replace node in open elements *)
562562+ t.open_elements <- List.mapi (fun i n ->
563563+ if i = !node_idx then new_node_elem else n
564564+ ) t.open_elements;
565565+566566+ (* Step 12.7: If last_node is furthest block, update bookmark *)
567567+ if !last_node == fb then
568568+ bookmark := af_idx + 1;
569569+570570+ (* Step 12.8: Reparent last_node to new node *)
571571+ (match !last_node.Dom.parent with
572572+ | Some p -> Dom.remove_child p !last_node
573573+ | None -> ());
574574+ Dom.append_child new_node_elem !last_node;
575575+576576+ (* Step 12.9: Let last_node = new node *)
577577+ last_node := new_node_elem;
578578+579579+ (* Move to next element *)
580580+ incr node_idx
581581+ done;
582582+583583+ (* Step 13: Insert last_node into common ancestor *)
584584+ (match common_ancestor with
585585+ | Some ca ->
586586+ (match !last_node.Dom.parent with
587587+ | Some p -> Dom.remove_child p !last_node
588588+ | None -> ());
589589+ (* Check if we need foster parenting *)
590590+ if t.foster_parenting && List.mem ca.Dom.name ["table"; "tbody"; "tfoot"; "thead"; "tr"] then begin
591591+ (* Find table and insert before it *)
592592+ let rec find_table = function
593593+ | [] -> None
594594+ | n :: rest when n.Dom.name = "table" -> Some (n, rest)
595595+ | _ :: rest -> find_table rest
596596+ in
597597+ match find_table t.open_elements with
598598+ | Some (table, _) ->
599599+ (match table.Dom.parent with
600600+ | Some parent -> Dom.insert_before parent !last_node table
601601+ | None -> Dom.append_child ca !last_node)
602602+ | None -> Dom.append_child ca !last_node
603603+ end else begin
604604+ (* If common ancestor is template, insert into its content *)
605605+ match ca.Dom.template_content with
606606+ | Some tc -> Dom.append_child tc !last_node
607607+ | None -> Dom.append_child ca !last_node
608608+ end
609609+ | None -> ());
610610+611611+ (* Step 14: Create new formatting element *)
612612+ let new_formatting = Dom.create_element tag_name ~attrs:fmt_attrs () in
613613+614614+ (* Step 15: Move children of furthest block to new formatting element *)
615615+ let fb_children = fb.Dom.children in
616616+ List.iter (fun child ->
617617+ Dom.remove_child fb child;
618618+ Dom.append_child new_formatting child
619619+ ) fb_children;
620620+621621+ (* Step 16: Append new formatting element to furthest block *)
622622+ Dom.append_child fb new_formatting;
623623+624624+ (* Step 17: Remove old from active formatting, insert new at bookmark *)
625625+ let new_entry = Entry { name = tag_name; node = new_formatting; attrs = fmt_attrs } in
626626+ t.active_formatting <- List.filteri (fun i _ -> i <> fmt_idx) t.active_formatting;
627627+ (* Adjust bookmark since we removed an element *)
628628+ let adjusted_bookmark = if fmt_idx < !bookmark then !bookmark - 1 else !bookmark in
629629+ let rec insert_at_bookmark idx acc = function
630630+ | [] -> List.rev (new_entry :: acc)
631631+ | x :: rest when idx = adjusted_bookmark ->
632632+ List.rev_append acc (new_entry :: x :: rest)
633633+ | x :: rest -> insert_at_bookmark (idx + 1) (x :: acc) rest
634634+ in
635635+ t.active_formatting <- insert_at_bookmark 0 [] t.active_formatting;
636636+637637+ (* Step 18: Remove formatting element from open elements, insert new after furthest block *)
638638+ (* "After" in stack terms means new_formatting should be between fb and current node *)
639639+ (* In our list orientation (current at index 0), this means new_formatting at lower index than fb *)
640640+ t.open_elements <- List.filter (fun n -> n != fmt_node) t.open_elements;
641641+ (* Find fb and insert new_formatting before it (lower index = closer to current) *)
642642+ let rec insert_before acc = function
643643+ | [] -> List.rev (new_formatting :: acc)
644644+ | n :: rest when n == fb ->
645645+ (* Insert new_formatting before fb: acc reversed, then new_formatting, then fb, then rest *)
646646+ List.rev_append acc (new_formatting :: n :: rest)
647647+ | n :: rest -> insert_before (n :: acc) rest
648648+ in
649649+ t.open_elements <- insert_before [] t.open_elements
650650+ (* Continue outer loop *)
651651+ end
652652+ done
653653+654654+(* Close p element *)
655655+let close_p_element t =
656656+ generate_implied_end_tags t ~except:"p" ();
657657+ (match current_node t with
658658+ | Some n when n.Dom.name <> "p" -> parse_error t "expected-p"
659659+ | _ -> ());
660660+ pop_until_tag t "p"
661661+662662+(* Reset insertion mode *)
663663+let reset_insertion_mode t =
664664+ let rec check_node last = function
665665+ | [] -> t.mode <- Insertion_mode.In_body
666666+ | node :: rest ->
667667+ let is_last = rest = [] in
668668+ let node_to_check =
669669+ if is_last then
670670+ match t.fragment_context with
671671+ | Some ctx -> Dom.create_element ctx.tag_name ~namespace:ctx.namespace ()
672672+ | None -> node
673673+ else node
674674+ in
675675+ let name = node_to_check.Dom.name in
676676+ if name = "select" then begin
677677+ if not is_last then begin
678678+ let rec find_table_or_template = function
679679+ | [] -> ()
680680+ | n :: rest ->
681681+ if n.Dom.name = "template" then t.mode <- Insertion_mode.In_select
682682+ else if n.Dom.name = "table" then t.mode <- Insertion_mode.In_select_in_table
683683+ else find_table_or_template rest
684684+ in
685685+ find_table_or_template rest
686686+ end;
687687+ if t.mode <> Insertion_mode.In_select_in_table then
688688+ t.mode <- Insertion_mode.In_select
689689+ end else if List.mem name ["td"; "th"] && not is_last then
690690+ t.mode <- Insertion_mode.In_cell
691691+ else if name = "tr" then
692692+ t.mode <- Insertion_mode.In_row
693693+ else if List.mem name ["tbody"; "thead"; "tfoot"] then
694694+ t.mode <- Insertion_mode.In_table_body
695695+ else if name = "caption" then
696696+ t.mode <- Insertion_mode.In_caption
697697+ else if name = "colgroup" then
698698+ t.mode <- Insertion_mode.In_column_group
699699+ else if name = "table" then
700700+ t.mode <- Insertion_mode.In_table
701701+ else if name = "template" then
702702+ t.mode <- (match t.template_modes with m :: _ -> m | [] -> Insertion_mode.In_template)
703703+ else if name = "head" && not is_last then
704704+ t.mode <- Insertion_mode.In_head
705705+ else if name = "body" then
706706+ t.mode <- Insertion_mode.In_body
707707+ else if name = "frameset" then
708708+ t.mode <- Insertion_mode.In_frameset
709709+ else if name = "html" then
710710+ t.mode <- (if t.head_element = None then Insertion_mode.Before_head else Insertion_mode.After_head)
711711+ else if is_last then
712712+ t.mode <- Insertion_mode.In_body
713713+ else
714714+ check_node last rest
715715+ in
716716+ check_node false t.open_elements
717717+718718+let is_whitespace s =
719719+ let ws = [' '; '\t'; '\n'; '\x0C'; '\r'] in
720720+ String.for_all (fun c -> List.mem c ws) s
721721+722722+(* Mode handlers *)
723723+let rec process_initial t token =
724724+ match token with
725725+ | Token.Character data when is_whitespace data -> ()
726726+ | Token.Comment data -> insert_comment_to_document t data
727727+ | Token.Doctype dt ->
728728+ let node = Dom.create_doctype ?name:dt.name ?public_id:dt.public_id ?system_id:dt.system_id () in
729729+ Dom.append_child t.document node;
730730+ (* Quirks mode detection *)
731731+ if dt.force_quirks then
732732+ t.quirks_mode <- Dom.Quirks
733733+ else if dt.name <> Some "html" then
734734+ t.quirks_mode <- Dom.Quirks
735735+ else begin
736736+ let pub = Option.map String.lowercase_ascii dt.public_id in
737737+ let sys = Option.map String.lowercase_ascii dt.system_id in
738738+ let is_quirky =
739739+ (match pub with
740740+ | Some p -> List.mem p Constants.quirky_public_matches ||
741741+ List.exists (fun prefix -> String.length p >= String.length prefix &&
742742+ String.sub p 0 (String.length prefix) = prefix) Constants.quirky_public_prefixes
743743+ | None -> false) ||
744744+ (match sys with
745745+ | Some s -> List.mem s Constants.quirky_system_matches
746746+ | None -> false)
747747+ in
748748+ if is_quirky then t.quirks_mode <- Dom.Quirks
749749+ else begin
750750+ let is_limited_quirky =
751751+ match pub with
752752+ | Some p -> List.exists (fun prefix -> String.length p >= String.length prefix &&
753753+ String.sub p 0 (String.length prefix) = prefix)
754754+ Constants.limited_quirky_public_prefixes
755755+ | None -> false
756756+ in
757757+ if is_limited_quirky then t.quirks_mode <- Dom.Limited_quirks
758758+ end
759759+ end;
760760+ t.mode <- Insertion_mode.Before_html
761761+ | _ ->
762762+ parse_error t "expected-doctype-but-got-other";
763763+ t.quirks_mode <- Dom.Quirks;
764764+ t.mode <- Insertion_mode.Before_html;
765765+ process_token t token
766766+767767+and process_before_html t token =
768768+ match token with
769769+ | Token.Doctype _ -> parse_error t "unexpected-doctype"
770770+ | Token.Comment data -> insert_comment_to_document t data
771771+ | Token.Character data when is_whitespace data -> ()
772772+ | Token.Tag { kind = Token.Start; name = "html"; attrs; _ } ->
773773+ let html = insert_element t "html" attrs in
774774+ t.open_elements <- [html];
775775+ t.mode <- Insertion_mode.Before_head
776776+ | Token.Tag { kind = Token.End; name; _ } when List.mem name ["head"; "body"; "html"; "br"] ->
777777+ let html = insert_element t "html" [] in
778778+ t.open_elements <- [html];
779779+ t.mode <- Insertion_mode.Before_head;
780780+ process_token t token
781781+ | Token.Tag { kind = Token.End; _ } ->
782782+ parse_error t "unexpected-end-tag"
783783+ | _ ->
784784+ let html = insert_element t "html" [] in
785785+ t.open_elements <- [html];
786786+ t.mode <- Insertion_mode.Before_head;
787787+ process_token t token
788788+789789+and process_before_head t token =
790790+ match token with
791791+ | Token.Character data when is_whitespace data -> ()
792792+ | Token.Comment data -> insert_comment t data
793793+ | Token.Doctype _ -> parse_error t "unexpected-doctype"
794794+ | Token.Tag { kind = Token.Start; name = "html"; _ } ->
795795+ process_in_body t token
796796+ | Token.Tag { kind = Token.Start; name = "head"; attrs; _ } ->
797797+ let head = insert_element t "head" attrs in
798798+ t.open_elements <- head :: t.open_elements;
799799+ t.head_element <- Some head;
800800+ t.mode <- Insertion_mode.In_head
801801+ | Token.Tag { kind = Token.End; name; _ } when List.mem name ["head"; "body"; "html"; "br"] ->
802802+ let head = insert_element t "head" [] in
803803+ t.open_elements <- head :: t.open_elements;
804804+ t.head_element <- Some head;
805805+ t.mode <- Insertion_mode.In_head;
806806+ process_token t token
807807+ | Token.Tag { kind = Token.End; _ } ->
808808+ parse_error t "unexpected-end-tag"
809809+ | _ ->
810810+ let head = insert_element t "head" [] in
811811+ t.open_elements <- head :: t.open_elements;
812812+ t.head_element <- Some head;
813813+ t.mode <- Insertion_mode.In_head;
814814+ process_token t token
815815+816816+and process_in_head t token =
817817+ match token with
818818+ | Token.Character data when is_whitespace data ->
819819+ insert_character t data
820820+ | Token.Character data ->
821821+ (* Extract leading whitespace *)
822822+ let rec count_leading_ws i =
823823+ if i >= String.length data then i
824824+ else match data.[i] with
825825+ | '\t' | '\n' | '\x0C' | '\r' | ' ' -> count_leading_ws (i + 1)
826826+ | _ -> i
827827+ in
828828+ let ws_count = count_leading_ws 0 in
829829+ let leading_ws = String.sub data 0 ws_count in
830830+ let remaining = String.sub data ws_count (String.length data - ws_count) in
831831+ (* If there's leading whitespace and current element has children, insert it *)
832832+ if leading_ws <> "" then
833833+ (match current_node t with
834834+ | Some n when n.Dom.children <> [] -> insert_character t leading_ws
835835+ | _ -> ());
836836+ pop_current t;
837837+ t.mode <- Insertion_mode.After_head;
838838+ process_token t (Token.Character remaining)
839839+ | Token.Comment data ->
840840+ insert_comment t data
841841+ | Token.Doctype _ ->
842842+ parse_error t "unexpected-doctype"
843843+ | Token.Tag { kind = Token.Start; name = "html"; _ } ->
844844+ process_in_body t token
845845+ | Token.Tag { kind = Token.Start; name; attrs; _ }
846846+ when List.mem name ["base"; "basefont"; "bgsound"; "link"; "meta"] ->
847847+ ignore (insert_element t name attrs)
848848+ | Token.Tag { kind = Token.Start; name = "title"; _ } ->
849849+ ignore (insert_element_for_token t { kind = Token.Start; name = "title"; attrs = []; self_closing = false });
850850+ t.original_mode <- Some t.mode;
851851+ t.mode <- Insertion_mode.Text
852852+ | Token.Tag { kind = Token.Start; name; _ }
853853+ when List.mem name ["noframes"; "style"] ->
854854+ ignore (insert_element_for_token t { kind = Token.Start; name; attrs = []; self_closing = false });
855855+ t.original_mode <- Some t.mode;
856856+ t.mode <- Insertion_mode.Text
857857+ | Token.Tag { kind = Token.Start; name = "noscript"; _ } ->
858858+ (* Scripting is disabled: parse noscript content as HTML *)
859859+ ignore (insert_element_for_token t { kind = Token.Start; name = "noscript"; attrs = []; self_closing = false });
860860+ t.mode <- Insertion_mode.In_head_noscript
861861+ | Token.Tag { kind = Token.Start; name = "script"; attrs; self_closing } ->
862862+ ignore (insert_element_for_token t { kind = Token.Start; name = "script"; attrs; self_closing });
863863+ t.original_mode <- Some t.mode;
864864+ t.mode <- Insertion_mode.Text
865865+ | Token.Tag { kind = Token.End; name = "head"; _ } ->
866866+ pop_current t;
867867+ t.mode <- Insertion_mode.After_head
868868+ | Token.Tag { kind = Token.End; name; _ } when List.mem name ["body"; "html"; "br"] ->
869869+ pop_current t;
870870+ t.mode <- Insertion_mode.After_head;
871871+ process_token t token
872872+ | Token.Tag { kind = Token.Start; name = "template"; attrs; _ } ->
873873+ let node = Dom.create_template ~attrs () in
874874+ let (parent, _) = appropriate_insertion_place t in
875875+ Dom.append_child parent node;
876876+ t.open_elements <- node :: t.open_elements;
877877+ push_formatting_marker t;
878878+ t.frameset_ok <- false;
879879+ t.mode <- Insertion_mode.In_template;
880880+ t.template_modes <- Insertion_mode.In_template :: t.template_modes
881881+ | Token.Tag { kind = Token.End; name = "template"; _ } ->
882882+ if not (List.exists (fun n -> n.Dom.name = "template" && is_in_html_namespace n) t.open_elements) then
883883+ parse_error t "unexpected-end-tag"
884884+ else begin
885885+ generate_all_implied_end_tags t;
886886+ (match current_node t with
887887+ | Some n when not (n.Dom.name = "template" && is_in_html_namespace n) -> parse_error t "unexpected-end-tag"
888888+ | _ -> ());
889889+ pop_until_html_tag t "template";
890890+ clear_active_formatting_to_marker t;
891891+ t.template_modes <- (match t.template_modes with _ :: rest -> rest | [] -> []);
892892+ reset_insertion_mode t
893893+ end
894894+ | Token.Tag { kind = Token.Start; name = "head"; _ } ->
895895+ parse_error t "unexpected-start-tag"
896896+ | Token.Tag { kind = Token.End; _ } ->
897897+ parse_error t "unexpected-end-tag"
898898+ | _ ->
899899+ pop_current t;
900900+ t.mode <- Insertion_mode.After_head;
901901+ process_token t token
902902+903903+and process_in_head_noscript t token =
904904+ match token with
905905+ | Token.Character data when is_whitespace data ->
906906+ process_in_head t token
907907+ | Token.Character _ ->
908908+ parse_error t "unexpected-char-in-noscript";
909909+ pop_current t; (* Pop noscript *)
910910+ t.mode <- Insertion_mode.In_head;
911911+ process_token t token
912912+ | Token.Comment _ ->
913913+ process_in_head t token
914914+ | Token.Doctype _ ->
915915+ parse_error t "unexpected-doctype"
916916+ | Token.Tag { kind = Token.Start; name = "html"; _ } ->
917917+ process_in_body t token
918918+ | Token.Tag { kind = Token.Start; name; _ }
919919+ when List.mem name ["basefont"; "bgsound"; "link"; "meta"; "noframes"; "style"] ->
920920+ process_in_head t token
921921+ | Token.Tag { kind = Token.Start; name; _ }
922922+ when List.mem name ["head"; "noscript"] ->
923923+ parse_error t "unexpected-start-tag"
924924+ | Token.Tag { kind = Token.Start; _ } ->
925925+ parse_error t "unexpected-start-tag";
926926+ pop_current t; (* Pop noscript *)
927927+ t.mode <- Insertion_mode.In_head;
928928+ process_token t token
929929+ | Token.Tag { kind = Token.End; name = "noscript"; _ } ->
930930+ pop_current t; (* Pop noscript *)
931931+ t.mode <- Insertion_mode.In_head
932932+ | Token.Tag { kind = Token.End; name = "br"; _ } ->
933933+ parse_error t "unexpected-end-tag";
934934+ pop_current t; (* Pop noscript *)
935935+ t.mode <- Insertion_mode.In_head;
936936+ process_token t token
937937+ | Token.Tag { kind = Token.End; _ } ->
938938+ parse_error t "unexpected-end-tag"
939939+ | Token.EOF ->
940940+ parse_error t "expected-closing-tag-but-got-eof";
941941+ pop_current t; (* Pop noscript *)
942942+ t.mode <- Insertion_mode.In_head;
943943+ process_token t token
944944+945945+and process_after_head t token =
946946+ match token with
947947+ | Token.Character data when is_whitespace data ->
948948+ insert_character t data
949949+ | Token.Comment data ->
950950+ insert_comment t data
951951+ | Token.Doctype _ ->
952952+ parse_error t "unexpected-doctype"
953953+ | Token.Tag { kind = Token.Start; name = "html"; _ } ->
954954+ process_in_body t token
955955+ | Token.Tag { kind = Token.Start; name = "body"; attrs; _ } ->
956956+ ignore (insert_element t "body" ~push:true attrs);
957957+ t.frameset_ok <- false;
958958+ t.mode <- Insertion_mode.In_body
959959+ | Token.Tag { kind = Token.Start; name = "frameset"; attrs; _ } ->
960960+ ignore (insert_element t "frameset" ~push:true attrs);
961961+ t.mode <- Insertion_mode.In_frameset
962962+ | Token.Tag { kind = Token.Start; name = "input"; attrs; _ } ->
963963+ (* Special handling for input type="hidden" - parse error, ignore *)
964964+ let input_type = List.assoc_opt "type" attrs in
965965+ (match input_type with
966966+ | Some typ when String.lowercase_ascii typ = "hidden" ->
967967+ parse_error t "unexpected-hidden-input-after-head"
968968+ | _ ->
969969+ (* Non-hidden input creates body *)
970970+ let body = insert_element t "body" [] in
971971+ t.open_elements <- body :: t.open_elements;
972972+ t.mode <- Insertion_mode.In_body;
973973+ process_token t token)
974974+ | Token.Tag { kind = Token.Start; name; _ }
975975+ when List.mem name ["base"; "basefont"; "bgsound"; "link"; "meta"; "noframes"; "script"; "style"; "template"; "title"] ->
976976+ parse_error t "unexpected-start-tag";
977977+ (match t.head_element with
978978+ | Some head ->
979979+ t.open_elements <- head :: t.open_elements;
980980+ process_in_head t token;
981981+ t.open_elements <- List.filter (fun n -> n != head) t.open_elements
982982+ | None -> ())
983983+ | Token.Tag { kind = Token.End; name = "template"; _ } ->
984984+ process_in_head t token
985985+ | Token.Tag { kind = Token.End; name; _ } when List.mem name ["body"; "html"; "br"] ->
986986+ let body = insert_element t "body" [] in
987987+ t.open_elements <- body :: t.open_elements;
988988+ t.mode <- Insertion_mode.In_body;
989989+ process_token t token
990990+ | Token.Tag { kind = Token.Start; name = "head"; _ } ->
991991+ parse_error t "unexpected-start-tag"
992992+ | Token.Tag { kind = Token.End; _ } ->
993993+ parse_error t "unexpected-end-tag"
994994+ | _ ->
995995+ let body = insert_element t "body" [] in
996996+ t.open_elements <- body :: t.open_elements;
997997+ t.mode <- Insertion_mode.In_body;
998998+ process_token t token
999999+10001000+and process_in_body t token =
10011001+ match token with
10021002+ | Token.Character "\x00" ->
10031003+ parse_error t "unexpected-null-character"
10041004+ | Token.Character data ->
10051005+ reconstruct_active_formatting t;
10061006+ insert_character t data;
10071007+ if not (is_whitespace data) then t.frameset_ok <- false
10081008+ | Token.Comment data ->
10091009+ insert_comment t data
10101010+ | Token.Doctype _ ->
10111011+ parse_error t "unexpected-doctype"
10121012+ | Token.Tag { kind = Token.Start; name = "html"; attrs; _ } ->
10131013+ parse_error t "unexpected-start-tag";
10141014+ if not (List.exists (fun n -> n.Dom.name = "template") t.open_elements) then
10151015+ (* Find the html element (at the bottom of the stack) *)
10161016+ let html_elem = List.find_opt (fun n -> n.Dom.name = "html") t.open_elements in
10171017+ (match html_elem with
10181018+ | Some html ->
10191019+ List.iter (fun (k, v) ->
10201020+ if not (Dom.has_attr html k) then Dom.set_attr html k v
10211021+ ) attrs
10221022+ | None -> ())
10231023+ | Token.Tag { kind = Token.Start; name; _ }
10241024+ when List.mem name ["base"; "basefont"; "bgsound"; "link"; "meta"; "noframes"; "script"; "style"; "template"; "title"] ->
10251025+ process_in_head t token
10261026+ | Token.Tag { kind = Token.End; name = "template"; _ } ->
10271027+ process_in_head t token
10281028+ | Token.Tag { kind = Token.Start; name = "body"; attrs; _ } ->
10291029+ parse_error t "unexpected-start-tag";
10301030+ (* Find body element on stack - it should be near the end (html is last) *)
10311031+ let body = List.find_opt (fun n -> n.Dom.name = "body") t.open_elements in
10321032+ (match body with
10331033+ | Some body when not (List.exists (fun n -> n.Dom.name = "template") t.open_elements) ->
10341034+ t.frameset_ok <- false;
10351035+ List.iter (fun (k, v) ->
10361036+ if not (Dom.has_attr body k) then Dom.set_attr body k v
10371037+ ) attrs
10381038+ | _ -> ())
10391039+ | Token.Tag { kind = Token.Start; name = "frameset"; attrs; _ } ->
10401040+ if not t.frameset_ok then
10411041+ parse_error t "unexpected-start-tag-ignored"
10421042+ else begin
10431043+ (* Find body element on the stack *)
10441044+ let rec find_body_index idx = function
10451045+ | [] -> None
10461046+ | n :: rest ->
10471047+ if n.Dom.name = "body" then Some (idx, n)
10481048+ else find_body_index (idx + 1) rest
10491049+ in
10501050+ match find_body_index 0 t.open_elements with
10511051+ | None ->
10521052+ parse_error t "unexpected-start-tag-ignored"
10531053+ | Some (idx, body_elem) ->
10541054+ (* Remove body from its parent (the html element) *)
10551055+ (match body_elem.Dom.parent with
10561056+ | Some parent -> Dom.remove_child parent body_elem
10571057+ | None -> ());
10581058+ (* Pop all elements up to and including body - keep only elements after body_idx *)
10591059+ let rec drop n lst = if n <= 0 then lst else match lst with [] -> [] | _ :: rest -> drop (n - 1) rest in
10601060+ t.open_elements <- drop (idx + 1) t.open_elements;
10611061+ (* Insert frameset element *)
10621062+ ignore (insert_element t "frameset" ~push:true attrs);
10631063+ t.mode <- Insertion_mode.In_frameset
10641064+ end
10651065+ | Token.EOF ->
10661066+ if t.template_modes <> [] then
10671067+ process_in_template t token
10681068+ else begin
10691069+ let has_unclosed = List.exists (fun n ->
10701070+ not (List.mem n.Dom.name ["dd"; "dt"; "li"; "optgroup"; "option"; "p"; "rb"; "rp"; "rt"; "rtc"; "tbody"; "td"; "tfoot"; "th"; "thead"; "tr"; "body"; "html"])
10711071+ ) t.open_elements in
10721072+ if has_unclosed then parse_error t "expected-closing-tag-but-got-eof"
10731073+ end
10741074+ | Token.Tag { kind = Token.End; name = "body"; _ } ->
10751075+ if not (has_element_in_scope t "body") then
10761076+ parse_error t "unexpected-end-tag"
10771077+ else begin
10781078+ let has_unclosed = List.exists (fun n ->
10791079+ not (List.mem n.Dom.name ["dd"; "dt"; "li"; "optgroup"; "option"; "p"; "rb"; "rp"; "rt"; "rtc"; "tbody"; "td"; "tfoot"; "th"; "thead"; "tr"; "body"; "html"])
10801080+ ) t.open_elements in
10811081+ if has_unclosed then parse_error t "end-tag-too-early";
10821082+ t.mode <- Insertion_mode.After_body
10831083+ end
10841084+ | Token.Tag { kind = Token.End; name = "html"; _ } ->
10851085+ if not (has_element_in_scope t "body") then
10861086+ parse_error t "unexpected-end-tag"
10871087+ else begin
10881088+ t.mode <- Insertion_mode.After_body;
10891089+ process_token t token
10901090+ end
10911091+ | Token.Tag { kind = Token.Start; name; attrs; _ }
10921092+ when List.mem name ["address"; "article"; "aside"; "blockquote"; "center"; "details"; "dialog"; "dir"; "div"; "dl"; "fieldset"; "figcaption"; "figure"; "footer"; "header"; "hgroup"; "main"; "menu"; "nav"; "ol"; "p"; "search"; "section"; "summary"; "ul"] ->
10931093+ if has_element_in_button_scope t "p" then close_p_element t;
10941094+ ignore (insert_element t name ~push:true attrs)
10951095+ | Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name Constants.heading_elements ->
10961096+ if has_element_in_button_scope t "p" then close_p_element t;
10971097+ (match current_node t with
10981098+ | Some n when List.mem n.Dom.name Constants.heading_elements ->
10991099+ parse_error t "unexpected-start-tag";
11001100+ pop_current t
11011101+ | _ -> ());
11021102+ ignore (insert_element t name ~push:true attrs)
11031103+ | Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name ["pre"; "listing"] ->
11041104+ if has_element_in_button_scope t "p" then close_p_element t;
11051105+ ignore (insert_element t name ~push:true attrs);
11061106+ t.ignore_lf <- true;
11071107+ t.frameset_ok <- false
11081108+ | Token.Tag { kind = Token.Start; name = "form"; attrs; _ } ->
11091109+ if t.form_element <> None && not (List.exists (fun n -> n.Dom.name = "template") t.open_elements) then
11101110+ parse_error t "unexpected-start-tag"
11111111+ else begin
11121112+ if has_element_in_button_scope t "p" then close_p_element t;
11131113+ let form = insert_element t "form" attrs in
11141114+ t.open_elements <- form :: t.open_elements;
11151115+ if not (List.exists (fun n -> n.Dom.name = "template") t.open_elements) then
11161116+ t.form_element <- Some form
11171117+ end
11181118+ | Token.Tag { kind = Token.Start; name = "li"; attrs; _ } ->
11191119+ t.frameset_ok <- false;
11201120+ let rec check = function
11211121+ | [] -> ()
11221122+ | n :: rest ->
11231123+ if n.Dom.name = "li" then begin
11241124+ generate_implied_end_tags t ~except:"li" ();
11251125+ (match current_node t with
11261126+ | Some cn when cn.Dom.name <> "li" -> parse_error t "unexpected-start-tag"
11271127+ | _ -> ());
11281128+ pop_until_tag t "li"
11291129+ end else if is_special_element n && not (List.mem (String.lowercase_ascii n.Dom.name) ["address"; "div"; "p"]) then
11301130+ ()
11311131+ else
11321132+ check rest
11331133+ in
11341134+ check t.open_elements;
11351135+ if has_element_in_button_scope t "p" then close_p_element t;
11361136+ ignore (insert_element t "li" ~push:true attrs)
11371137+ | Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name ["dd"; "dt"] ->
11381138+ t.frameset_ok <- false;
11391139+ let rec check = function
11401140+ | [] -> ()
11411141+ | n :: rest ->
11421142+ if List.mem n.Dom.name ["dd"; "dt"] then begin
11431143+ generate_implied_end_tags t ~except:n.Dom.name ();
11441144+ (match current_node t with
11451145+ | Some cn when cn.Dom.name <> n.Dom.name -> parse_error t "unexpected-start-tag"
11461146+ | _ -> ());
11471147+ pop_until_one_of t ["dd"; "dt"]
11481148+ end else if is_special_element n && not (List.mem (String.lowercase_ascii n.Dom.name) ["address"; "div"; "p"]) then
11491149+ ()
11501150+ else
11511151+ check rest
11521152+ in
11531153+ check t.open_elements;
11541154+ if has_element_in_button_scope t "p" then close_p_element t;
11551155+ ignore (insert_element t name ~push:true attrs)
11561156+ | Token.Tag { kind = Token.Start; name = "plaintext"; _ } ->
11571157+ if has_element_in_button_scope t "p" then close_p_element t;
11581158+ ignore (insert_element t "plaintext" ~push:true [])
11591159+ (* Tokenizer should switch to PLAINTEXT state *)
11601160+ | Token.Tag { kind = Token.Start; name = "button"; attrs; _ } ->
11611161+ if has_element_in_scope t "button" then begin
11621162+ parse_error t "unexpected-start-tag";
11631163+ generate_implied_end_tags t ();
11641164+ pop_until_tag t "button"
11651165+ end;
11661166+ reconstruct_active_formatting t;
11671167+ ignore (insert_element t "button" ~push:true attrs);
11681168+ t.frameset_ok <- false
11691169+ | Token.Tag { kind = Token.End; name; _ }
11701170+ when List.mem name ["address"; "article"; "aside"; "blockquote"; "button"; "center"; "details"; "dialog"; "dir"; "div"; "dl"; "fieldset"; "figcaption"; "figure"; "footer"; "header"; "hgroup"; "listing"; "main"; "menu"; "nav"; "ol"; "pre"; "search"; "section"; "summary"; "ul"] ->
11711171+ if not (has_element_in_scope t name) then
11721172+ parse_error t "unexpected-end-tag"
11731173+ else begin
11741174+ generate_implied_end_tags t ();
11751175+ (match current_node t with
11761176+ | Some n when n.Dom.name <> name -> parse_error t "end-tag-too-early"
11771177+ | _ -> ());
11781178+ pop_until_tag t name
11791179+ end
11801180+ | Token.Tag { kind = Token.End; name = "form"; _ } ->
11811181+ if not (List.exists (fun n -> n.Dom.name = "template") t.open_elements) then begin
11821182+ let node = t.form_element in
11831183+ t.form_element <- None;
11841184+ match node with
11851185+ | None -> parse_error t "unexpected-end-tag"
11861186+ | Some form_node ->
11871187+ if not (has_element_in_scope t "form") then
11881188+ parse_error t "unexpected-end-tag"
11891189+ else begin
11901190+ generate_implied_end_tags t ();
11911191+ (match current_node t with
11921192+ | Some n when n != form_node -> parse_error t "end-tag-too-early"
11931193+ | _ -> ());
11941194+ t.open_elements <- List.filter (fun n -> n != form_node) t.open_elements
11951195+ end
11961196+ end else begin
11971197+ if not (has_element_in_scope t "form") then
11981198+ parse_error t "unexpected-end-tag"
11991199+ else begin
12001200+ generate_implied_end_tags t ();
12011201+ (match current_node t with
12021202+ | Some n when n.Dom.name <> "form" -> parse_error t "end-tag-too-early"
12031203+ | _ -> ());
12041204+ pop_until_tag t "form"
12051205+ end
12061206+ end
12071207+ | Token.Tag { kind = Token.End; name = "p"; _ } ->
12081208+ if not (has_element_in_button_scope t "p") then begin
12091209+ parse_error t "unexpected-end-tag";
12101210+ ignore (insert_element t "p" ~push:true [])
12111211+ end;
12121212+ close_p_element t
12131213+ | Token.Tag { kind = Token.End; name = "li"; _ } ->
12141214+ if not (has_element_in_list_item_scope t "li") then
12151215+ parse_error t "unexpected-end-tag"
12161216+ else begin
12171217+ generate_implied_end_tags t ~except:"li" ();
12181218+ (match current_node t with
12191219+ | Some n when n.Dom.name <> "li" -> parse_error t "end-tag-too-early"
12201220+ | _ -> ());
12211221+ pop_until_tag t "li"
12221222+ end
12231223+ | Token.Tag { kind = Token.End; name; _ } when List.mem name ["dd"; "dt"] ->
12241224+ if not (has_element_in_scope t name) then
12251225+ parse_error t "unexpected-end-tag"
12261226+ else begin
12271227+ generate_implied_end_tags t ~except:name ();
12281228+ (match current_node t with
12291229+ | Some n when n.Dom.name <> name -> parse_error t "end-tag-too-early"
12301230+ | _ -> ());
12311231+ pop_until_tag t name
12321232+ end
12331233+ | Token.Tag { kind = Token.End; name; _ } when List.mem name Constants.heading_elements ->
12341234+ if not (has_element_in_scope_impl t Constants.heading_elements Constants.default_scope ~check_integration_points:true) then
12351235+ parse_error t "unexpected-end-tag"
12361236+ else begin
12371237+ generate_implied_end_tags t ();
12381238+ (match current_node t with
12391239+ | Some n when n.Dom.name <> name -> parse_error t "end-tag-too-early"
12401240+ | _ -> ());
12411241+ pop_until_one_of t Constants.heading_elements
12421242+ end
12431243+ | Token.Tag { kind = Token.Start; name = "a"; attrs; _ } ->
12441244+ (* Check for existing <a> in active formatting *)
12451245+ let rec find_a = function
12461246+ | [] -> None
12471247+ | Marker :: _ -> None
12481248+ | Entry e :: _ when e.name = "a" -> Some e.node
12491249+ | _ :: rest -> find_a rest
12501250+ in
12511251+ (match find_a t.active_formatting with
12521252+ | Some existing ->
12531253+ parse_error t "unexpected-start-tag";
12541254+ adoption_agency t "a";
12551255+ t.active_formatting <- List.filter (function
12561256+ | Entry e -> e.node != existing
12571257+ | _ -> true
12581258+ ) t.active_formatting;
12591259+ t.open_elements <- List.filter (fun n -> n != existing) t.open_elements
12601260+ | None -> ());
12611261+ reconstruct_active_formatting t;
12621262+ let node = insert_element t "a" attrs in
12631263+ t.open_elements <- node :: t.open_elements;
12641264+ push_formatting_element t node "a" attrs
12651265+ | Token.Tag { kind = Token.Start; name; attrs; _ }
12661266+ when List.mem name ["b"; "big"; "code"; "em"; "font"; "i"; "s"; "small"; "strike"; "strong"; "tt"; "u"] ->
12671267+ reconstruct_active_formatting t;
12681268+ let node = insert_element t name attrs in
12691269+ t.open_elements <- node :: t.open_elements;
12701270+ push_formatting_element t node name attrs
12711271+ | Token.Tag { kind = Token.Start; name = "nobr"; attrs; _ } ->
12721272+ if has_element_in_scope t "nobr" then begin
12731273+ parse_error t "unexpected-start-tag";
12741274+ adoption_agency t "nobr";
12751275+ (* Remove nobr from active formatting *)
12761276+ t.active_formatting <- List.filter (function
12771277+ | Entry e -> e.name <> "nobr"
12781278+ | Marker -> true
12791279+ ) t.active_formatting;
12801280+ (* Remove nobr from open elements *)
12811281+ t.open_elements <- List.filter (fun n -> n.Dom.name <> "nobr") t.open_elements
12821282+ end;
12831283+ reconstruct_active_formatting t;
12841284+ let node = insert_element t "nobr" attrs in
12851285+ t.open_elements <- node :: t.open_elements;
12861286+ push_formatting_element t node "nobr" attrs
12871287+ | Token.Tag { kind = Token.End; name; _ }
12881288+ when List.mem name ["a"; "b"; "big"; "code"; "em"; "font"; "i"; "nobr"; "s"; "small"; "strike"; "strong"; "tt"; "u"] ->
12891289+ adoption_agency t name
12901290+ | Token.Tag { kind = Token.Start; name; attrs; _ }
12911291+ when List.mem name ["applet"; "marquee"; "object"] ->
12921292+ reconstruct_active_formatting t;
12931293+ ignore (insert_element t name ~push:true attrs);
12941294+ push_formatting_marker t;
12951295+ t.frameset_ok <- false
12961296+ | Token.Tag { kind = Token.End; name; _ }
12971297+ when List.mem name ["applet"; "marquee"; "object"] ->
12981298+ if not (has_element_in_scope t name) then
12991299+ parse_error t "unexpected-end-tag"
13001300+ else begin
13011301+ generate_implied_end_tags t ();
13021302+ (match current_node t with
13031303+ | Some n when n.Dom.name <> name -> parse_error t "end-tag-too-early"
13041304+ | _ -> ());
13051305+ pop_until_tag t name;
13061306+ clear_active_formatting_to_marker t
13071307+ end
13081308+ | Token.Tag { kind = Token.Start; name = "table"; attrs; _ } ->
13091309+ if t.quirks_mode <> Dom.Quirks && has_element_in_button_scope t "p" then
13101310+ close_p_element t;
13111311+ ignore (insert_element t "table" ~push:true attrs);
13121312+ t.frameset_ok <- false;
13131313+ t.mode <- Insertion_mode.In_table
13141314+ | Token.Tag { kind = Token.End; name = "br"; _ } ->
13151315+ parse_error t "unexpected-end-tag";
13161316+ reconstruct_active_formatting t;
13171317+ ignore (insert_element t "br" ~push:true []);
13181318+ pop_current t;
13191319+ t.frameset_ok <- false
13201320+ | Token.Tag { kind = Token.Start; name; attrs; _ }
13211321+ when List.mem name ["area"; "br"; "embed"; "img"; "keygen"; "wbr"] ->
13221322+ reconstruct_active_formatting t;
13231323+ ignore (insert_element t name ~push:true attrs);
13241324+ pop_current t;
13251325+ t.frameset_ok <- false
13261326+ | Token.Tag { kind = Token.Start; name = "input"; attrs; _ } ->
13271327+ reconstruct_active_formatting t;
13281328+ ignore (insert_element t "input" ~push:true attrs);
13291329+ pop_current t;
13301330+ let is_hidden = List.exists (fun (k, v) ->
13311331+ String.lowercase_ascii k = "type" && String.lowercase_ascii v = "hidden"
13321332+ ) attrs in
13331333+ if not is_hidden then t.frameset_ok <- false
13341334+ | Token.Tag { kind = Token.Start; name; _ }
13351335+ when List.mem name ["param"; "source"; "track"] ->
13361336+ ignore (insert_element_for_token t { kind = Token.Start; name; attrs = []; self_closing = false });
13371337+ pop_current t
13381338+ | Token.Tag { kind = Token.Start; name = "hr"; _ } ->
13391339+ if has_element_in_button_scope t "p" then close_p_element t;
13401340+ ignore (insert_element t "hr" ~push:true []);
13411341+ pop_current t;
13421342+ t.frameset_ok <- false
13431343+ | Token.Tag { kind = Token.Start; name = "image"; attrs; _ } ->
13441344+ parse_error t "unexpected-start-tag";
13451345+ (* Treat <image> as <img> *)
13461346+ reconstruct_active_formatting t;
13471347+ ignore (insert_element t "img" ~push:true attrs);
13481348+ pop_current t;
13491349+ t.frameset_ok <- false
13501350+ | Token.Tag { kind = Token.Start; name = "textarea"; attrs; _ } ->
13511351+ ignore (insert_element t "textarea" ~push:true attrs);
13521352+ t.ignore_lf <- true;
13531353+ t.original_mode <- Some t.mode;
13541354+ t.frameset_ok <- false;
13551355+ t.mode <- Insertion_mode.Text
13561356+ | Token.Tag { kind = Token.Start; name = "xmp"; _ } ->
13571357+ if has_element_in_button_scope t "p" then close_p_element t;
13581358+ reconstruct_active_formatting t;
13591359+ t.frameset_ok <- false;
13601360+ ignore (insert_element_for_token t { kind = Token.Start; name = "xmp"; attrs = []; self_closing = false });
13611361+ t.original_mode <- Some t.mode;
13621362+ t.mode <- Insertion_mode.Text
13631363+ | Token.Tag { kind = Token.Start; name = "iframe"; _ } ->
13641364+ t.frameset_ok <- false;
13651365+ ignore (insert_element_for_token t { kind = Token.Start; name = "iframe"; attrs = []; self_closing = false });
13661366+ t.original_mode <- Some t.mode;
13671367+ t.mode <- Insertion_mode.Text
13681368+ | Token.Tag { kind = Token.Start; name = "noembed"; _ } ->
13691369+ ignore (insert_element_for_token t { kind = Token.Start; name = "noembed"; attrs = []; self_closing = false });
13701370+ t.original_mode <- Some t.mode;
13711371+ t.mode <- Insertion_mode.Text
13721372+ | Token.Tag { kind = Token.Start; name = "select"; attrs; _ } ->
13731373+ reconstruct_active_formatting t;
13741374+ ignore (insert_element t "select" ~push:true attrs);
13751375+ t.frameset_ok <- false;
13761376+ if List.mem t.mode [Insertion_mode.In_table; Insertion_mode.In_caption; Insertion_mode.In_table_body; Insertion_mode.In_row; Insertion_mode.In_cell] then
13771377+ t.mode <- Insertion_mode.In_select_in_table
13781378+ else
13791379+ t.mode <- Insertion_mode.In_select
13801380+ | Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name ["optgroup"; "option"] ->
13811381+ (match current_node t with
13821382+ | Some n when n.Dom.name = "option" -> pop_current t
13831383+ | _ -> ());
13841384+ reconstruct_active_formatting t;
13851385+ ignore (insert_element t name ~push:true attrs)
13861386+ | Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name ["rb"; "rtc"] ->
13871387+ if has_element_in_scope t "ruby" then begin
13881388+ generate_implied_end_tags t ()
13891389+ end;
13901390+ (match current_node t with
13911391+ | Some n when n.Dom.name <> "ruby" && n.Dom.name <> "rtc" -> parse_error t "unexpected-start-tag"
13921392+ | _ -> ());
13931393+ ignore (insert_element t name ~push:true attrs)
13941394+ | Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name ["rp"; "rt"] ->
13951395+ if has_element_in_scope t "ruby" then begin
13961396+ generate_implied_end_tags t ~except:"rtc" ()
13971397+ end;
13981398+ (match current_node t with
13991399+ | Some n when n.Dom.name <> "ruby" && n.Dom.name <> "rtc" -> parse_error t "unexpected-start-tag"
14001400+ | _ -> ());
14011401+ ignore (insert_element t name ~push:true attrs)
14021402+ | Token.Tag { kind = Token.Start; name = "math"; attrs; self_closing } ->
14031403+ reconstruct_active_formatting t;
14041404+ let adjusted_attrs = Constants.adjust_mathml_attrs (Constants.adjust_foreign_attrs attrs) in
14051405+ ignore (insert_foreign_element t { kind = Token.Start; name = "math"; attrs = adjusted_attrs; self_closing } (Some "mathml"));
14061406+ if self_closing then pop_current t
14071407+ | Token.Tag { kind = Token.Start; name = "svg"; attrs; self_closing } ->
14081408+ reconstruct_active_formatting t;
14091409+ let adjusted_attrs = Constants.adjust_svg_attrs (Constants.adjust_foreign_attrs attrs) in
14101410+ ignore (insert_foreign_element t { kind = Token.Start; name = "svg"; attrs = adjusted_attrs; self_closing } (Some "svg"));
14111411+ if self_closing then pop_current t
14121412+ | Token.Tag { kind = Token.Start; name; attrs; _ }
14131413+ when List.mem name ["col"; "frame"] ->
14141414+ (* In fragment context, insert these; otherwise ignore *)
14151415+ if t.fragment_context = None then
14161416+ parse_error t "unexpected-start-tag-ignored"
14171417+ else
14181418+ ignore (insert_element t name attrs)
14191419+ | Token.Tag { kind = Token.Start; name; _ }
14201420+ when List.mem name ["caption"; "colgroup"; "head"; "tbody"; "td"; "tfoot"; "th"; "thead"; "tr"] ->
14211421+ parse_error t "unexpected-start-tag"
14221422+ | Token.Tag { kind = Token.Start; name; attrs; _ } ->
14231423+ (* Any other start tag *)
14241424+ reconstruct_active_formatting t;
14251425+ ignore (insert_element t name ~push:true attrs)
14261426+ | Token.Tag { kind = Token.End; name; _ } ->
14271427+ (* Any other end tag *)
14281428+ let rec check = function
14291429+ | [] -> ()
14301430+ | node :: rest ->
14311431+ if node.Dom.name = name then begin
14321432+ generate_implied_end_tags t ~except:name ();
14331433+ (match current_node t with
14341434+ | Some n when n.Dom.name <> name -> parse_error t "end-tag-too-early"
14351435+ | _ -> ());
14361436+ pop_until t (fun n -> n == node)
14371437+ end else if is_special_element node then
14381438+ parse_error t "unexpected-end-tag"
14391439+ else
14401440+ check rest
14411441+ in
14421442+ check t.open_elements
14431443+14441444+and process_text t token =
14451445+ match token with
14461446+ | Token.Character data ->
14471447+ insert_character t data
14481448+ | Token.EOF ->
14491449+ parse_error t "expected-closing-tag-but-got-eof";
14501450+ pop_current t;
14511451+ t.mode <- Option.value t.original_mode ~default:Insertion_mode.In_body;
14521452+ process_token t token
14531453+ | Token.Tag { kind = Token.End; _ } ->
14541454+ pop_current t;
14551455+ t.mode <- Option.value t.original_mode ~default:Insertion_mode.In_body
14561456+ | _ -> ()
14571457+14581458+and process_in_table t token =
14591459+ match token with
14601460+ | Token.Character _ when (match current_node t with Some n -> List.mem n.Dom.name ["table"; "tbody"; "tfoot"; "thead"; "tr"] | None -> false) ->
14611461+ t.pending_table_chars <- [];
14621462+ t.original_mode <- Some t.mode;
14631463+ t.mode <- Insertion_mode.In_table_text;
14641464+ process_token t token
14651465+ | Token.Comment data ->
14661466+ insert_comment t data
14671467+ | Token.Doctype _ ->
14681468+ parse_error t "unexpected-doctype"
14691469+ | Token.Tag { kind = Token.Start; name = "caption"; attrs; _ } ->
14701470+ clear_stack_back_to_table_context t;
14711471+ push_formatting_marker t;
14721472+ ignore (insert_element t "caption" ~push:true attrs);
14731473+ t.mode <- Insertion_mode.In_caption
14741474+ | Token.Tag { kind = Token.Start; name = "colgroup"; attrs; _ } ->
14751475+ clear_stack_back_to_table_context t;
14761476+ ignore (insert_element t "colgroup" ~push:true attrs);
14771477+ t.mode <- Insertion_mode.In_column_group
14781478+ | Token.Tag { kind = Token.Start; name = "col"; _ } ->
14791479+ clear_stack_back_to_table_context t;
14801480+ ignore (insert_element t "colgroup" ~push:true []);
14811481+ t.mode <- Insertion_mode.In_column_group;
14821482+ process_token t token
14831483+ | Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name ["tbody"; "tfoot"; "thead"] ->
14841484+ clear_stack_back_to_table_context t;
14851485+ ignore (insert_element t name ~push:true attrs);
14861486+ t.mode <- Insertion_mode.In_table_body
14871487+ | Token.Tag { kind = Token.Start; name; _ } when List.mem name ["td"; "th"; "tr"] ->
14881488+ clear_stack_back_to_table_context t;
14891489+ ignore (insert_element t "tbody" ~push:true []);
14901490+ t.mode <- Insertion_mode.In_table_body;
14911491+ process_token t token
14921492+ | Token.Tag { kind = Token.Start; name = "table"; _ } ->
14931493+ parse_error t "unexpected-start-tag";
14941494+ if has_element_in_table_scope t "table" then begin
14951495+ pop_until_tag t "table";
14961496+ reset_insertion_mode t;
14971497+ process_token t token
14981498+ end
14991499+ | Token.Tag { kind = Token.End; name = "table"; _ } ->
15001500+ if not (has_element_in_table_scope t "table") then
15011501+ parse_error t "unexpected-end-tag"
15021502+ else begin
15031503+ pop_until_tag t "table";
15041504+ reset_insertion_mode t
15051505+ end
15061506+ | Token.Tag { kind = Token.End; name; _ }
15071507+ when List.mem name ["body"; "caption"; "col"; "colgroup"; "html"; "tbody"; "td"; "tfoot"; "th"; "thead"; "tr"] ->
15081508+ parse_error t "unexpected-end-tag"
15091509+ | Token.Tag { kind = Token.Start; name; _ } when List.mem name ["style"; "script"; "template"] ->
15101510+ process_in_head t token
15111511+ | Token.Tag { kind = Token.End; name = "template"; _ } ->
15121512+ process_in_head t token
15131513+ | Token.Tag { kind = Token.Start; name = "input"; attrs; _ } ->
15141514+ let is_hidden = List.exists (fun (k, v) ->
15151515+ String.lowercase_ascii k = "type" && String.lowercase_ascii v = "hidden"
15161516+ ) attrs in
15171517+ if not is_hidden then begin
15181518+ parse_error t "unexpected-start-tag";
15191519+ t.foster_parenting <- true;
15201520+ process_in_body t token;
15211521+ t.foster_parenting <- false
15221522+ end else begin
15231523+ parse_error t "unexpected-start-tag";
15241524+ ignore (insert_element t "input" ~push:true attrs);
15251525+ pop_current t
15261526+ end
15271527+ | Token.Tag { kind = Token.Start; name = "form"; attrs; _ } ->
15281528+ parse_error t "unexpected-start-tag";
15291529+ if t.form_element = None && not (List.exists (fun n -> n.Dom.name = "template") t.open_elements) then begin
15301530+ let form = insert_element t "form" attrs in
15311531+ t.open_elements <- form :: t.open_elements;
15321532+ t.form_element <- Some form;
15331533+ pop_current t
15341534+ end
15351535+ | Token.EOF ->
15361536+ process_in_body t token
15371537+ | _ ->
15381538+ parse_error t "unexpected-token-in-table";
15391539+ t.foster_parenting <- true;
15401540+ process_in_body t token;
15411541+ t.foster_parenting <- false
15421542+15431543+and clear_stack_back_to_table_context t =
15441544+ let rec loop () =
15451545+ match current_node t with
15461546+ | Some n when not (List.mem n.Dom.name ["table"; "template"; "html"]) ->
15471547+ pop_current t;
15481548+ loop ()
15491549+ | _ -> ()
15501550+ in
15511551+ loop ()
15521552+15531553+and process_in_table_text t token =
15541554+ match token with
15551555+ | Token.Character data ->
15561556+ if String.contains data '\x00' then
15571557+ parse_error t "unexpected-null-character"
15581558+ else
15591559+ t.pending_table_chars <- data :: t.pending_table_chars
15601560+ | _ ->
15611561+ let pending = String.concat "" (List.rev t.pending_table_chars) in
15621562+ t.pending_table_chars <- [];
15631563+ if not (is_whitespace pending) then begin
15641564+ parse_error t "unexpected-character-in-table";
15651565+ t.foster_parenting <- true;
15661566+ reconstruct_active_formatting t;
15671567+ insert_character t pending;
15681568+ t.foster_parenting <- false
15691569+ end else
15701570+ insert_character t pending;
15711571+ t.mode <- Option.value t.original_mode ~default:Insertion_mode.In_table;
15721572+ process_token t token
15731573+15741574+and process_in_caption t token =
15751575+ match token with
15761576+ | Token.Tag { kind = Token.End; name = "caption"; _ } ->
15771577+ if not (has_element_in_table_scope t "caption") then
15781578+ parse_error t "unexpected-end-tag"
15791579+ else begin
15801580+ generate_implied_end_tags t ();
15811581+ (match current_node t with
15821582+ | Some n when n.Dom.name <> "caption" -> parse_error t "end-tag-too-early"
15831583+ | _ -> ());
15841584+ pop_until_tag t "caption";
15851585+ clear_active_formatting_to_marker t;
15861586+ t.mode <- Insertion_mode.In_table
15871587+ end
15881588+ | Token.Tag { kind = Token.Start; name; _ }
15891589+ when List.mem name ["caption"; "col"; "colgroup"; "tbody"; "td"; "tfoot"; "th"; "thead"; "tr"] ->
15901590+ if not (has_element_in_table_scope t "caption") then
15911591+ parse_error t "unexpected-start-tag"
15921592+ else begin
15931593+ generate_implied_end_tags t ();
15941594+ pop_until_tag t "caption";
15951595+ clear_active_formatting_to_marker t;
15961596+ t.mode <- Insertion_mode.In_table;
15971597+ process_token t token
15981598+ end
15991599+ | Token.Tag { kind = Token.End; name = "table"; _ } ->
16001600+ if not (has_element_in_table_scope t "caption") then
16011601+ parse_error t "unexpected-end-tag"
16021602+ else begin
16031603+ generate_implied_end_tags t ();
16041604+ pop_until_tag t "caption";
16051605+ clear_active_formatting_to_marker t;
16061606+ t.mode <- Insertion_mode.In_table;
16071607+ process_token t token
16081608+ end
16091609+ | Token.Tag { kind = Token.End; name; _ }
16101610+ when List.mem name ["body"; "col"; "colgroup"; "html"; "tbody"; "td"; "tfoot"; "th"; "thead"; "tr"] ->
16111611+ parse_error t "unexpected-end-tag"
16121612+ | _ ->
16131613+ process_in_body t token
16141614+16151615+and process_in_column_group t token =
16161616+ match token with
16171617+ | Token.Character data when is_whitespace data ->
16181618+ insert_character t data
16191619+ | Token.Character data ->
16201620+ (* Split leading whitespace from non-whitespace *)
16211621+ let ws_chars = [' '; '\t'; '\n'; '\x0C'; '\r'] in
16221622+ let len = String.length data in
16231623+ let ws_end = ref 0 in
16241624+ while !ws_end < len && List.mem data.[!ws_end] ws_chars do incr ws_end done;
16251625+ if !ws_end > 0 then
16261626+ insert_character t (String.sub data 0 !ws_end);
16271627+ if !ws_end < len then begin
16281628+ let remaining = String.sub data !ws_end (len - !ws_end) in
16291629+ (match current_node t with
16301630+ | Some n when n.Dom.name = "colgroup" ->
16311631+ pop_current t;
16321632+ t.mode <- Insertion_mode.In_table;
16331633+ process_token t (Token.Character remaining)
16341634+ | _ ->
16351635+ parse_error t "unexpected-token")
16361636+ end
16371637+ | Token.Comment data ->
16381638+ insert_comment t data
16391639+ | Token.Doctype _ ->
16401640+ parse_error t "unexpected-doctype"
16411641+ | Token.Tag { kind = Token.Start; name = "html"; _ } ->
16421642+ process_in_body t token
16431643+ | Token.Tag { kind = Token.Start; name = "col"; attrs; _ } ->
16441644+ ignore (insert_element t "col" ~push:true attrs);
16451645+ pop_current t
16461646+ | Token.Tag { kind = Token.End; name = "colgroup"; _ } ->
16471647+ (match current_node t with
16481648+ | Some n when n.Dom.name <> "colgroup" -> parse_error t "unexpected-end-tag"
16491649+ | Some _ -> pop_current t; t.mode <- Insertion_mode.In_table
16501650+ | None -> parse_error t "unexpected-end-tag")
16511651+ | Token.Tag { kind = Token.End; name = "col"; _ } ->
16521652+ parse_error t "unexpected-end-tag"
16531653+ | Token.Tag { kind = Token.Start; name = "template"; _ }
16541654+ | Token.Tag { kind = Token.End; name = "template"; _ } ->
16551655+ process_in_head t token
16561656+ | Token.EOF ->
16571657+ process_in_body t token
16581658+ | _ ->
16591659+ (match current_node t with
16601660+ | Some n when n.Dom.name = "colgroup" ->
16611661+ pop_current t;
16621662+ t.mode <- Insertion_mode.In_table;
16631663+ process_token t token
16641664+ | _ ->
16651665+ parse_error t "unexpected-token")
16661666+16671667+and process_in_table_body t token =
16681668+ match token with
16691669+ | Token.Tag { kind = Token.Start; name = "tr"; attrs; _ } ->
16701670+ clear_stack_back_to_table_body_context t;
16711671+ ignore (insert_element t "tr" ~push:true attrs);
16721672+ t.mode <- Insertion_mode.In_row
16731673+ | Token.Tag { kind = Token.Start; name; _ } when List.mem name ["th"; "td"] ->
16741674+ parse_error t "unexpected-start-tag";
16751675+ clear_stack_back_to_table_body_context t;
16761676+ ignore (insert_element t "tr" ~push:true []);
16771677+ t.mode <- Insertion_mode.In_row;
16781678+ process_token t token
16791679+ | Token.Tag { kind = Token.End; name; _ } when List.mem name ["tbody"; "tfoot"; "thead"] ->
16801680+ if not (has_element_in_table_scope t name) then
16811681+ parse_error t "unexpected-end-tag"
16821682+ else begin
16831683+ clear_stack_back_to_table_body_context t;
16841684+ pop_current t;
16851685+ t.mode <- Insertion_mode.In_table
16861686+ end
16871687+ | Token.Tag { kind = Token.Start; name; _ }
16881688+ when List.mem name ["caption"; "col"; "colgroup"; "tbody"; "tfoot"; "thead"] ->
16891689+ if not (has_element_in_scope_impl t ["tbody"; "tfoot"; "thead"] Constants.table_scope ~check_integration_points:false) then
16901690+ parse_error t "unexpected-start-tag"
16911691+ else begin
16921692+ clear_stack_back_to_table_body_context t;
16931693+ pop_current t;
16941694+ t.mode <- Insertion_mode.In_table;
16951695+ process_token t token
16961696+ end
16971697+ | Token.Tag { kind = Token.End; name = "table"; _ } ->
16981698+ if not (has_element_in_scope_impl t ["tbody"; "tfoot"; "thead"] Constants.table_scope ~check_integration_points:false) then
16991699+ parse_error t "unexpected-end-tag"
17001700+ else begin
17011701+ clear_stack_back_to_table_body_context t;
17021702+ pop_current t;
17031703+ t.mode <- Insertion_mode.In_table;
17041704+ process_token t token
17051705+ end
17061706+ | Token.Tag { kind = Token.End; name; _ }
17071707+ when List.mem name ["body"; "caption"; "col"; "colgroup"; "html"; "td"; "th"; "tr"] ->
17081708+ parse_error t "unexpected-end-tag"
17091709+ | _ ->
17101710+ process_in_table t token
17111711+17121712+and clear_stack_back_to_table_body_context t =
17131713+ let rec loop () =
17141714+ match current_node t with
17151715+ | Some n when not (List.mem n.Dom.name ["tbody"; "tfoot"; "thead"; "template"; "html"]) ->
17161716+ pop_current t;
17171717+ loop ()
17181718+ | _ -> ()
17191719+ in
17201720+ loop ()
17211721+17221722+and process_in_row t token =
17231723+ match token with
17241724+ | Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name ["th"; "td"] ->
17251725+ clear_stack_back_to_table_row_context t;
17261726+ ignore (insert_element t name ~push:true attrs);
17271727+ t.mode <- Insertion_mode.In_cell;
17281728+ push_formatting_marker t
17291729+ | Token.Tag { kind = Token.End; name = "tr"; _ } ->
17301730+ if not (has_element_in_table_scope t "tr") then
17311731+ parse_error t "unexpected-end-tag"
17321732+ else begin
17331733+ clear_stack_back_to_table_row_context t;
17341734+ pop_current t;
17351735+ t.mode <- Insertion_mode.In_table_body
17361736+ end
17371737+ | Token.Tag { kind = Token.Start; name; _ }
17381738+ when List.mem name ["caption"; "col"; "colgroup"; "tbody"; "tfoot"; "thead"; "tr"] ->
17391739+ if not (has_element_in_table_scope t "tr") then
17401740+ parse_error t "unexpected-start-tag"
17411741+ else begin
17421742+ clear_stack_back_to_table_row_context t;
17431743+ pop_current t;
17441744+ t.mode <- Insertion_mode.In_table_body;
17451745+ process_token t token
17461746+ end
17471747+ | Token.Tag { kind = Token.End; name = "table"; _ } ->
17481748+ if not (has_element_in_table_scope t "tr") then
17491749+ parse_error t "unexpected-end-tag"
17501750+ else begin
17511751+ clear_stack_back_to_table_row_context t;
17521752+ pop_current t;
17531753+ t.mode <- Insertion_mode.In_table_body;
17541754+ process_token t token
17551755+ end
17561756+ | Token.Tag { kind = Token.End; name; _ } when List.mem name ["tbody"; "tfoot"; "thead"] ->
17571757+ if not (has_element_in_table_scope t name) then
17581758+ parse_error t "unexpected-end-tag"
17591759+ else if not (has_element_in_table_scope t "tr") then
17601760+ parse_error t "unexpected-end-tag"
17611761+ else begin
17621762+ clear_stack_back_to_table_row_context t;
17631763+ pop_current t;
17641764+ t.mode <- Insertion_mode.In_table_body;
17651765+ process_token t token
17661766+ end
17671767+ | Token.Tag { kind = Token.End; name; _ }
17681768+ when List.mem name ["body"; "caption"; "col"; "colgroup"; "html"; "td"; "th"] ->
17691769+ parse_error t "unexpected-end-tag"
17701770+ | _ ->
17711771+ process_in_table t token
17721772+17731773+and clear_stack_back_to_table_row_context t =
17741774+ let rec loop () =
17751775+ match current_node t with
17761776+ | Some n when not (List.mem n.Dom.name ["tr"; "template"; "html"]) ->
17771777+ pop_current t;
17781778+ loop ()
17791779+ | _ -> ()
17801780+ in
17811781+ loop ()
17821782+17831783+and process_in_cell t token =
17841784+ match token with
17851785+ | Token.Tag { kind = Token.End; name; _ } when List.mem name ["td"; "th"] ->
17861786+ if not (has_element_in_table_scope t name) then
17871787+ parse_error t "unexpected-end-tag"
17881788+ else begin
17891789+ generate_implied_end_tags t ();
17901790+ (match current_node t with
17911791+ | Some n when not (n.Dom.name = name && is_in_html_namespace n) -> parse_error t "end-tag-too-early"
17921792+ | _ -> ());
17931793+ pop_until_html_tag t name;
17941794+ clear_active_formatting_to_marker t;
17951795+ t.mode <- Insertion_mode.In_row
17961796+ end
17971797+ | Token.Tag { kind = Token.Start; name; _ }
17981798+ when List.mem name ["caption"; "col"; "colgroup"; "tbody"; "td"; "tfoot"; "th"; "thead"; "tr"] ->
17991799+ if not (has_element_in_scope_impl t ["td"; "th"] Constants.table_scope ~check_integration_points:false) then
18001800+ parse_error t "unexpected-start-tag"
18011801+ else begin
18021802+ close_cell t;
18031803+ process_token t token
18041804+ end
18051805+ | Token.Tag { kind = Token.End; name; _ }
18061806+ when List.mem name ["body"; "caption"; "col"; "colgroup"; "html"] ->
18071807+ parse_error t "unexpected-end-tag"
18081808+ | Token.Tag { kind = Token.End; name; _ }
18091809+ when List.mem name ["table"; "tbody"; "tfoot"; "thead"; "tr"] ->
18101810+ if not (has_element_in_table_scope t name) then
18111811+ parse_error t "unexpected-end-tag"
18121812+ else begin
18131813+ close_cell t;
18141814+ process_token t token
18151815+ end
18161816+ | _ ->
18171817+ process_in_body t token
18181818+18191819+and close_cell t =
18201820+ generate_implied_end_tags t ();
18211821+ (match current_node t with
18221822+ | Some n when not (List.mem n.Dom.name ["td"; "th"] && is_in_html_namespace n) -> parse_error t "end-tag-too-early"
18231823+ | _ -> ());
18241824+ pop_until_html_one_of t ["td"; "th"];
18251825+ clear_active_formatting_to_marker t;
18261826+ t.mode <- Insertion_mode.In_row
18271827+18281828+and process_in_select t token =
18291829+ match token with
18301830+ | Token.Character "\x00" ->
18311831+ parse_error t "unexpected-null-character"
18321832+ | Token.Character data ->
18331833+ reconstruct_active_formatting t;
18341834+ insert_character t data
18351835+ | Token.Comment data ->
18361836+ insert_comment t data
18371837+ | Token.Doctype _ ->
18381838+ parse_error t "unexpected-doctype"
18391839+ | Token.Tag { kind = Token.Start; name = "html"; _ } ->
18401840+ process_in_body t token
18411841+ | Token.Tag { kind = Token.Start; name = "option"; attrs; _ } ->
18421842+ (match current_node t with
18431843+ | Some n when n.Dom.name = "option" -> pop_current t
18441844+ | _ -> ());
18451845+ reconstruct_active_formatting t;
18461846+ ignore (insert_element t "option" ~push:true attrs)
18471847+ | Token.Tag { kind = Token.Start; name = "optgroup"; attrs; _ } ->
18481848+ (match current_node t with
18491849+ | Some n when n.Dom.name = "option" -> pop_current t
18501850+ | _ -> ());
18511851+ (match current_node t with
18521852+ | Some n when n.Dom.name = "optgroup" -> pop_current t
18531853+ | _ -> ());
18541854+ reconstruct_active_formatting t;
18551855+ ignore (insert_element t "optgroup" ~push:true attrs)
18561856+ | Token.Tag { kind = Token.Start; name = "hr"; _ } ->
18571857+ (match current_node t with
18581858+ | Some n when n.Dom.name = "option" -> pop_current t
18591859+ | _ -> ());
18601860+ (match current_node t with
18611861+ | Some n when n.Dom.name = "optgroup" -> pop_current t
18621862+ | _ -> ());
18631863+ ignore (insert_element t "hr" ~push:true []);
18641864+ pop_current t
18651865+ | Token.Tag { kind = Token.End; name = "optgroup"; _ } ->
18661866+ (match t.open_elements with
18671867+ | opt :: optg :: _ when opt.Dom.name = "option" && optg.Dom.name = "optgroup" ->
18681868+ pop_current t
18691869+ | _ -> ());
18701870+ (match current_node t with
18711871+ | Some n when n.Dom.name = "optgroup" -> pop_current t
18721872+ | _ -> parse_error t "unexpected-end-tag")
18731873+ | Token.Tag { kind = Token.End; name = "option"; _ } ->
18741874+ (match current_node t with
18751875+ | Some n when n.Dom.name = "option" -> pop_current t
18761876+ | _ -> parse_error t "unexpected-end-tag")
18771877+ | Token.Tag { kind = Token.End; name = "select"; _ } ->
18781878+ if not (has_element_in_select_scope t "select") then
18791879+ parse_error t "unexpected-end-tag"
18801880+ else begin
18811881+ pop_until_tag t "select";
18821882+ reset_insertion_mode t
18831883+ end
18841884+ | Token.Tag { kind = Token.Start; name = "select"; _ } ->
18851885+ parse_error t "unexpected-start-tag";
18861886+ (* Per spec: in IN_SELECT mode, select is always on the stack - just pop *)
18871887+ pop_until_tag t "select";
18881888+ reset_insertion_mode t
18891889+ | Token.Tag { kind = Token.Start; name; _ } when List.mem name ["input"; "textarea"] ->
18901890+ parse_error t "unexpected-start-tag";
18911891+ (* Per spec: in IN_SELECT mode, select is always on the stack - just pop *)
18921892+ pop_until_tag t "select";
18931893+ reset_insertion_mode t;
18941894+ process_token t token
18951895+ | Token.Tag { kind = Token.Start; name = "plaintext"; attrs; _ } ->
18961896+ (* plaintext is allowed in select - creates element, parser will switch tokenizer to PLAINTEXT mode *)
18971897+ reconstruct_active_formatting t;
18981898+ ignore (insert_element t "plaintext" ~push:true attrs)
18991899+ | Token.Tag { kind = Token.Start; name = "menuitem"; attrs; _ } ->
19001900+ (* menuitem is allowed in select *)
19011901+ reconstruct_active_formatting t;
19021902+ ignore (insert_element t "menuitem" ~push:true attrs)
19031903+ | Token.Tag { kind = Token.Start; name = "keygen"; attrs; _ } ->
19041904+ (* keygen is handled specially in select - inserted directly *)
19051905+ reconstruct_active_formatting t;
19061906+ ignore (insert_element t "keygen" attrs)
19071907+ (* Void element, don't push to stack *)
19081908+ | Token.Tag { kind = Token.Start; name = "svg"; attrs; self_closing } ->
19091909+ reconstruct_active_formatting t;
19101910+ let node = insert_foreign_element t { kind = Token.Start; name = "svg"; attrs; self_closing } (Some "svg") in
19111911+ if not self_closing then t.open_elements <- node :: t.open_elements
19121912+ | Token.Tag { kind = Token.Start; name = "math"; attrs; self_closing } ->
19131913+ reconstruct_active_formatting t;
19141914+ let node = insert_foreign_element t { kind = Token.Start; name = "math"; attrs; self_closing } (Some "mathml") in
19151915+ if not self_closing then t.open_elements <- node :: t.open_elements
19161916+ | Token.Tag { kind = Token.Start; name; _ } when List.mem name ["script"; "template"] ->
19171917+ process_in_head t token
19181918+ | Token.Tag { kind = Token.End; name = "template"; _ } ->
19191919+ process_in_head t token
19201920+ (* Allow certain HTML elements in select - newer spec behavior *)
19211921+ | Token.Tag { kind = Token.Start; name; attrs; self_closing } when List.mem name ["p"; "div"; "span"; "button"; "datalist"; "selectedcontent"] ->
19221922+ reconstruct_active_formatting t;
19231923+ let node = insert_element t name attrs in
19241924+ if not self_closing then t.open_elements <- node :: t.open_elements
19251925+ | Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name ["br"; "img"] ->
19261926+ reconstruct_active_formatting t;
19271927+ ignore (insert_element t name attrs)
19281928+ (* Don't push to stack - void elements *)
19291929+ (* Handle formatting elements in select *)
19301930+ | Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name Constants.formatting_elements ->
19311931+ reconstruct_active_formatting t;
19321932+ let node = insert_element t name ~push:true attrs in
19331933+ push_formatting_element t node name attrs
19341934+ | Token.Tag { kind = Token.End; name; _ } when List.mem name Constants.formatting_elements ->
19351935+ (* Find select element and check if formatting element is inside select *)
19361936+ let select_idx = ref None in
19371937+ let fmt_idx = ref None in
19381938+ List.iteri (fun i n ->
19391939+ if n.Dom.name = "select" && !select_idx = None then select_idx := Some i;
19401940+ if n.Dom.name = name then fmt_idx := Some i
19411941+ ) t.open_elements;
19421942+ (match !fmt_idx, !select_idx with
19431943+ | Some fi, Some si when fi < si ->
19441944+ (* Formatting element is inside select, run adoption agency *)
19451945+ adoption_agency t name
19461946+ | Some _, Some _ ->
19471947+ (* Formatting element is outside select boundary - parse error, ignore *)
19481948+ parse_error t "unexpected-end-tag"
19491949+ | Some _, None ->
19501950+ adoption_agency t name
19511951+ | None, _ ->
19521952+ parse_error t "unexpected-end-tag")
19531953+ (* End tags for HTML elements allowed in select *)
19541954+ | Token.Tag { kind = Token.End; name; _ } when List.mem name ["p"; "div"; "span"; "button"; "datalist"; "selectedcontent"] ->
19551955+ (* Find select and target indices *)
19561956+ let select_idx = ref None in
19571957+ let target_idx = ref None in
19581958+ List.iteri (fun i n ->
19591959+ if n.Dom.name = "select" && !select_idx = None then select_idx := Some i;
19601960+ if n.Dom.name = name then target_idx := Some i
19611961+ ) t.open_elements;
19621962+ (* Only pop if target exists and is inside select (lower index = closer to current) *)
19631963+ (match !target_idx, !select_idx with
19641964+ | Some ti, Some si when ti < si ->
19651965+ (* Pop until we reach the target *)
19661966+ let rec pop_to_target () =
19671967+ match t.open_elements with
19681968+ | [] -> ()
19691969+ | n :: rest ->
19701970+ t.open_elements <- rest;
19711971+ if n.Dom.name <> name then pop_to_target ()
19721972+ in
19731973+ pop_to_target ()
19741974+ | Some _, Some _ ->
19751975+ parse_error t "unexpected-end-tag"
19761976+ | Some _, None ->
19771977+ (* No select on stack, just pop to target *)
19781978+ let rec pop_to_target () =
19791979+ match t.open_elements with
19801980+ | [] -> ()
19811981+ | n :: rest ->
19821982+ t.open_elements <- rest;
19831983+ if n.Dom.name <> name then pop_to_target ()
19841984+ in
19851985+ pop_to_target ()
19861986+ | None, _ ->
19871987+ parse_error t "unexpected-end-tag")
19881988+ | Token.EOF ->
19891989+ process_in_body t token
19901990+ | _ ->
19911991+ parse_error t "unexpected-token-in-select"
19921992+19931993+and process_in_select_in_table t token =
19941994+ match token with
19951995+ | Token.Tag { kind = Token.Start; name; _ }
19961996+ when List.mem name ["caption"; "table"; "tbody"; "tfoot"; "thead"; "tr"; "td"; "th"] ->
19971997+ parse_error t "unexpected-start-tag";
19981998+ pop_until_tag t "select";
19991999+ reset_insertion_mode t;
20002000+ process_token t token
20012001+ | Token.Tag { kind = Token.End; name; _ }
20022002+ when List.mem name ["caption"; "table"; "tbody"; "tfoot"; "thead"; "tr"; "td"; "th"] ->
20032003+ parse_error t "unexpected-end-tag";
20042004+ if has_element_in_table_scope t name then begin
20052005+ pop_until_tag t "select";
20062006+ reset_insertion_mode t;
20072007+ process_token t token
20082008+ end
20092009+ | _ ->
20102010+ process_in_select t token
20112011+20122012+and process_in_template t token =
20132013+ match token with
20142014+ | Token.Character _ | Token.Comment _ | Token.Doctype _ ->
20152015+ process_in_body t token
20162016+ | Token.Tag { kind = Token.Start; name; _ }
20172017+ when List.mem name ["base"; "basefont"; "bgsound"; "link"; "meta"; "noframes"; "script"; "style"; "template"; "title"] ->
20182018+ process_in_head t token
20192019+ | Token.Tag { kind = Token.End; name = "template"; _ } ->
20202020+ process_in_head t token
20212021+ | Token.Tag { kind = Token.Start; name; _ }
20222022+ when List.mem name ["caption"; "colgroup"; "tbody"; "tfoot"; "thead"] ->
20232023+ t.template_modes <- (match t.template_modes with _ :: rest -> rest | [] -> []);
20242024+ t.template_modes <- Insertion_mode.In_table :: t.template_modes;
20252025+ t.mode <- Insertion_mode.In_table;
20262026+ process_token t token
20272027+ | Token.Tag { kind = Token.Start; name = "col"; _ } ->
20282028+ t.template_modes <- (match t.template_modes with _ :: rest -> rest | [] -> []);
20292029+ t.template_modes <- Insertion_mode.In_column_group :: t.template_modes;
20302030+ t.mode <- Insertion_mode.In_column_group;
20312031+ process_token t token
20322032+ | Token.Tag { kind = Token.Start; name = "tr"; _ } ->
20332033+ t.template_modes <- (match t.template_modes with _ :: rest -> rest | [] -> []);
20342034+ t.template_modes <- Insertion_mode.In_table_body :: t.template_modes;
20352035+ t.mode <- Insertion_mode.In_table_body;
20362036+ process_token t token
20372037+ | Token.Tag { kind = Token.Start; name; _ } when List.mem name ["td"; "th"] ->
20382038+ t.template_modes <- (match t.template_modes with _ :: rest -> rest | [] -> []);
20392039+ t.template_modes <- Insertion_mode.In_row :: t.template_modes;
20402040+ t.mode <- Insertion_mode.In_row;
20412041+ process_token t token
20422042+ | Token.Tag { kind = Token.Start; _ } ->
20432043+ t.template_modes <- (match t.template_modes with _ :: rest -> rest | [] -> []);
20442044+ t.template_modes <- Insertion_mode.In_body :: t.template_modes;
20452045+ t.mode <- Insertion_mode.In_body;
20462046+ process_token t token
20472047+ | Token.Tag { kind = Token.End; _ } ->
20482048+ parse_error t "unexpected-end-tag"
20492049+ | Token.EOF ->
20502050+ if not (List.exists (fun n -> n.Dom.name = "template" && is_in_html_namespace n) t.open_elements) then
20512051+ () (* Stop parsing *)
20522052+ else begin
20532053+ parse_error t "expected-closing-tag-but-got-eof";
20542054+ pop_until_html_tag t "template";
20552055+ clear_active_formatting_to_marker t;
20562056+ t.template_modes <- (match t.template_modes with _ :: rest -> rest | [] -> []);
20572057+ reset_insertion_mode t;
20582058+ process_token t token
20592059+ end
20602060+20612061+and process_after_body t token =
20622062+ match token with
20632063+ | Token.Character data when is_whitespace data ->
20642064+ process_in_body t token
20652065+ | Token.Comment data ->
20662066+ (* Insert as last child of html element - html is at bottom of stack *)
20672067+ let html_opt = List.find_opt (fun n -> n.Dom.name = "html") t.open_elements in
20682068+ (match html_opt with
20692069+ | Some html -> Dom.append_child html (Dom.create_comment data)
20702070+ | None -> ())
20712071+ | Token.Doctype _ ->
20722072+ parse_error t "unexpected-doctype"
20732073+ | Token.Tag { kind = Token.Start; name = "html"; _ } ->
20742074+ process_in_body t token
20752075+ | Token.Tag { kind = Token.End; name = "html"; _ } ->
20762076+ if t.fragment_context <> None then
20772077+ parse_error t "unexpected-end-tag"
20782078+ else
20792079+ t.mode <- Insertion_mode.After_after_body
20802080+ | Token.EOF ->
20812081+ () (* Stop parsing *)
20822082+ | _ ->
20832083+ parse_error t "unexpected-token-after-body";
20842084+ t.mode <- Insertion_mode.In_body;
20852085+ process_token t token
20862086+20872087+and process_in_frameset t token =
20882088+ match token with
20892089+ | Token.Character data ->
20902090+ (* Extract only whitespace characters and insert them *)
20912091+ let whitespace = String.to_seq data
20922092+ |> Seq.filter (fun c -> List.mem c ['\t'; '\n'; '\x0C'; '\r'; ' '])
20932093+ |> String.of_seq in
20942094+ if whitespace <> "" then insert_character t whitespace;
20952095+ if not (is_whitespace data) then
20962096+ parse_error t "unexpected-char-in-frameset"
20972097+ | Token.Comment data ->
20982098+ insert_comment t data
20992099+ | Token.Doctype _ ->
21002100+ parse_error t "unexpected-doctype"
21012101+ | Token.Tag { kind = Token.Start; name = "html"; _ } ->
21022102+ process_in_body t token
21032103+ | Token.Tag { kind = Token.Start; name = "frameset"; attrs; _ } ->
21042104+ ignore (insert_element t "frameset" ~push:true attrs)
21052105+ | Token.Tag { kind = Token.End; name = "frameset"; _ } ->
21062106+ (match current_node t with
21072107+ | Some n when n.Dom.name = "html" -> parse_error t "unexpected-end-tag"
21082108+ | _ ->
21092109+ pop_current t;
21102110+ if t.fragment_context = None then
21112111+ (match current_node t with
21122112+ | Some n when n.Dom.name <> "frameset" -> t.mode <- Insertion_mode.After_frameset
21132113+ | _ -> ()))
21142114+ | Token.Tag { kind = Token.Start; name = "frame"; attrs; _ } ->
21152115+ ignore (insert_element t "frame" ~push:true attrs);
21162116+ pop_current t
21172117+ | Token.Tag { kind = Token.Start; name = "noframes"; _ } ->
21182118+ process_in_head t token
21192119+ | Token.EOF ->
21202120+ (match current_node t with
21212121+ | Some n when n.Dom.name <> "html" -> parse_error t "expected-closing-tag-but-got-eof"
21222122+ | _ -> ())
21232123+ | _ ->
21242124+ parse_error t "unexpected-token-in-frameset"
21252125+21262126+and process_after_frameset t token =
21272127+ match token with
21282128+ | Token.Character data ->
21292129+ (* Extract only whitespace characters and insert them *)
21302130+ let whitespace = String.to_seq data
21312131+ |> Seq.filter (fun c -> List.mem c ['\t'; '\n'; '\x0C'; '\r'; ' '])
21322132+ |> String.of_seq in
21332133+ if whitespace <> "" then insert_character t whitespace;
21342134+ if not (is_whitespace data) then
21352135+ parse_error t "unexpected-char-after-frameset"
21362136+ | Token.Comment data ->
21372137+ insert_comment t data
21382138+ | Token.Doctype _ ->
21392139+ parse_error t "unexpected-doctype"
21402140+ | Token.Tag { kind = Token.Start; name = "html"; _ } ->
21412141+ process_in_body t token
21422142+ | Token.Tag { kind = Token.End; name = "html"; _ } ->
21432143+ t.mode <- Insertion_mode.After_after_frameset
21442144+ | Token.Tag { kind = Token.Start; name = "noframes"; _ } ->
21452145+ process_in_head t token
21462146+ | Token.EOF ->
21472147+ () (* Stop parsing *)
21482148+ | _ ->
21492149+ parse_error t "unexpected-token-after-frameset"
21502150+21512151+and process_after_after_body t token =
21522152+ match token with
21532153+ | Token.Comment data ->
21542154+ insert_comment_to_document t data
21552155+ | Token.Doctype _ ->
21562156+ process_in_body t token
21572157+ | Token.Character data when is_whitespace data ->
21582158+ process_in_body t token
21592159+ | Token.Tag { kind = Token.Start; name = "html"; _ } ->
21602160+ process_in_body t token
21612161+ | Token.EOF ->
21622162+ () (* Stop parsing *)
21632163+ | _ ->
21642164+ parse_error t "unexpected-token-after-after-body";
21652165+ t.mode <- Insertion_mode.In_body;
21662166+ process_token t token
21672167+21682168+and process_after_after_frameset t token =
21692169+ match token with
21702170+ | Token.Comment data ->
21712171+ insert_comment_to_document t data
21722172+ | Token.Doctype _ ->
21732173+ process_in_body t token
21742174+ | Token.Character data ->
21752175+ (* Extract only whitespace characters and process using in_body rules *)
21762176+ let whitespace = String.to_seq data
21772177+ |> Seq.filter (fun c -> List.mem c ['\t'; '\n'; '\x0C'; '\r'; ' '])
21782178+ |> String.of_seq in
21792179+ if whitespace <> "" then process_in_body t (Token.Character whitespace);
21802180+ if not (is_whitespace data) then
21812181+ parse_error t "unexpected-char-after-after-frameset"
21822182+ | Token.Tag { kind = Token.Start; name = "html"; _ } ->
21832183+ process_in_body t token
21842184+ | Token.EOF ->
21852185+ () (* Stop parsing *)
21862186+ | Token.Tag { kind = Token.Start; name = "noframes"; _ } ->
21872187+ process_in_head t token
21882188+ | _ ->
21892189+ parse_error t "unexpected-token-after-after-frameset"
21902190+21912191+and process_token t token =
21922192+ (* Check for HTML integration points (SVG foreignObject, desc, title) *)
21932193+ let is_html_integration_point node =
21942194+ (* SVG foreignObject, desc, and title are always HTML integration points *)
21952195+ if node.Dom.namespace = Some "svg" &&
21962196+ List.mem node.Dom.name Constants.svg_html_integration then true
21972197+ (* annotation-xml is an HTML integration point only with specific encoding values *)
21982198+ else if node.Dom.namespace = Some "mathml" && node.Dom.name = "annotation-xml" then
21992199+ match List.assoc_opt "encoding" node.Dom.attrs with
22002200+ | Some enc ->
22012201+ let enc_lower = String.lowercase_ascii enc in
22022202+ enc_lower = "text/html" || enc_lower = "application/xhtml+xml"
22032203+ | None -> false
22042204+ else false
22052205+ in
22062206+ (* Check for MathML text integration points *)
22072207+ let is_mathml_text_integration_point node =
22082208+ node.Dom.namespace = Some "mathml" &&
22092209+ List.mem node.Dom.name ["mi"; "mo"; "mn"; "ms"; "mtext"]
22102210+ in
22112211+ (* Foreign content handling *)
22122212+ let in_foreign =
22132213+ match adjusted_current_node t with
22142214+ | None -> false
22152215+ | Some node ->
22162216+ if is_in_html_namespace node then false
22172217+ else begin
22182218+ (* At HTML integration points, characters and start tags (except mglyph/malignmark) use HTML rules *)
22192219+ if is_html_integration_point node then begin
22202220+ match token with
22212221+ | Token.Character _ -> false
22222222+ | Token.Tag { kind = Token.Start; _ } -> false
22232223+ | _ -> true
22242224+ end
22252225+ (* At MathML text integration points, characters and start tags (except mglyph/malignmark) use HTML rules *)
22262226+ else if is_mathml_text_integration_point node then begin
22272227+ match token with
22282228+ | Token.Character _ -> false
22292229+ | Token.Tag { kind = Token.Start; name; _ } ->
22302230+ name = "mglyph" || name = "malignmark"
22312231+ | _ -> true
22322232+ end
22332233+ (* Special case: <svg> inside annotation-xml uses HTML rules (creates svg in svg namespace) *)
22342234+ else if node.Dom.namespace = Some "mathml" && node.Dom.name = "annotation-xml" then begin
22352235+ match token with
22362236+ | Token.Tag { kind = Token.Start; name; _ } when String.lowercase_ascii name = "svg" -> false
22372237+ | _ -> true
22382238+ end
22392239+ (* Not at integration point - use foreign content rules *)
22402240+ (* Breakout handling is done inside process_foreign_content *)
22412241+ else true
22422242+ end
22432243+ in
22442244+22452245+ (* Check if at HTML integration point for special table mode handling *)
22462246+ let at_integration_point =
22472247+ match adjusted_current_node t with
22482248+ | Some node ->
22492249+ is_html_integration_point node || is_mathml_text_integration_point node
22502250+ | None -> false
22512251+ in
22522252+22532253+ if in_foreign then
22542254+ process_foreign_content t token
22552255+ else if at_integration_point then begin
22562256+ (* At integration points, check if in table mode without table in scope *)
22572257+ let is_table_mode = List.mem t.mode [In_table; In_table_body; In_row; In_cell; In_caption; In_column_group] in
22582258+ let has_table = has_element_in_table_scope t "table" in
22592259+ if is_table_mode && not has_table then begin
22602260+ match token with
22612261+ | Token.Tag { kind = Token.Start; _ } ->
22622262+ (* Temporarily use IN_BODY for start tags in table mode without table *)
22632263+ let saved_mode = t.mode in
22642264+ t.mode <- In_body;
22652265+ process_by_mode t token;
22662266+ if t.mode = In_body then t.mode <- saved_mode
22672267+ | _ -> process_by_mode t token
22682268+ end else
22692269+ process_by_mode t token
22702270+ end else
22712271+ process_by_mode t token
22722272+22732273+(* Pop foreign elements until HTML or integration point *)
22742274+and pop_until_html_or_integration_point t =
22752275+ let is_html_integration_point node =
22762276+ (* SVG foreignObject, desc, and title are always HTML integration points *)
22772277+ if node.Dom.namespace = Some "svg" &&
22782278+ List.mem node.Dom.name Constants.svg_html_integration then true
22792279+ (* annotation-xml is an HTML integration point only with specific encoding values *)
22802280+ else if node.Dom.namespace = Some "mathml" && node.Dom.name = "annotation-xml" then
22812281+ match List.assoc_opt "encoding" node.Dom.attrs with
22822282+ | Some enc ->
22832283+ let enc_lower = String.lowercase_ascii enc in
22842284+ enc_lower = "text/html" || enc_lower = "application/xhtml+xml"
22852285+ | None -> false
22862286+ else false
22872287+ in
22882288+ (* Get fragment context element - only for foreign namespace fragment contexts *)
22892289+ let fragment_context_elem = t.fragment_context_element in
22902290+ let rec pop () =
22912291+ match current_node t with
22922292+ | None -> ()
22932293+ | Some node ->
22942294+ if is_in_html_namespace node then ()
22952295+ else if is_html_integration_point node then ()
22962296+ (* Don't pop past fragment context element *)
22972297+ else (match fragment_context_elem with
22982298+ | Some ctx when node == ctx -> ()
22992299+ | _ ->
23002300+ pop_current t;
23012301+ pop ())
23022302+ in
23032303+ pop ()
23042304+23052305+(* Foreign breakout elements - these break out of foreign content *)
23062306+and is_foreign_breakout_element name =
23072307+ List.mem (String.lowercase_ascii name)
23082308+ ["b"; "big"; "blockquote"; "body"; "br"; "center"; "code"; "dd"; "div"; "dl"; "dt";
23092309+ "em"; "embed"; "h1"; "h2"; "h3"; "h4"; "h5"; "h6"; "head"; "hr"; "i"; "img"; "li";
23102310+ "listing"; "menu"; "meta"; "nobr"; "ol"; "p"; "pre"; "ruby"; "s"; "small"; "span";
23112311+ "strong"; "strike"; "sub"; "sup"; "table"; "tt"; "u"; "ul"; "var"]
23122312+23132313+and process_foreign_content t token =
23142314+ match token with
23152315+ | Token.Character "\x00" ->
23162316+ parse_error t "unexpected-null-character";
23172317+ insert_character t "\xEF\xBF\xBD"
23182318+ | Token.Character data when is_whitespace data ->
23192319+ insert_character t data
23202320+ | Token.Character data ->
23212321+ insert_character t data;
23222322+ t.frameset_ok <- false
23232323+ | Token.Comment data ->
23242324+ insert_comment t data
23252325+ | Token.Doctype _ ->
23262326+ parse_error t "unexpected-doctype"
23272327+ | Token.Tag { kind = Token.Start; name; _ } when is_foreign_breakout_element name ->
23282328+ (* Breakout from foreign content - pop until HTML or integration point, reprocess in HTML mode *)
23292329+ parse_error t "unexpected-html-element-in-foreign-content";
23302330+ pop_until_html_or_integration_point t;
23312331+ reset_insertion_mode t;
23322332+ (* Use process_by_mode to force HTML mode processing and avoid infinite loop *)
23332333+ process_by_mode t token
23342334+ | Token.Tag { kind = Token.Start; name = "font"; attrs; _ }
23352335+ when List.exists (fun (n, _) ->
23362336+ let n = String.lowercase_ascii n in
23372337+ n = "color" || n = "face" || n = "size") attrs ->
23382338+ (* font with color/face/size breaks out of foreign content *)
23392339+ parse_error t "unexpected-html-element-in-foreign-content";
23402340+ pop_until_html_or_integration_point t;
23412341+ reset_insertion_mode t;
23422342+ process_by_mode t token
23432343+ | Token.Tag { kind = Token.Start; name; attrs; self_closing } ->
23442344+ let name =
23452345+ match adjusted_current_node t with
23462346+ | Some n when n.Dom.namespace = Some "svg" -> Constants.adjust_svg_tag_name name
23472347+ | _ -> name
23482348+ in
23492349+ let attrs =
23502350+ match adjusted_current_node t with
23512351+ | Some n when n.Dom.namespace = Some "svg" ->
23522352+ Constants.adjust_svg_attrs (Constants.adjust_foreign_attrs attrs)
23532353+ | Some n when n.Dom.namespace = Some "mathml" ->
23542354+ Constants.adjust_mathml_attrs (Constants.adjust_foreign_attrs attrs)
23552355+ | _ -> Constants.adjust_foreign_attrs attrs
23562356+ in
23572357+ let namespace =
23582358+ match adjusted_current_node t with
23592359+ | Some n -> n.Dom.namespace
23602360+ | None -> None
23612361+ in
23622362+ let node = insert_element t name ~namespace attrs in
23632363+ t.open_elements <- node :: t.open_elements;
23642364+ if self_closing then pop_current t
23652365+ | Token.Tag { kind = Token.End; name; _ } when List.mem (String.lowercase_ascii name) ["br"; "p"] ->
23662366+ (* Special case: </br> and </p> end tags trigger breakout from foreign content *)
23672367+ parse_error t "unexpected-html-element-in-foreign-content";
23682368+ pop_until_html_or_integration_point t;
23692369+ reset_insertion_mode t;
23702370+ (* Use process_by_mode to force HTML mode processing and avoid infinite loop *)
23712371+ process_by_mode t token
23722372+ | Token.Tag { kind = Token.End; name; _ } ->
23732373+ (* Find matching element per WHATWG spec for foreign content *)
23742374+ let is_fragment_context n =
23752375+ match t.fragment_context_element with
23762376+ | Some ctx -> n == ctx
23772377+ | None -> false
23782378+ in
23792379+ let name_lower = String.lowercase_ascii name in
23802380+ (* Walk through stack looking for matching element *)
23812381+ let rec find_and_process first_node idx = function
23822382+ | [] -> () (* Stack exhausted - ignore tag *)
23832383+ | n :: rest ->
23842384+ let node_name_lower = String.lowercase_ascii n.Dom.name in
23852385+ let is_html = is_in_html_namespace n in
23862386+ let name_matches = node_name_lower = name_lower in
23872387+23882388+ (* If first node doesn't match tag name, it's a parse error *)
23892389+ if first_node && not name_matches then
23902390+ parse_error t "unexpected-end-tag-in-foreign-content";
23912391+23922392+ (* Check if this node matches the end tag *)
23932393+ if name_matches then begin
23942394+ (* Fragment context check *)
23952395+ if is_fragment_context n then
23962396+ parse_error t "unexpected-end-tag-in-fragment-context"
23972397+ (* If matched element is in HTML namespace, reprocess via HTML mode *)
23982398+ else if is_html then
23992399+ process_by_mode t token
24002400+ (* Otherwise it's a foreign element - pop everything from this point up *)
24012401+ else begin
24022402+ (* Pop all elements from current down to and including the matched element *)
24032403+ let rec pop_to_idx current_idx =
24042404+ if current_idx >= idx then begin
24052405+ pop_current t;
24062406+ pop_to_idx (current_idx - 1)
24072407+ end
24082408+ in
24092409+ pop_to_idx (List.length t.open_elements - 1)
24102410+ end
24112411+ end
24122412+ (* If we hit an HTML element that doesn't match, process via HTML mode *)
24132413+ else if is_html then
24142414+ process_by_mode t token
24152415+ (* Continue searching in the stack *)
24162416+ else
24172417+ find_and_process false (idx - 1) rest
24182418+ in
24192419+ find_and_process true (List.length t.open_elements - 1) t.open_elements
24202420+ | Token.EOF ->
24212421+ process_by_mode t token
24222422+24232423+and process_by_mode t token =
24242424+ match t.mode with
24252425+ | Insertion_mode.Initial -> process_initial t token
24262426+ | Insertion_mode.Before_html -> process_before_html t token
24272427+ | Insertion_mode.Before_head -> process_before_head t token
24282428+ | Insertion_mode.In_head -> process_in_head t token
24292429+ | Insertion_mode.In_head_noscript -> process_in_head_noscript t token
24302430+ | Insertion_mode.After_head -> process_after_head t token
24312431+ | Insertion_mode.In_body -> process_in_body t token
24322432+ | Insertion_mode.Text -> process_text t token
24332433+ | Insertion_mode.In_table -> process_in_table t token
24342434+ | Insertion_mode.In_table_text -> process_in_table_text t token
24352435+ | Insertion_mode.In_caption -> process_in_caption t token
24362436+ | Insertion_mode.In_column_group -> process_in_column_group t token
24372437+ | Insertion_mode.In_table_body -> process_in_table_body t token
24382438+ | Insertion_mode.In_row -> process_in_row t token
24392439+ | Insertion_mode.In_cell -> process_in_cell t token
24402440+ | Insertion_mode.In_select -> process_in_select t token
24412441+ | Insertion_mode.In_select_in_table -> process_in_select_in_table t token
24422442+ | Insertion_mode.In_template -> process_in_template t token
24432443+ | Insertion_mode.After_body -> process_after_body t token
24442444+ | Insertion_mode.In_frameset -> process_in_frameset t token
24452445+ | Insertion_mode.After_frameset -> process_after_frameset t token
24462446+ | Insertion_mode.After_after_body -> process_after_after_body t token
24472447+ | Insertion_mode.After_after_frameset -> process_after_after_frameset t token
24482448+24492449+(* Populate selectedcontent elements with content from selected option *)
24502450+let find_elements name node =
24512451+ let result = ref [] in
24522452+ let rec find n =
24532453+ if n.Dom.name = name then result := n :: !result;
24542454+ List.iter find n.Dom.children
24552455+ in
24562456+ find node;
24572457+ List.rev !result (* Reverse to maintain document order *)
24582458+24592459+let find_element name node =
24602460+ let rec find n =
24612461+ if n.Dom.name = name then Some n
24622462+ else
24632463+ List.find_map find n.Dom.children
24642464+ in
24652465+ find node
24662466+24672467+let populate_selectedcontent document =
24682468+ let selects = find_elements "select" document in
24692469+ List.iter (fun select ->
24702470+ match find_element "selectedcontent" select with
24712471+ | None -> ()
24722472+ | Some selectedcontent ->
24732473+ let options = find_elements "option" select in
24742474+ if options <> [] then begin
24752475+ (* Find selected option or use first *)
24762476+ let selected_option =
24772477+ match List.find_opt (fun opt -> Dom.has_attr opt "selected") options with
24782478+ | Some opt -> opt
24792479+ | None -> List.hd options
24802480+ in
24812481+ (* Clone children from selected option to selectedcontent *)
24822482+ List.iter (fun child ->
24832483+ let cloned = Dom.clone ~deep:true child in
24842484+ Dom.append_child selectedcontent cloned
24852485+ ) selected_option.Dom.children
24862486+ end
24872487+ ) selects
24882488+24892489+let finish t =
24902490+ (* Populate selectedcontent elements *)
24912491+ populate_selectedcontent t.document;
24922492+ (* For fragment parsing, remove the html wrapper and promote children *)
24932493+ if t.fragment_context <> None then begin
24942494+ match t.document.Dom.children with
24952495+ | [root] when root.Dom.name = "html" ->
24962496+ (* Move context element's children to root if applicable *)
24972497+ (match t.fragment_context_element with
24982498+ | Some ctx_elem ->
24992499+ (match ctx_elem.Dom.parent with
25002500+ | Some p when p == root ->
25012501+ let ctx_children = ctx_elem.Dom.children in
25022502+ List.iter (fun child ->
25032503+ Dom.remove_child ctx_elem child;
25042504+ Dom.append_child root child
25052505+ ) ctx_children;
25062506+ Dom.remove_child root ctx_elem
25072507+ | _ -> ())
25082508+ | None -> ());
25092509+ (* Promote root's children to document - preserve order *)
25102510+ let children_copy = root.Dom.children in
25112511+ List.iter (fun child ->
25122512+ Dom.remove_child root child;
25132513+ Dom.append_child t.document child
25142514+ ) children_copy;
25152515+ Dom.remove_child t.document root
25162516+ | _ -> ()
25172517+ end;
25182518+ t.document
25192519+25202520+let get_errors t = List.rev t.errors
···11+(* CSS selector lexer *)
22+33+exception Selector_error of string
44+55+type t = {
66+ input : string;
77+ len : int;
88+ mutable pos : int;
99+}
1010+1111+let create input = { input; len = String.length input; pos = 0 }
1212+1313+let peek t =
1414+ if t.pos < t.len then Some t.input.[t.pos]
1515+ else None
1616+1717+let advance t =
1818+ if t.pos < t.len then t.pos <- t.pos + 1
1919+2020+let consume t =
2121+ let c = peek t in
2222+ advance t;
2323+ c
2424+2525+let is_whitespace c = c = ' ' || c = '\t' || c = '\n' || c = '\r' || c = '\x0C'
2626+2727+let is_name_start c =
2828+ (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c = '_' || c = '-' || Char.code c > 127
2929+3030+let is_name_char c =
3131+ is_name_start c || (c >= '0' && c <= '9')
3232+3333+let skip_whitespace t =
3434+ while t.pos < t.len && is_whitespace t.input.[t.pos] do
3535+ advance t
3636+ done
3737+3838+let read_name t =
3939+ let start = t.pos in
4040+ while t.pos < t.len && is_name_char t.input.[t.pos] do
4141+ advance t
4242+ done;
4343+ String.sub t.input start (t.pos - start)
4444+4545+let read_string t quote =
4646+ advance t; (* Skip opening quote *)
4747+ let buf = Buffer.create 32 in
4848+ let rec loop () =
4949+ match peek t with
5050+ | None -> raise (Selector_error "Unterminated string")
5151+ | Some c when c = quote -> advance t
5252+ | Some '\\' ->
5353+ advance t;
5454+ (match peek t with
5555+ | Some c -> Buffer.add_char buf c; advance t; loop ()
5656+ | None -> raise (Selector_error "Unterminated escape"))
5757+ | Some c ->
5858+ Buffer.add_char buf c;
5959+ advance t;
6060+ loop ()
6161+ in
6262+ loop ();
6363+ Buffer.contents buf
6464+6565+let read_unquoted_attr_value t =
6666+ let start = t.pos in
6767+ while t.pos < t.len &&
6868+ let c = t.input.[t.pos] in
6969+ not (is_whitespace c) && c <> ']' do
7070+ advance t
7171+ done;
7272+ String.sub t.input start (t.pos - start)
7373+7474+let tokenize input =
7575+ let t = create input in
7676+ let tokens = ref [] in
7777+ let pending_ws = ref false in
7878+7979+ while t.pos < t.len do
8080+ let c = t.input.[t.pos] in
8181+8282+ if is_whitespace c then begin
8383+ pending_ws := true;
8484+ skip_whitespace t
8585+ end else if c = '>' || c = '+' || c = '~' then begin
8686+ pending_ws := false;
8787+ advance t;
8888+ skip_whitespace t;
8989+ tokens := Selector_token.Combinator (String.make 1 c) :: !tokens
9090+ end else begin
9191+ if !pending_ws && !tokens <> [] && c <> ',' then
9292+ tokens := Selector_token.Combinator " " :: !tokens;
9393+ pending_ws := false;
9494+9595+ match c with
9696+ | '*' ->
9797+ advance t;
9898+ tokens := Selector_token.Universal :: !tokens
9999+ | '#' ->
100100+ advance t;
101101+ let name = read_name t in
102102+ if name = "" then raise (Selector_error "Expected identifier after #");
103103+ tokens := Selector_token.Id name :: !tokens
104104+ | '.' ->
105105+ advance t;
106106+ let name = read_name t in
107107+ if name = "" then raise (Selector_error "Expected identifier after .");
108108+ tokens := Selector_token.Class name :: !tokens
109109+ | '[' ->
110110+ advance t;
111111+ tokens := Selector_token.Attr_start :: !tokens;
112112+ skip_whitespace t;
113113+ let attr_name = read_name t in
114114+ if attr_name = "" then raise (Selector_error "Expected attribute name");
115115+ tokens := Selector_token.Tag attr_name :: !tokens;
116116+ skip_whitespace t;
117117+118118+ (match peek t with
119119+ | Some ']' ->
120120+ advance t;
121121+ tokens := Selector_token.Attr_end :: !tokens
122122+ | Some '=' ->
123123+ advance t;
124124+ tokens := Selector_token.Attr_op "=" :: !tokens;
125125+ skip_whitespace t;
126126+ let value = match peek t with
127127+ | Some '"' -> read_string t '"'
128128+ | Some '\'' -> read_string t '\''
129129+ | _ -> read_unquoted_attr_value t
130130+ in
131131+ tokens := Selector_token.String value :: !tokens;
132132+ skip_whitespace t;
133133+ if peek t <> Some ']' then raise (Selector_error "Expected ]");
134134+ advance t;
135135+ tokens := Selector_token.Attr_end :: !tokens
136136+ | Some ('~' | '|' | '^' | '$' | '*') as op_char ->
137137+ let op_c = Option.get op_char in
138138+ advance t;
139139+ if peek t <> Some '=' then
140140+ raise (Selector_error ("Expected = after " ^ String.make 1 op_c));
141141+ advance t;
142142+ tokens := Selector_token.Attr_op (String.make 1 op_c ^ "=") :: !tokens;
143143+ skip_whitespace t;
144144+ let value = match peek t with
145145+ | Some '"' -> read_string t '"'
146146+ | Some '\'' -> read_string t '\''
147147+ | _ -> read_unquoted_attr_value t
148148+ in
149149+ tokens := Selector_token.String value :: !tokens;
150150+ skip_whitespace t;
151151+ if peek t <> Some ']' then raise (Selector_error "Expected ]");
152152+ advance t;
153153+ tokens := Selector_token.Attr_end :: !tokens
154154+ | _ -> raise (Selector_error "Unexpected character in attribute selector"))
155155+156156+ | ',' ->
157157+ advance t;
158158+ skip_whitespace t;
159159+ tokens := Selector_token.Comma :: !tokens
160160+ | ':' ->
161161+ advance t;
162162+ tokens := Selector_token.Colon :: !tokens;
163163+ let name = read_name t in
164164+ if name = "" then raise (Selector_error "Expected pseudo-class name");
165165+ tokens := Selector_token.Tag name :: !tokens;
166166+167167+ if peek t = Some '(' then begin
168168+ advance t;
169169+ tokens := Selector_token.Paren_open :: !tokens;
170170+ skip_whitespace t;
171171+ (* Read argument until closing paren *)
172172+ let depth = ref 1 in
173173+ let start = t.pos in
174174+ while !depth > 0 && t.pos < t.len do
175175+ match t.input.[t.pos] with
176176+ | '(' -> incr depth; advance t
177177+ | ')' -> decr depth; if !depth > 0 then advance t
178178+ | _ -> advance t
179179+ done;
180180+ let arg = String.trim (String.sub t.input start (t.pos - start)) in
181181+ if arg <> "" then tokens := Selector_token.String arg :: !tokens;
182182+ if peek t <> Some ')' then raise (Selector_error "Expected )");
183183+ advance t;
184184+ tokens := Selector_token.Paren_close :: !tokens
185185+ end
186186+ | _ when is_name_start c ->
187187+ let name = read_name t in
188188+ tokens := Selector_token.Tag (String.lowercase_ascii name) :: !tokens
189189+ | _ ->
190190+ raise (Selector_error ("Unexpected character: " ^ String.make 1 c))
191191+ end
192192+ done;
193193+194194+ tokens := Selector_token.EOF :: !tokens;
195195+ List.rev !tokens
+308
lib/selector/selector_match.ml
···11+(* CSS selector matching *)
22+33+module Dom = Html5rw_dom
44+open Selector_ast
55+66+let is_element node =
77+ let name = node.Dom.name in
88+ name <> "#text" && name <> "#comment" && name <> "#document" &&
99+ name <> "#document-fragment" && name <> "!doctype"
1010+1111+let get_element_children node =
1212+ List.filter is_element node.Dom.children
1313+1414+let get_previous_sibling node =
1515+ match node.Dom.parent with
1616+ | None -> None
1717+ | Some parent ->
1818+ let rec find_prev prev = function
1919+ | [] -> None
2020+ | n :: rest ->
2121+ if n == node then prev
2222+ else if is_element n then find_prev (Some n) rest
2323+ else find_prev prev rest
2424+ in
2525+ find_prev None parent.Dom.children
2626+2727+let is_first_child node =
2828+ match node.Dom.parent with
2929+ | None -> false
3030+ | Some parent ->
3131+ match get_element_children parent with
3232+ | first :: _ -> first == node
3333+ | [] -> false
3434+3535+let is_last_child node =
3636+ match node.Dom.parent with
3737+ | None -> false
3838+ | Some parent ->
3939+ match List.rev (get_element_children parent) with
4040+ | last :: _ -> last == node
4141+ | [] -> false
4242+4343+let is_first_of_type node =
4444+ match node.Dom.parent with
4545+ | None -> false
4646+ | Some parent ->
4747+ let name = String.lowercase_ascii node.Dom.name in
4848+ let rec find = function
4949+ | [] -> false
5050+ | n :: _ when String.lowercase_ascii n.Dom.name = name -> n == node
5151+ | _ :: rest -> find rest
5252+ in
5353+ find (get_element_children parent)
5454+5555+let is_last_of_type node =
5656+ match node.Dom.parent with
5757+ | None -> false
5858+ | Some parent ->
5959+ let name = String.lowercase_ascii node.Dom.name in
6060+ let rec find last = function
6161+ | [] -> (match last with Some l -> l == node | None -> false)
6262+ | n :: rest when String.lowercase_ascii n.Dom.name = name -> find (Some n) rest
6363+ | _ :: rest -> find last rest
6464+ in
6565+ find None (get_element_children parent)
6666+6767+let get_index node =
6868+ match node.Dom.parent with
6969+ | None -> 0
7070+ | Some parent ->
7171+ let children = get_element_children parent in
7272+ let rec find idx = function
7373+ | [] -> 0
7474+ | n :: _ when n == node -> idx
7575+ | _ :: rest -> find (idx + 1) rest
7676+ in
7777+ find 1 children
7878+7979+let get_type_index node =
8080+ match node.Dom.parent with
8181+ | None -> 0
8282+ | Some parent ->
8383+ let name = String.lowercase_ascii node.Dom.name in
8484+ let children = get_element_children parent in
8585+ let rec find idx = function
8686+ | [] -> 0
8787+ | n :: _ when n == node -> idx
8888+ | n :: rest when String.lowercase_ascii n.Dom.name = name -> find (idx + 1) rest
8989+ | _ :: rest -> find idx rest
9090+ in
9191+ find 1 children
9292+9393+(* Parse nth expression: "odd", "even", "3", "2n+1", etc *)
9494+let parse_nth expr =
9595+ let expr = String.lowercase_ascii (String.trim expr) in
9696+ if expr = "odd" then Some (2, 1)
9797+ else if expr = "even" then Some (2, 0)
9898+ else
9999+ let expr = String.concat "" (String.split_on_char ' ' expr) in
100100+ if String.contains expr 'n' then
101101+ let parts = String.split_on_char 'n' expr in
102102+ match parts with
103103+ | [a_part; b_part] ->
104104+ let a =
105105+ if a_part = "" || a_part = "+" then 1
106106+ else if a_part = "-" then -1
107107+ else int_of_string_opt a_part |> Option.value ~default:0
108108+ in
109109+ let b =
110110+ if b_part = "" then 0
111111+ else int_of_string_opt b_part |> Option.value ~default:0
112112+ in
113113+ Some (a, b)
114114+ | _ -> None
115115+ else
116116+ match int_of_string_opt expr with
117117+ | Some n -> Some (0, n)
118118+ | None -> None
119119+120120+let matches_nth index a b =
121121+ if a = 0 then index = b
122122+ else
123123+ let diff = index - b in
124124+ if a > 0 then diff >= 0 && diff mod a = 0
125125+ else diff <= 0 && diff mod a = 0
126126+127127+let rec matches_simple node selector =
128128+ if not (is_element node) then false
129129+ else
130130+ match selector.selector_type with
131131+ | Type_universal -> true
132132+ | Type_tag ->
133133+ (match selector.name with
134134+ | Some name -> String.lowercase_ascii node.Dom.name = String.lowercase_ascii name
135135+ | None -> false)
136136+ | Type_id ->
137137+ (match selector.name with
138138+ | Some id ->
139139+ (match Dom.get_attr node "id" with
140140+ | Some node_id -> node_id = id
141141+ | None -> false)
142142+ | None -> false)
143143+ | Type_class ->
144144+ (match selector.name with
145145+ | Some cls ->
146146+ (match Dom.get_attr node "class" with
147147+ | Some class_attr ->
148148+ let classes = String.split_on_char ' ' class_attr in
149149+ List.mem cls classes
150150+ | None -> false)
151151+ | None -> false)
152152+ | Type_attr ->
153153+ (match selector.name with
154154+ | Some attr_name ->
155155+ let attr_name_lower = String.lowercase_ascii attr_name in
156156+ let node_value =
157157+ List.find_map (fun (k, v) ->
158158+ if String.lowercase_ascii k = attr_name_lower then Some v
159159+ else None
160160+ ) node.Dom.attrs
161161+ in
162162+ (match node_value with
163163+ | None -> false
164164+ | Some _ when selector.operator = None -> true
165165+ | Some attr_value ->
166166+ let value = Option.value selector.value ~default:"" in
167167+ (match selector.operator with
168168+ | Some "=" -> attr_value = value
169169+ | Some "~=" ->
170170+ let words = String.split_on_char ' ' attr_value in
171171+ List.mem value words
172172+ | Some "|=" ->
173173+ attr_value = value || String.length attr_value > String.length value &&
174174+ String.sub attr_value 0 (String.length value) = value &&
175175+ attr_value.[String.length value] = '-'
176176+ | Some "^=" -> value <> "" && String.length attr_value >= String.length value &&
177177+ String.sub attr_value 0 (String.length value) = value
178178+ | Some "$=" -> value <> "" && String.length attr_value >= String.length value &&
179179+ String.sub attr_value (String.length attr_value - String.length value) (String.length value) = value
180180+ | Some "*=" -> value <> "" && Re.execp (Re.compile (Re.str value)) attr_value
181181+ | Some _ | None -> false))
182182+ | None -> false)
183183+ | Type_pseudo ->
184184+ (match selector.name with
185185+ | Some "first-child" -> is_first_child node
186186+ | Some "last-child" -> is_last_child node
187187+ | Some "first-of-type" -> is_first_of_type node
188188+ | Some "last-of-type" -> is_last_of_type node
189189+ | Some "only-child" -> is_first_child node && is_last_child node
190190+ | Some "only-of-type" -> is_first_of_type node && is_last_of_type node
191191+ | Some "empty" ->
192192+ not (List.exists (fun c ->
193193+ is_element c || (c.Dom.name = "#text" && String.trim c.Dom.data <> "")
194194+ ) node.Dom.children)
195195+ | Some "root" ->
196196+ (match node.Dom.parent with
197197+ | Some p -> p.Dom.name = "#document" || p.Dom.name = "#document-fragment"
198198+ | None -> false)
199199+ | Some "nth-child" ->
200200+ (match selector.arg with
201201+ | Some arg ->
202202+ (match parse_nth arg with
203203+ | Some (a, b) -> matches_nth (get_index node) a b
204204+ | None -> false)
205205+ | None -> false)
206206+ | Some "nth-of-type" ->
207207+ (match selector.arg with
208208+ | Some arg ->
209209+ (match parse_nth arg with
210210+ | Some (a, b) -> matches_nth (get_type_index node) a b
211211+ | None -> false)
212212+ | None -> false)
213213+ | Some "not" ->
214214+ (match selector.arg with
215215+ | Some arg ->
216216+ (try
217217+ let inner = Selector_parser.parse_selector arg in
218218+ not (matches_selector node inner)
219219+ with _ -> true)
220220+ | None -> true)
221221+ | _ -> false)
222222+223223+and matches_compound node (compound : Selector_ast.compound_selector) =
224224+ List.for_all (matches_simple node) compound.selectors
225225+226226+and matches_complex node complex =
227227+ (* Match from right to left *)
228228+ let parts = List.rev complex.parts in
229229+ match parts with
230230+ | [] -> false
231231+ | (_, rightmost) :: rest ->
232232+ if not (matches_compound node rightmost) then false
233233+ else
234234+ let rec check current remaining =
235235+ match remaining with
236236+ | [] -> true
237237+ | (Some " ", compound) :: rest ->
238238+ (* Descendant combinator *)
239239+ let rec find_ancestor n =
240240+ match n.Dom.parent with
241241+ | None -> false
242242+ | Some p ->
243243+ if matches_compound p compound then check p rest
244244+ else find_ancestor p
245245+ in
246246+ find_ancestor current
247247+ | (Some ">", compound) :: rest ->
248248+ (* Child combinator *)
249249+ (match current.Dom.parent with
250250+ | None -> false
251251+ | Some p ->
252252+ if matches_compound p compound then check p rest
253253+ else false)
254254+ | (Some "+", compound) :: rest ->
255255+ (* Adjacent sibling *)
256256+ (match get_previous_sibling current with
257257+ | None -> false
258258+ | Some sib ->
259259+ if matches_compound sib compound then check sib rest
260260+ else false)
261261+ | (Some "~", compound) :: rest ->
262262+ (* General sibling *)
263263+ let rec find_sibling n =
264264+ match get_previous_sibling n with
265265+ | None -> false
266266+ | Some sib ->
267267+ if matches_compound sib compound then check sib rest
268268+ else find_sibling sib
269269+ in
270270+ find_sibling current
271271+ | (None, compound) :: rest ->
272272+ if matches_compound current compound then check current rest
273273+ else false
274274+ | _ -> false
275275+ in
276276+ check node rest
277277+278278+and matches_selector node selector =
279279+ match selector with
280280+ | Simple s -> matches_simple node s
281281+ | Compound c -> matches_compound node c
282282+ | Complex c -> matches_complex node c
283283+ | List l -> List.exists (fun c -> matches_complex node c) l.selectors
284284+285285+let matches node selector_string =
286286+ try
287287+ let selector = Selector_parser.parse_selector selector_string in
288288+ matches_selector node selector
289289+ with _ -> false
290290+291291+let rec query_descendants node selector results =
292292+ List.iter (fun child ->
293293+ if is_element child && matches_selector child selector then
294294+ results := child :: !results;
295295+ query_descendants child selector results;
296296+ (* Also search template content *)
297297+ (match child.Dom.template_content with
298298+ | Some tc -> query_descendants tc selector results
299299+ | None -> ())
300300+ ) node.Dom.children
301301+302302+let query root selector_string =
303303+ try
304304+ let selector = Selector_parser.parse_selector selector_string in
305305+ let results = ref [] in
306306+ query_descendants root selector results;
307307+ List.rev !results
308308+ with _ -> []
+149
lib/selector/selector_parser.ml
···11+(* CSS selector parser *)
22+33+open Selector_ast
44+open Selector_token
55+66+exception Parse_error of string
77+88+type t = {
99+ tokens : Selector_token.t list;
1010+ mutable pos : int;
1111+}
1212+1313+let create tokens = { tokens; pos = 0 }
1414+1515+let peek t =
1616+ if t.pos < List.length t.tokens then
1717+ List.nth t.tokens t.pos
1818+ else EOF
1919+2020+let advance t =
2121+ if t.pos < List.length t.tokens then
2222+ t.pos <- t.pos + 1
2323+2424+let consume t =
2525+ let tok = peek t in
2626+ advance t;
2727+ tok
2828+2929+let expect t expected =
3030+ let tok = peek t in
3131+ if tok <> expected then
3232+ raise (Parse_error ("Expected " ^ (match expected with EOF -> "EOF" | _ -> "token")))
3333+ else
3434+ advance t
3535+3636+let parse_simple_selector t =
3737+ match peek t with
3838+ | Tag name ->
3939+ advance t;
4040+ Some (make_simple Type_tag ~name ())
4141+ | Universal ->
4242+ advance t;
4343+ Some (make_simple Type_universal ())
4444+ | Id name ->
4545+ advance t;
4646+ Some (make_simple Type_id ~name ())
4747+ | Class name ->
4848+ advance t;
4949+ Some (make_simple Type_class ~name ())
5050+ | Attr_start ->
5151+ advance t;
5252+ let attr_name = match peek t with
5353+ | Tag name -> advance t; name
5454+ | _ -> raise (Parse_error "Expected attribute name")
5555+ in
5656+ (match peek t with
5757+ | Attr_end ->
5858+ advance t;
5959+ Some (make_simple Type_attr ~name:attr_name ())
6060+ | Attr_op op ->
6161+ advance t;
6262+ let value = match peek t with
6363+ | String v -> advance t; v
6464+ | _ -> raise (Parse_error "Expected attribute value")
6565+ in
6666+ (match peek t with
6767+ | Attr_end -> advance t
6868+ | _ -> raise (Parse_error "Expected ]"));
6969+ Some (make_simple Type_attr ~name:attr_name ~operator:op ~value ())
7070+ | _ -> raise (Parse_error "Expected ] or attribute operator"))
7171+ | Colon ->
7272+ advance t;
7373+ let name = match peek t with
7474+ | Tag n -> advance t; n
7575+ | _ -> raise (Parse_error "Expected pseudo-class name")
7676+ in
7777+ let arg = match peek t with
7878+ | Paren_open ->
7979+ advance t;
8080+ let a = match peek t with
8181+ | String s -> advance t; Some s
8282+ | Paren_close -> None
8383+ | _ -> None
8484+ in
8585+ (match peek t with
8686+ | Paren_close -> advance t
8787+ | _ -> raise (Parse_error "Expected )"));
8888+ a
8989+ | _ -> None
9090+ in
9191+ Some (make_simple Type_pseudo ~name ?arg ())
9292+ | _ -> None
9393+9494+let parse_compound_selector t =
9595+ let rec loop acc =
9696+ match parse_simple_selector t with
9797+ | Some s -> loop (s :: acc)
9898+ | None -> acc
9999+ in
100100+ let selectors = List.rev (loop []) in
101101+ if selectors = [] then None
102102+ else Some (make_compound selectors)
103103+104104+let parse_complex_selector t =
105105+ match parse_compound_selector t with
106106+ | None -> None
107107+ | Some first ->
108108+ let parts = ref [(None, first)] in
109109+ let rec loop () =
110110+ match peek t with
111111+ | Combinator comb ->
112112+ advance t;
113113+ (match parse_compound_selector t with
114114+ | None -> raise (Parse_error "Expected selector after combinator")
115115+ | Some compound ->
116116+ parts := (Some comb, compound) :: !parts;
117117+ loop ())
118118+ | _ -> ()
119119+ in
120120+ loop ();
121121+ Some (make_complex (List.rev !parts))
122122+123123+let parse tokens =
124124+ let t = create tokens in
125125+ let rec loop acc =
126126+ match parse_complex_selector t with
127127+ | None -> acc
128128+ | Some sel ->
129129+ (match peek t with
130130+ | Comma ->
131131+ advance t;
132132+ loop (sel :: acc)
133133+ | EOF -> sel :: acc
134134+ | _ -> raise (Parse_error "Unexpected token"))
135135+ in
136136+ let selectors = List.rev (loop []) in
137137+ (match peek t with
138138+ | EOF -> ()
139139+ | _ -> raise (Parse_error "Expected end of selector"));
140140+ match selectors with
141141+ | [] -> raise (Parse_error "Empty selector")
142142+ | [sel] -> Complex sel
143143+ | sels -> List (make_list sels)
144144+145145+let parse_selector input =
146146+ if String.trim input = "" then
147147+ raise (Selector_lexer.Selector_error "Empty selector");
148148+ let tokens = Selector_lexer.tokenize input in
149149+ parse tokens
+17
lib/selector/selector_token.ml
···11+(* CSS selector token types *)
22+33+type t =
44+ | Tag of string
55+ | Id of string
66+ | Class of string
77+ | Universal
88+ | Attr_start
99+ | Attr_end
1010+ | Attr_op of string
1111+ | String of string
1212+ | Combinator of string
1313+ | Comma
1414+ | Colon
1515+ | Paren_open
1616+ | Paren_close
1717+ | EOF
···11+(* Input stream for tokenizer with position tracking using bytesrw
22+33+ This implementation is designed to be as streaming as possible:
44+ - Reads slices on-demand from the Bytes.Reader.t
55+ - Only buffers what's needed for lookahead (typically 1-2 chars)
66+ - Avoids string allocations in hot paths like matches_ci
77+*)
88+99+open Bytesrw
1010+1111+type t = {
1212+ reader : Bytes.Reader.t;
1313+ (* Current slice and position within it *)
1414+ mutable current_slice : Bytes.Slice.t;
1515+ mutable slice_pos : int;
1616+ (* Lookahead buffer for reconsume and peek_n - small, typically 0-7 chars *)
1717+ mutable lookahead : char list;
1818+ (* Position tracking *)
1919+ mutable line : int;
2020+ mutable column : int;
2121+ (* Track if we just saw CR (for CR/LF normalization) *)
2222+ mutable last_was_cr : bool;
2323+}
2424+2525+(* Create a stream from a Bytes.Reader.t *)
2626+let create_from_reader reader =
2727+ let slice = Bytes.Reader.read reader in
2828+ {
2929+ reader;
3030+ current_slice = slice;
3131+ slice_pos = 0;
3232+ lookahead = [];
3333+ line = 1;
3434+ column = 0;
3535+ last_was_cr = false;
3636+ }
3737+3838+(* Create a stream from a string - discouraged, prefer create_from_reader *)
3939+let create input =
4040+ create_from_reader (Bytes.Reader.of_string input)
4141+4242+let position t = (t.line, t.column)
4343+4444+(* Read next raw byte from the stream (before CR/LF normalization) *)
4545+let read_raw_char t =
4646+ (* First check lookahead *)
4747+ match t.lookahead with
4848+ | c :: rest ->
4949+ t.lookahead <- rest;
5050+ Some c
5151+ | [] ->
5252+ (* Check if current slice is exhausted *)
5353+ if Bytes.Slice.is_eod t.current_slice then
5454+ None
5555+ else if t.slice_pos >= Bytes.Slice.length t.current_slice then begin
5656+ (* Get next slice *)
5757+ t.current_slice <- Bytes.Reader.read t.reader;
5858+ t.slice_pos <- 0;
5959+ if Bytes.Slice.is_eod t.current_slice then
6060+ None
6161+ else begin
6262+ let c = Bytes.get (Bytes.Slice.bytes t.current_slice)
6363+ (Bytes.Slice.first t.current_slice + t.slice_pos) in
6464+ t.slice_pos <- t.slice_pos + 1;
6565+ Some c
6666+ end
6767+ end else begin
6868+ let c = Bytes.get (Bytes.Slice.bytes t.current_slice)
6969+ (Bytes.Slice.first t.current_slice + t.slice_pos) in
7070+ t.slice_pos <- t.slice_pos + 1;
7171+ Some c
7272+ end
7373+7474+(* Push a char back to lookahead *)
7575+let push_back_char t c =
7676+ t.lookahead <- c :: t.lookahead
7777+7878+(* Read next char with CR/LF normalization *)
7979+let rec read_normalized_char t =
8080+ match read_raw_char t with
8181+ | None ->
8282+ t.last_was_cr <- false;
8383+ None
8484+ | Some '\r' ->
8585+ t.last_was_cr <- true;
8686+ Some '\n' (* CR becomes LF *)
8787+ | Some '\n' when t.last_was_cr ->
8888+ (* Skip LF after CR - it was already converted *)
8989+ t.last_was_cr <- false;
9090+ read_normalized_char t
9191+ | Some c ->
9292+ t.last_was_cr <- false;
9393+ Some c
9494+9595+let is_eof t =
9696+ t.lookahead = [] &&
9797+ (Bytes.Slice.is_eod t.current_slice ||
9898+ (t.slice_pos >= Bytes.Slice.length t.current_slice &&
9999+ (let next = Bytes.Reader.read t.reader in
100100+ t.current_slice <- next;
101101+ t.slice_pos <- 0;
102102+ Bytes.Slice.is_eod next)))
103103+104104+let peek t =
105105+ match read_normalized_char t with
106106+ | None -> None
107107+ | Some c ->
108108+ push_back_char t c;
109109+ (* Undo last_was_cr if we pushed back a CR-converted LF *)
110110+ if c = '\n' then t.last_was_cr <- false;
111111+ Some c
112112+113113+(* Read n characters into a list, returns (chars_read, all_read_successfully) *)
114114+let peek_chars t n =
115115+ let rec collect acc remaining =
116116+ if remaining <= 0 then (List.rev acc, true)
117117+ else match read_normalized_char t with
118118+ | None -> (List.rev acc, false) (* Not enough chars available *)
119119+ | Some c -> collect (c :: acc) (remaining - 1)
120120+ in
121121+ let (chars, success) = collect [] n in
122122+ (* Always push back characters we read, in reverse order *)
123123+ List.iter (push_back_char t) (List.rev chars);
124124+ t.last_was_cr <- false;
125125+ (chars, success)
126126+127127+(* peek_n returns Some string only when exactly n chars are available
128128+ Avoid using this in hot paths - prefer peek_chars + direct comparison *)
129129+let peek_n t n =
130130+ let (chars, success) = peek_chars t n in
131131+ if success then
132132+ Some (String.init n (fun i -> List.nth chars i))
133133+ else
134134+ None
135135+136136+let advance t =
137137+ match read_normalized_char t with
138138+ | None -> ()
139139+ | Some c ->
140140+ (* Update position tracking *)
141141+ if c = '\n' then begin
142142+ t.line <- t.line + 1;
143143+ t.column <- 0
144144+ end else
145145+ t.column <- t.column + 1
146146+147147+let consume t =
148148+ let c = peek t in
149149+ advance t;
150150+ c
151151+152152+let consume_if t pred =
153153+ match peek t with
154154+ | Some c when pred c -> advance t; Some c
155155+ | _ -> None
156156+157157+let consume_while t pred =
158158+ let buf = Buffer.create 16 in
159159+ let rec loop () =
160160+ match peek t with
161161+ | Some c when pred c ->
162162+ Buffer.add_char buf c;
163163+ advance t;
164164+ loop ()
165165+ | _ -> ()
166166+ in
167167+ loop ();
168168+ Buffer.contents buf
169169+170170+(* Case-insensitive match without allocating a string
171171+ Compares directly with the char list from peek_chars *)
172172+let matches_ci t s =
173173+ let slen = String.length s in
174174+ let (chars, success) = peek_chars t slen in
175175+ if not success then false
176176+ else begin
177177+ let rec check chars_remaining i =
178178+ match chars_remaining with
179179+ | [] -> i >= slen (* Matched all *)
180180+ | c :: rest ->
181181+ if i >= slen then true
182182+ else
183183+ let c1 = Char.lowercase_ascii c in
184184+ let c2 = Char.lowercase_ascii (String.unsafe_get s i) in
185185+ if c1 = c2 then check rest (i + 1)
186186+ else false
187187+ in
188188+ check chars 0
189189+ end
190190+191191+let consume_exact_ci t s =
192192+ if matches_ci t s then begin
193193+ for _ = 1 to String.length s do advance t done;
194194+ true
195195+ end else false
196196+197197+let reconsume t =
198198+ (* Move back one position - simplified, doesn't handle CR/LF properly for reconsume *)
199199+ (* This is called after advance, so we just need to push back a placeholder *)
200200+ (* The tokenizer will call peek again which will get the right character *)
201201+ (* Actually, for reconsume we need to track what we last consumed *)
202202+ (* For now, just adjust column *)
203203+ if t.column > 0 then t.column <- t.column - 1
+39
lib/tokenizer/token.ml
···11+(* HTML5 token types *)
22+33+type tag_kind = Start | End
44+55+type doctype = {
66+ name : string option;
77+ public_id : string option;
88+ system_id : string option;
99+ force_quirks : bool;
1010+}
1111+1212+type tag = {
1313+ kind : tag_kind;
1414+ name : string;
1515+ attrs : (string * string) list;
1616+ self_closing : bool;
1717+}
1818+1919+type t =
2020+ | Tag of tag
2121+ | Character of string
2222+ | Comment of string
2323+ | Doctype of doctype
2424+ | EOF
2525+2626+let make_start_tag name attrs self_closing =
2727+ Tag { kind = Start; name; attrs; self_closing }
2828+2929+let make_end_tag name =
3030+ Tag { kind = End; name; attrs = []; self_closing = false }
3131+3232+let make_doctype ?name ?public_id ?system_id ?(force_quirks=false) () =
3333+ Doctype { name; public_id; system_id; force_quirks }
3434+3535+let make_comment data = Comment data
3636+3737+let make_character data = Character data
3838+3939+let eof = EOF
+1842
lib/tokenizer/tokenizer.ml
···11+(* HTML5 Tokenizer - implements WHATWG tokenization algorithm *)
22+33+let is_ascii_alpha c = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
44+let is_ascii_upper c = c >= 'A' && c <= 'Z'
55+let is_ascii_digit c = c >= '0' && c <= '9'
66+let is_ascii_hex c = is_ascii_digit c || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F')
77+let is_ascii_alnum c = is_ascii_alpha c || is_ascii_digit c
88+let is_whitespace c = c = ' ' || c = '\t' || c = '\n' || c = '\x0C' || c = '\r'
99+1010+let ascii_lower c =
1111+ if is_ascii_upper c then Char.chr (Char.code c + 32) else c
1212+1313+(* Token sink interface *)
1414+module type SINK = sig
1515+ type t
1616+ val process : t -> Token.t -> [ `Continue | `SwitchTo of State.t ]
1717+ val adjusted_current_node_in_html_namespace : t -> bool
1818+end
1919+2020+type 'sink t = {
2121+ mutable stream : Stream.t;
2222+ sink : 'sink;
2323+ mutable state : State.t;
2424+ mutable return_state : State.t;
2525+ mutable char_ref_code : int;
2626+ mutable temp_buffer : Buffer.t;
2727+ mutable last_start_tag : string;
2828+ mutable current_tag_name : Buffer.t;
2929+ mutable current_tag_kind : Token.tag_kind;
3030+ mutable current_tag_self_closing : bool;
3131+ mutable current_attr_name : Buffer.t;
3232+ mutable current_attr_value : Buffer.t;
3333+ mutable current_attrs : (string * string) list;
3434+ mutable current_doctype_name : Buffer.t option;
3535+ mutable current_doctype_public : Buffer.t option;
3636+ mutable current_doctype_system : Buffer.t option;
3737+ mutable current_doctype_force_quirks : bool;
3838+ mutable current_comment : Buffer.t;
3939+ mutable pending_chars : Buffer.t;
4040+ mutable errors : Errors.t list;
4141+ collect_errors : bool;
4242+}
4343+4444+let create (type s) (module S : SINK with type t = s) sink ?(collect_errors=false) () = {
4545+ stream = Stream.create "";
4646+ sink;
4747+ state = State.Data;
4848+ return_state = State.Data;
4949+ char_ref_code = 0;
5050+ temp_buffer = Buffer.create 64;
5151+ last_start_tag = "";
5252+ current_tag_name = Buffer.create 32;
5353+ current_tag_kind = Token.Start;
5454+ current_tag_self_closing = false;
5555+ current_attr_name = Buffer.create 32;
5656+ current_attr_value = Buffer.create 64;
5757+ current_attrs = [];
5858+ current_doctype_name = None;
5959+ current_doctype_public = None;
6060+ current_doctype_system = None;
6161+ current_doctype_force_quirks = false;
6262+ current_comment = Buffer.create 64;
6363+ pending_chars = Buffer.create 256;
6464+ errors = [];
6565+ collect_errors;
6666+}
6767+6868+let error t code =
6969+ if t.collect_errors then begin
7070+ let (line, column) = Stream.position t.stream in
7171+ t.errors <- Errors.make ~code ~line ~column :: t.errors
7272+ end
7373+7474+(* emit functions are defined locally inside run *)
7575+7676+let emit_char t c =
7777+ Buffer.add_char t.pending_chars c
7878+7979+let emit_str t s =
8080+ Buffer.add_string t.pending_chars s
8181+8282+let start_new_tag t kind =
8383+ Buffer.clear t.current_tag_name;
8484+ t.current_tag_kind <- kind;
8585+ t.current_tag_self_closing <- false;
8686+ t.current_attrs <- []
8787+8888+let start_new_attribute t =
8989+ (* Save previous attribute if any *)
9090+ let name = Buffer.contents t.current_attr_name in
9191+ if String.length name > 0 then begin
9292+ let value = Buffer.contents t.current_attr_value in
9393+ (* Check for duplicates - only add if not already present *)
9494+ if not (List.exists (fun (n, _) -> n = name) t.current_attrs) then
9595+ t.current_attrs <- (name, value) :: t.current_attrs
9696+ else
9797+ error t "duplicate-attribute"
9898+ end;
9999+ Buffer.clear t.current_attr_name;
100100+ Buffer.clear t.current_attr_value
101101+102102+let finish_attribute t =
103103+ start_new_attribute t
104104+105105+let start_new_doctype t =
106106+ t.current_doctype_name <- None;
107107+ t.current_doctype_public <- None;
108108+ t.current_doctype_system <- None;
109109+ t.current_doctype_force_quirks <- false
110110+111111+(* emit_current_tag, emit_current_doctype, emit_current_comment are defined locally inside run *)
112112+113113+let is_appropriate_end_tag t =
114114+ let name = Buffer.contents t.current_tag_name in
115115+ String.length t.last_start_tag > 0 && name = t.last_start_tag
116116+117117+let flush_code_points_consumed_as_char_ref t =
118118+ let s = Buffer.contents t.temp_buffer in
119119+ match t.return_state with
120120+ | State.Attribute_value_double_quoted
121121+ | State.Attribute_value_single_quoted
122122+ | State.Attribute_value_unquoted ->
123123+ Buffer.add_string t.current_attr_value s
124124+ | _ ->
125125+ emit_str t s
126126+127127+open Bytesrw
128128+129129+(* Main tokenization loop *)
130130+let run (type s) t (module S : SINK with type t = s) (reader : Bytes.Reader.t) =
131131+ t.stream <- Stream.create_from_reader reader;
132132+ t.errors <- [];
133133+134134+ (* Local emit functions with access to S *)
135135+ let emit_pending_chars () =
136136+ if Buffer.length t.pending_chars > 0 then begin
137137+ let data = Buffer.contents t.pending_chars in
138138+ Buffer.clear t.pending_chars;
139139+ ignore (S.process t.sink (Token.Character data))
140140+ end
141141+ in
142142+143143+ let emit token =
144144+ emit_pending_chars ();
145145+ match S.process t.sink token with
146146+ | `Continue -> ()
147147+ | `SwitchTo new_state -> t.state <- new_state
148148+ in
149149+150150+ let emit_current_tag () =
151151+ finish_attribute t;
152152+ let name = Buffer.contents t.current_tag_name in
153153+ let tag = {
154154+ Token.kind = t.current_tag_kind;
155155+ name;
156156+ attrs = List.rev t.current_attrs;
157157+ self_closing = t.current_tag_self_closing;
158158+ } in
159159+ if t.current_tag_kind = Token.Start then
160160+ t.last_start_tag <- name;
161161+ emit (Token.Tag tag)
162162+ in
163163+164164+ let emit_current_doctype () =
165165+ let doctype = {
166166+ Token.name = Option.map Buffer.contents t.current_doctype_name;
167167+ public_id = Option.map Buffer.contents t.current_doctype_public;
168168+ system_id = Option.map Buffer.contents t.current_doctype_system;
169169+ force_quirks = t.current_doctype_force_quirks;
170170+ } in
171171+ emit (Token.Doctype doctype)
172172+ in
173173+174174+ let emit_current_comment () =
175175+ emit (Token.Comment (Buffer.contents t.current_comment))
176176+ in
177177+178178+ let rec process_state () =
179179+ if Stream.is_eof t.stream && t.state <> State.Data then begin
180180+ (* Handle EOF in various states *)
181181+ handle_eof ()
182182+ end else if Stream.is_eof t.stream then begin
183183+ emit_pending_chars ();
184184+ ignore (S.process t.sink Token.EOF)
185185+ end else begin
186186+ step ();
187187+ process_state ()
188188+ end
189189+190190+ and handle_eof () =
191191+ match t.state with
192192+ | State.Data ->
193193+ emit_pending_chars ();
194194+ ignore (S.process t.sink Token.EOF)
195195+ | State.Tag_open ->
196196+ error t "eof-before-tag-name";
197197+ emit_char t '<';
198198+ emit_pending_chars ();
199199+ ignore (S.process t.sink Token.EOF)
200200+ | State.End_tag_open ->
201201+ error t "eof-before-tag-name";
202202+ emit_str t "</";
203203+ emit_pending_chars ();
204204+ ignore (S.process t.sink Token.EOF)
205205+ | State.Tag_name
206206+ | State.Before_attribute_name
207207+ | State.Attribute_name
208208+ | State.After_attribute_name
209209+ | State.Before_attribute_value
210210+ | State.Attribute_value_double_quoted
211211+ | State.Attribute_value_single_quoted
212212+ | State.Attribute_value_unquoted
213213+ | State.After_attribute_value_quoted
214214+ | State.Self_closing_start_tag ->
215215+ error t "eof-in-tag";
216216+ emit_pending_chars ();
217217+ ignore (S.process t.sink Token.EOF)
218218+ | State.Rawtext ->
219219+ emit_pending_chars ();
220220+ ignore (S.process t.sink Token.EOF)
221221+ | State.Rawtext_less_than_sign ->
222222+ emit_char t '<';
223223+ emit_pending_chars ();
224224+ ignore (S.process t.sink Token.EOF)
225225+ | State.Rawtext_end_tag_open ->
226226+ emit_str t "</";
227227+ emit_pending_chars ();
228228+ ignore (S.process t.sink Token.EOF)
229229+ | State.Rawtext_end_tag_name ->
230230+ emit_str t "</";
231231+ emit_str t (Buffer.contents t.temp_buffer);
232232+ emit_pending_chars ();
233233+ ignore (S.process t.sink Token.EOF)
234234+ | State.Rcdata ->
235235+ emit_pending_chars ();
236236+ ignore (S.process t.sink Token.EOF)
237237+ | State.Rcdata_less_than_sign ->
238238+ emit_char t '<';
239239+ emit_pending_chars ();
240240+ ignore (S.process t.sink Token.EOF)
241241+ | State.Rcdata_end_tag_open ->
242242+ emit_str t "</";
243243+ emit_pending_chars ();
244244+ ignore (S.process t.sink Token.EOF)
245245+ | State.Rcdata_end_tag_name ->
246246+ emit_str t "</";
247247+ emit_str t (Buffer.contents t.temp_buffer);
248248+ emit_pending_chars ();
249249+ ignore (S.process t.sink Token.EOF)
250250+ | State.Script_data ->
251251+ emit_pending_chars ();
252252+ ignore (S.process t.sink Token.EOF)
253253+ | State.Script_data_less_than_sign ->
254254+ emit_char t '<';
255255+ emit_pending_chars ();
256256+ ignore (S.process t.sink Token.EOF)
257257+ | State.Script_data_end_tag_open ->
258258+ emit_str t "</";
259259+ emit_pending_chars ();
260260+ ignore (S.process t.sink Token.EOF)
261261+ | State.Script_data_end_tag_name ->
262262+ emit_str t "</";
263263+ emit_str t (Buffer.contents t.temp_buffer);
264264+ emit_pending_chars ();
265265+ ignore (S.process t.sink Token.EOF)
266266+ | State.Script_data_escape_start
267267+ | State.Script_data_escape_start_dash
268268+ | State.Script_data_escaped
269269+ | State.Script_data_escaped_dash
270270+ | State.Script_data_escaped_dash_dash ->
271271+ emit_pending_chars ();
272272+ ignore (S.process t.sink Token.EOF)
273273+ | State.Script_data_escaped_less_than_sign ->
274274+ emit_char t '<';
275275+ emit_pending_chars ();
276276+ ignore (S.process t.sink Token.EOF)
277277+ | State.Script_data_escaped_end_tag_open ->
278278+ emit_str t "</";
279279+ emit_pending_chars ();
280280+ ignore (S.process t.sink Token.EOF)
281281+ | State.Script_data_escaped_end_tag_name ->
282282+ emit_str t "</";
283283+ emit_str t (Buffer.contents t.temp_buffer);
284284+ emit_pending_chars ();
285285+ ignore (S.process t.sink Token.EOF)
286286+ | State.Script_data_double_escape_start
287287+ | State.Script_data_double_escaped
288288+ | State.Script_data_double_escaped_dash
289289+ | State.Script_data_double_escaped_dash_dash ->
290290+ emit_pending_chars ();
291291+ ignore (S.process t.sink Token.EOF)
292292+ | State.Script_data_double_escaped_less_than_sign ->
293293+ (* '<' was already emitted when entering this state from Script_data_double_escaped *)
294294+ emit_pending_chars ();
295295+ ignore (S.process t.sink Token.EOF)
296296+ | State.Script_data_double_escape_end ->
297297+ emit_pending_chars ();
298298+ ignore (S.process t.sink Token.EOF)
299299+ | State.Plaintext ->
300300+ emit_pending_chars ();
301301+ ignore (S.process t.sink Token.EOF)
302302+ | State.Comment_start
303303+ | State.Comment_start_dash
304304+ | State.Comment
305305+ | State.Comment_less_than_sign
306306+ | State.Comment_less_than_sign_bang
307307+ | State.Comment_less_than_sign_bang_dash
308308+ | State.Comment_less_than_sign_bang_dash_dash
309309+ | State.Comment_end_dash
310310+ | State.Comment_end
311311+ | State.Comment_end_bang ->
312312+ error t "eof-in-comment";
313313+ emit_current_comment ();
314314+ emit_pending_chars ();
315315+ ignore (S.process t.sink Token.EOF)
316316+ | State.Bogus_comment ->
317317+ emit_current_comment ();
318318+ emit_pending_chars ();
319319+ ignore (S.process t.sink Token.EOF)
320320+ | State.Markup_declaration_open ->
321321+ error t "incorrectly-opened-comment";
322322+ Buffer.clear t.current_comment;
323323+ emit_current_comment ();
324324+ emit_pending_chars ();
325325+ ignore (S.process t.sink Token.EOF)
326326+ | State.Doctype
327327+ | State.Before_doctype_name ->
328328+ error t "eof-in-doctype";
329329+ start_new_doctype t;
330330+ t.current_doctype_force_quirks <- true;
331331+ emit_current_doctype ();
332332+ emit_pending_chars ();
333333+ ignore (S.process t.sink Token.EOF)
334334+ | State.Doctype_name
335335+ | State.After_doctype_name
336336+ | State.After_doctype_public_keyword
337337+ | State.Before_doctype_public_identifier
338338+ | State.Doctype_public_identifier_double_quoted
339339+ | State.Doctype_public_identifier_single_quoted
340340+ | State.After_doctype_public_identifier
341341+ | State.Between_doctype_public_and_system_identifiers
342342+ | State.After_doctype_system_keyword
343343+ | State.Before_doctype_system_identifier
344344+ | State.Doctype_system_identifier_double_quoted
345345+ | State.Doctype_system_identifier_single_quoted
346346+ | State.After_doctype_system_identifier ->
347347+ error t "eof-in-doctype";
348348+ t.current_doctype_force_quirks <- true;
349349+ emit_current_doctype ();
350350+ emit_pending_chars ();
351351+ ignore (S.process t.sink Token.EOF)
352352+ | State.Bogus_doctype ->
353353+ emit_current_doctype ();
354354+ emit_pending_chars ();
355355+ ignore (S.process t.sink Token.EOF)
356356+ | State.Cdata_section ->
357357+ error t "eof-in-cdata";
358358+ emit_pending_chars ();
359359+ ignore (S.process t.sink Token.EOF)
360360+ | State.Cdata_section_bracket ->
361361+ error t "eof-in-cdata";
362362+ emit_char t ']';
363363+ emit_pending_chars ();
364364+ ignore (S.process t.sink Token.EOF)
365365+ | State.Cdata_section_end ->
366366+ error t "eof-in-cdata";
367367+ emit_str t "]]";
368368+ emit_pending_chars ();
369369+ ignore (S.process t.sink Token.EOF)
370370+ | State.Character_reference ->
371371+ (* state_character_reference never ran, so initialize temp_buffer with & *)
372372+ Buffer.clear t.temp_buffer;
373373+ Buffer.add_char t.temp_buffer '&';
374374+ flush_code_points_consumed_as_char_ref t;
375375+ t.state <- t.return_state;
376376+ handle_eof ()
377377+ | State.Named_character_reference
378378+ | State.Numeric_character_reference
379379+ | State.Hexadecimal_character_reference_start
380380+ | State.Decimal_character_reference_start
381381+ | State.Numeric_character_reference_end ->
382382+ flush_code_points_consumed_as_char_ref t;
383383+ t.state <- t.return_state;
384384+ handle_eof ()
385385+ | State.Ambiguous_ampersand ->
386386+ (* Buffer was already flushed when entering this state, just transition *)
387387+ t.state <- t.return_state;
388388+ handle_eof ()
389389+ | State.Hexadecimal_character_reference
390390+ | State.Decimal_character_reference ->
391391+ (* At EOF with collected digits - convert the numeric reference *)
392392+ error t "missing-semicolon-after-character-reference";
393393+ let code = t.char_ref_code in
394394+ let replacement_char = "\xEF\xBF\xBD" in
395395+ let result =
396396+ if code = 0 then begin
397397+ error t "null-character-reference";
398398+ replacement_char
399399+ end else if code > 0x10FFFF then begin
400400+ error t "character-reference-outside-unicode-range";
401401+ replacement_char
402402+ end else if code >= 0xD800 && code <= 0xDFFF then begin
403403+ error t "surrogate-character-reference";
404404+ replacement_char
405405+ end else
406406+ Html5rw_entities.Numeric_ref.codepoint_to_utf8 code
407407+ in
408408+ Buffer.clear t.temp_buffer;
409409+ Buffer.add_string t.temp_buffer result;
410410+ flush_code_points_consumed_as_char_ref t;
411411+ t.state <- t.return_state;
412412+ handle_eof ()
413413+414414+ and step () =
415415+ match t.state with
416416+ | State.Data -> state_data ()
417417+ | State.Rcdata -> state_rcdata ()
418418+ | State.Rawtext -> state_rawtext ()
419419+ | State.Script_data -> state_script_data ()
420420+ | State.Plaintext -> state_plaintext ()
421421+ | State.Tag_open -> state_tag_open ()
422422+ | State.End_tag_open -> state_end_tag_open ()
423423+ | State.Tag_name -> state_tag_name ()
424424+ | State.Rcdata_less_than_sign -> state_rcdata_less_than_sign ()
425425+ | State.Rcdata_end_tag_open -> state_rcdata_end_tag_open ()
426426+ | State.Rcdata_end_tag_name -> state_rcdata_end_tag_name ()
427427+ | State.Rawtext_less_than_sign -> state_rawtext_less_than_sign ()
428428+ | State.Rawtext_end_tag_open -> state_rawtext_end_tag_open ()
429429+ | State.Rawtext_end_tag_name -> state_rawtext_end_tag_name ()
430430+ | State.Script_data_less_than_sign -> state_script_data_less_than_sign ()
431431+ | State.Script_data_end_tag_open -> state_script_data_end_tag_open ()
432432+ | State.Script_data_end_tag_name -> state_script_data_end_tag_name ()
433433+ | State.Script_data_escape_start -> state_script_data_escape_start ()
434434+ | State.Script_data_escape_start_dash -> state_script_data_escape_start_dash ()
435435+ | State.Script_data_escaped -> state_script_data_escaped ()
436436+ | State.Script_data_escaped_dash -> state_script_data_escaped_dash ()
437437+ | State.Script_data_escaped_dash_dash -> state_script_data_escaped_dash_dash ()
438438+ | State.Script_data_escaped_less_than_sign -> state_script_data_escaped_less_than_sign ()
439439+ | State.Script_data_escaped_end_tag_open -> state_script_data_escaped_end_tag_open ()
440440+ | State.Script_data_escaped_end_tag_name -> state_script_data_escaped_end_tag_name ()
441441+ | State.Script_data_double_escape_start -> state_script_data_double_escape_start ()
442442+ | State.Script_data_double_escaped -> state_script_data_double_escaped ()
443443+ | State.Script_data_double_escaped_dash -> state_script_data_double_escaped_dash ()
444444+ | State.Script_data_double_escaped_dash_dash -> state_script_data_double_escaped_dash_dash ()
445445+ | State.Script_data_double_escaped_less_than_sign -> state_script_data_double_escaped_less_than_sign ()
446446+ | State.Script_data_double_escape_end -> state_script_data_double_escape_end ()
447447+ | State.Before_attribute_name -> state_before_attribute_name ()
448448+ | State.Attribute_name -> state_attribute_name ()
449449+ | State.After_attribute_name -> state_after_attribute_name ()
450450+ | State.Before_attribute_value -> state_before_attribute_value ()
451451+ | State.Attribute_value_double_quoted -> state_attribute_value_double_quoted ()
452452+ | State.Attribute_value_single_quoted -> state_attribute_value_single_quoted ()
453453+ | State.Attribute_value_unquoted -> state_attribute_value_unquoted ()
454454+ | State.After_attribute_value_quoted -> state_after_attribute_value_quoted ()
455455+ | State.Self_closing_start_tag -> state_self_closing_start_tag ()
456456+ | State.Bogus_comment -> state_bogus_comment ()
457457+ | State.Markup_declaration_open -> state_markup_declaration_open ()
458458+ | State.Comment_start -> state_comment_start ()
459459+ | State.Comment_start_dash -> state_comment_start_dash ()
460460+ | State.Comment -> state_comment ()
461461+ | State.Comment_less_than_sign -> state_comment_less_than_sign ()
462462+ | State.Comment_less_than_sign_bang -> state_comment_less_than_sign_bang ()
463463+ | State.Comment_less_than_sign_bang_dash -> state_comment_less_than_sign_bang_dash ()
464464+ | State.Comment_less_than_sign_bang_dash_dash -> state_comment_less_than_sign_bang_dash_dash ()
465465+ | State.Comment_end_dash -> state_comment_end_dash ()
466466+ | State.Comment_end -> state_comment_end ()
467467+ | State.Comment_end_bang -> state_comment_end_bang ()
468468+ | State.Doctype -> state_doctype ()
469469+ | State.Before_doctype_name -> state_before_doctype_name ()
470470+ | State.Doctype_name -> state_doctype_name ()
471471+ | State.After_doctype_name -> state_after_doctype_name ()
472472+ | State.After_doctype_public_keyword -> state_after_doctype_public_keyword ()
473473+ | State.Before_doctype_public_identifier -> state_before_doctype_public_identifier ()
474474+ | State.Doctype_public_identifier_double_quoted -> state_doctype_public_identifier_double_quoted ()
475475+ | State.Doctype_public_identifier_single_quoted -> state_doctype_public_identifier_single_quoted ()
476476+ | State.After_doctype_public_identifier -> state_after_doctype_public_identifier ()
477477+ | State.Between_doctype_public_and_system_identifiers -> state_between_doctype_public_and_system_identifiers ()
478478+ | State.After_doctype_system_keyword -> state_after_doctype_system_keyword ()
479479+ | State.Before_doctype_system_identifier -> state_before_doctype_system_identifier ()
480480+ | State.Doctype_system_identifier_double_quoted -> state_doctype_system_identifier_double_quoted ()
481481+ | State.Doctype_system_identifier_single_quoted -> state_doctype_system_identifier_single_quoted ()
482482+ | State.After_doctype_system_identifier -> state_after_doctype_system_identifier ()
483483+ | State.Bogus_doctype -> state_bogus_doctype ()
484484+ | State.Cdata_section -> state_cdata_section ()
485485+ | State.Cdata_section_bracket -> state_cdata_section_bracket ()
486486+ | State.Cdata_section_end -> state_cdata_section_end ()
487487+ | State.Character_reference -> state_character_reference ()
488488+ | State.Named_character_reference -> state_named_character_reference ()
489489+ | State.Ambiguous_ampersand -> state_ambiguous_ampersand ()
490490+ | State.Numeric_character_reference -> state_numeric_character_reference ()
491491+ | State.Hexadecimal_character_reference_start -> state_hexadecimal_character_reference_start ()
492492+ | State.Decimal_character_reference_start -> state_decimal_character_reference_start ()
493493+ | State.Hexadecimal_character_reference -> state_hexadecimal_character_reference ()
494494+ | State.Decimal_character_reference -> state_decimal_character_reference ()
495495+ | State.Numeric_character_reference_end -> state_numeric_character_reference_end ()
496496+497497+ (* State implementations *)
498498+ and state_data () =
499499+ match Stream.consume t.stream with
500500+ | Some '&' ->
501501+ t.return_state <- State.Data;
502502+ t.state <- State.Character_reference
503503+ | Some '<' ->
504504+ t.state <- State.Tag_open
505505+ | Some '\x00' ->
506506+ (* Emit pending chars first, then emit null separately for proper tree builder handling *)
507507+ emit_pending_chars ();
508508+ error t "unexpected-null-character";
509509+ ignore (S.process t.sink (Token.Character "\x00"))
510510+ | Some c ->
511511+ emit_char t c
512512+ | None -> ()
513513+514514+ and state_rcdata () =
515515+ match Stream.consume t.stream with
516516+ | Some '&' ->
517517+ t.return_state <- State.Rcdata;
518518+ t.state <- State.Character_reference
519519+ | Some '<' ->
520520+ t.state <- State.Rcdata_less_than_sign
521521+ | Some '\x00' ->
522522+ error t "unexpected-null-character";
523523+ emit_str t "\xEF\xBF\xBD"
524524+ | Some c ->
525525+ emit_char t c
526526+ | None -> ()
527527+528528+ and state_rawtext () =
529529+ match Stream.consume t.stream with
530530+ | Some '<' ->
531531+ t.state <- State.Rawtext_less_than_sign
532532+ | Some '\x00' ->
533533+ error t "unexpected-null-character";
534534+ emit_str t "\xEF\xBF\xBD"
535535+ | Some c ->
536536+ emit_char t c
537537+ | None -> ()
538538+539539+ and state_script_data () =
540540+ match Stream.consume t.stream with
541541+ | Some '<' ->
542542+ t.state <- State.Script_data_less_than_sign
543543+ | Some '\x00' ->
544544+ error t "unexpected-null-character";
545545+ emit_str t "\xEF\xBF\xBD"
546546+ | Some c ->
547547+ emit_char t c
548548+ | None -> ()
549549+550550+ and state_plaintext () =
551551+ match Stream.consume t.stream with
552552+ | Some '\x00' ->
553553+ error t "unexpected-null-character";
554554+ emit_str t "\xEF\xBF\xBD"
555555+ | Some c ->
556556+ emit_char t c
557557+ | None -> ()
558558+559559+ and state_tag_open () =
560560+ match Stream.peek t.stream with
561561+ | Some '!' ->
562562+ Stream.advance t.stream;
563563+ t.state <- State.Markup_declaration_open
564564+ | Some '/' ->
565565+ Stream.advance t.stream;
566566+ t.state <- State.End_tag_open
567567+ | Some c when is_ascii_alpha c ->
568568+ start_new_tag t Token.Start;
569569+ t.state <- State.Tag_name
570570+ | Some '?' ->
571571+ error t "unexpected-question-mark-instead-of-tag-name";
572572+ Buffer.clear t.current_comment;
573573+ t.state <- State.Bogus_comment
574574+ | None ->
575575+ error t "eof-before-tag-name";
576576+ emit_char t '<'
577577+ | Some _ ->
578578+ error t "invalid-first-character-of-tag-name";
579579+ emit_char t '<';
580580+ t.state <- State.Data
581581+582582+ and state_end_tag_open () =
583583+ match Stream.peek t.stream with
584584+ | Some c when is_ascii_alpha c ->
585585+ start_new_tag t Token.End;
586586+ t.state <- State.Tag_name
587587+ | Some '>' ->
588588+ Stream.advance t.stream;
589589+ error t "missing-end-tag-name";
590590+ t.state <- State.Data
591591+ | None ->
592592+ error t "eof-before-tag-name";
593593+ emit_str t "</"
594594+ | Some _ ->
595595+ error t "invalid-first-character-of-tag-name";
596596+ Buffer.clear t.current_comment;
597597+ t.state <- State.Bogus_comment
598598+599599+ and state_tag_name () =
600600+ match Stream.consume t.stream with
601601+ | Some ('\t' | '\n' | '\x0C' | ' ') ->
602602+ t.state <- State.Before_attribute_name
603603+ | Some '/' ->
604604+ t.state <- State.Self_closing_start_tag
605605+ | Some '>' ->
606606+ t.state <- State.Data;
607607+ emit_current_tag ()
608608+ | Some '\x00' ->
609609+ error t "unexpected-null-character";
610610+ Buffer.add_string t.current_tag_name "\xEF\xBF\xBD"
611611+ | Some c ->
612612+ Buffer.add_char t.current_tag_name (ascii_lower c)
613613+ | None -> ()
614614+615615+ and state_rcdata_less_than_sign () =
616616+ match Stream.peek t.stream with
617617+ | Some '/' ->
618618+ Stream.advance t.stream;
619619+ Buffer.clear t.temp_buffer;
620620+ t.state <- State.Rcdata_end_tag_open
621621+ | _ ->
622622+ emit_char t '<';
623623+ t.state <- State.Rcdata
624624+625625+ and state_rcdata_end_tag_open () =
626626+ match Stream.peek t.stream with
627627+ | Some c when is_ascii_alpha c ->
628628+ start_new_tag t Token.End;
629629+ t.state <- State.Rcdata_end_tag_name
630630+ | _ ->
631631+ emit_str t "</";
632632+ t.state <- State.Rcdata
633633+634634+ and state_rcdata_end_tag_name () =
635635+ match Stream.peek t.stream with
636636+ | Some ('\t' | '\n' | '\x0C' | ' ') when is_appropriate_end_tag t ->
637637+ Stream.advance t.stream;
638638+ t.state <- State.Before_attribute_name
639639+ | Some '/' when is_appropriate_end_tag t ->
640640+ Stream.advance t.stream;
641641+ t.state <- State.Self_closing_start_tag
642642+ | Some '>' when is_appropriate_end_tag t ->
643643+ Stream.advance t.stream;
644644+ t.state <- State.Data;
645645+ emit_current_tag ()
646646+ | Some c when is_ascii_alpha c ->
647647+ Stream.advance t.stream;
648648+ Buffer.add_char t.current_tag_name (ascii_lower c);
649649+ Buffer.add_char t.temp_buffer c
650650+ | _ ->
651651+ emit_str t "</";
652652+ emit_str t (Buffer.contents t.temp_buffer);
653653+ t.state <- State.Rcdata
654654+655655+ and state_rawtext_less_than_sign () =
656656+ match Stream.peek t.stream with
657657+ | Some '/' ->
658658+ Stream.advance t.stream;
659659+ Buffer.clear t.temp_buffer;
660660+ t.state <- State.Rawtext_end_tag_open
661661+ | _ ->
662662+ emit_char t '<';
663663+ t.state <- State.Rawtext
664664+665665+ and state_rawtext_end_tag_open () =
666666+ match Stream.peek t.stream with
667667+ | Some c when is_ascii_alpha c ->
668668+ start_new_tag t Token.End;
669669+ t.state <- State.Rawtext_end_tag_name
670670+ | _ ->
671671+ emit_str t "</";
672672+ t.state <- State.Rawtext
673673+674674+ and state_rawtext_end_tag_name () =
675675+ match Stream.peek t.stream with
676676+ | Some ('\t' | '\n' | '\x0C' | ' ') when is_appropriate_end_tag t ->
677677+ Stream.advance t.stream;
678678+ t.state <- State.Before_attribute_name
679679+ | Some '/' when is_appropriate_end_tag t ->
680680+ Stream.advance t.stream;
681681+ t.state <- State.Self_closing_start_tag
682682+ | Some '>' when is_appropriate_end_tag t ->
683683+ Stream.advance t.stream;
684684+ t.state <- State.Data;
685685+ emit_current_tag ()
686686+ | Some c when is_ascii_alpha c ->
687687+ Stream.advance t.stream;
688688+ Buffer.add_char t.current_tag_name (ascii_lower c);
689689+ Buffer.add_char t.temp_buffer c
690690+ | _ ->
691691+ emit_str t "</";
692692+ emit_str t (Buffer.contents t.temp_buffer);
693693+ t.state <- State.Rawtext
694694+695695+ and state_script_data_less_than_sign () =
696696+ match Stream.peek t.stream with
697697+ | Some '/' ->
698698+ Stream.advance t.stream;
699699+ Buffer.clear t.temp_buffer;
700700+ t.state <- State.Script_data_end_tag_open
701701+ | Some '!' ->
702702+ Stream.advance t.stream;
703703+ t.state <- State.Script_data_escape_start;
704704+ emit_str t "<!"
705705+ | _ ->
706706+ emit_char t '<';
707707+ t.state <- State.Script_data
708708+709709+ and state_script_data_end_tag_open () =
710710+ match Stream.peek t.stream with
711711+ | Some c when is_ascii_alpha c ->
712712+ start_new_tag t Token.End;
713713+ t.state <- State.Script_data_end_tag_name
714714+ | _ ->
715715+ emit_str t "</";
716716+ t.state <- State.Script_data
717717+718718+ and state_script_data_end_tag_name () =
719719+ match Stream.peek t.stream with
720720+ | Some ('\t' | '\n' | '\x0C' | ' ') when is_appropriate_end_tag t ->
721721+ Stream.advance t.stream;
722722+ t.state <- State.Before_attribute_name
723723+ | Some '/' when is_appropriate_end_tag t ->
724724+ Stream.advance t.stream;
725725+ t.state <- State.Self_closing_start_tag
726726+ | Some '>' when is_appropriate_end_tag t ->
727727+ Stream.advance t.stream;
728728+ t.state <- State.Data;
729729+ emit_current_tag ()
730730+ | Some c when is_ascii_alpha c ->
731731+ Stream.advance t.stream;
732732+ Buffer.add_char t.current_tag_name (ascii_lower c);
733733+ Buffer.add_char t.temp_buffer c
734734+ | _ ->
735735+ emit_str t "</";
736736+ emit_str t (Buffer.contents t.temp_buffer);
737737+ t.state <- State.Script_data
738738+739739+ and state_script_data_escape_start () =
740740+ match Stream.peek t.stream with
741741+ | Some '-' ->
742742+ Stream.advance t.stream;
743743+ t.state <- State.Script_data_escape_start_dash;
744744+ emit_char t '-'
745745+ | _ ->
746746+ t.state <- State.Script_data
747747+748748+ and state_script_data_escape_start_dash () =
749749+ match Stream.peek t.stream with
750750+ | Some '-' ->
751751+ Stream.advance t.stream;
752752+ t.state <- State.Script_data_escaped_dash_dash;
753753+ emit_char t '-'
754754+ | _ ->
755755+ t.state <- State.Script_data
756756+757757+ and state_script_data_escaped () =
758758+ match Stream.consume t.stream with
759759+ | Some '-' ->
760760+ t.state <- State.Script_data_escaped_dash;
761761+ emit_char t '-'
762762+ | Some '<' ->
763763+ t.state <- State.Script_data_escaped_less_than_sign
764764+ | Some '\x00' ->
765765+ error t "unexpected-null-character";
766766+ emit_str t "\xEF\xBF\xBD"
767767+ | Some c ->
768768+ emit_char t c
769769+ | None -> ()
770770+771771+ and state_script_data_escaped_dash () =
772772+ match Stream.consume t.stream with
773773+ | Some '-' ->
774774+ t.state <- State.Script_data_escaped_dash_dash;
775775+ emit_char t '-'
776776+ | Some '<' ->
777777+ t.state <- State.Script_data_escaped_less_than_sign
778778+ | Some '\x00' ->
779779+ error t "unexpected-null-character";
780780+ t.state <- State.Script_data_escaped;
781781+ emit_str t "\xEF\xBF\xBD"
782782+ | Some c ->
783783+ t.state <- State.Script_data_escaped;
784784+ emit_char t c
785785+ | None -> ()
786786+787787+ and state_script_data_escaped_dash_dash () =
788788+ match Stream.consume t.stream with
789789+ | Some '-' ->
790790+ emit_char t '-'
791791+ | Some '<' ->
792792+ t.state <- State.Script_data_escaped_less_than_sign
793793+ | Some '>' ->
794794+ t.state <- State.Script_data;
795795+ emit_char t '>'
796796+ | Some '\x00' ->
797797+ error t "unexpected-null-character";
798798+ t.state <- State.Script_data_escaped;
799799+ emit_str t "\xEF\xBF\xBD"
800800+ | Some c ->
801801+ t.state <- State.Script_data_escaped;
802802+ emit_char t c
803803+ | None -> ()
804804+805805+ and state_script_data_escaped_less_than_sign () =
806806+ match Stream.peek t.stream with
807807+ | Some '/' ->
808808+ Stream.advance t.stream;
809809+ Buffer.clear t.temp_buffer;
810810+ t.state <- State.Script_data_escaped_end_tag_open
811811+ | Some c when is_ascii_alpha c ->
812812+ Buffer.clear t.temp_buffer;
813813+ emit_char t '<';
814814+ t.state <- State.Script_data_double_escape_start
815815+ | _ ->
816816+ emit_char t '<';
817817+ t.state <- State.Script_data_escaped
818818+819819+ and state_script_data_escaped_end_tag_open () =
820820+ match Stream.peek t.stream with
821821+ | Some c when is_ascii_alpha c ->
822822+ start_new_tag t Token.End;
823823+ t.state <- State.Script_data_escaped_end_tag_name
824824+ | _ ->
825825+ emit_str t "</";
826826+ t.state <- State.Script_data_escaped
827827+828828+ and state_script_data_escaped_end_tag_name () =
829829+ match Stream.peek t.stream with
830830+ | Some ('\t' | '\n' | '\x0C' | ' ') when is_appropriate_end_tag t ->
831831+ Stream.advance t.stream;
832832+ t.state <- State.Before_attribute_name
833833+ | Some '/' when is_appropriate_end_tag t ->
834834+ Stream.advance t.stream;
835835+ t.state <- State.Self_closing_start_tag
836836+ | Some '>' when is_appropriate_end_tag t ->
837837+ Stream.advance t.stream;
838838+ t.state <- State.Data;
839839+ emit_current_tag ()
840840+ | Some c when is_ascii_alpha c ->
841841+ Stream.advance t.stream;
842842+ Buffer.add_char t.current_tag_name (ascii_lower c);
843843+ Buffer.add_char t.temp_buffer c
844844+ | _ ->
845845+ emit_str t "</";
846846+ emit_str t (Buffer.contents t.temp_buffer);
847847+ t.state <- State.Script_data_escaped
848848+849849+ and state_script_data_double_escape_start () =
850850+ match Stream.peek t.stream with
851851+ | Some ('\t' | '\n' | '\x0C' | ' ' | '/' | '>') as c_opt ->
852852+ Stream.advance t.stream;
853853+ let c = Option.get c_opt in
854854+ if Buffer.contents t.temp_buffer = "script" then
855855+ t.state <- State.Script_data_double_escaped
856856+ else
857857+ t.state <- State.Script_data_escaped;
858858+ emit_char t c
859859+ | Some c when is_ascii_alpha c ->
860860+ Stream.advance t.stream;
861861+ Buffer.add_char t.temp_buffer (ascii_lower c);
862862+ emit_char t c
863863+ | _ ->
864864+ t.state <- State.Script_data_escaped
865865+866866+ and state_script_data_double_escaped () =
867867+ match Stream.consume t.stream with
868868+ | Some '-' ->
869869+ t.state <- State.Script_data_double_escaped_dash;
870870+ emit_char t '-'
871871+ | Some '<' ->
872872+ t.state <- State.Script_data_double_escaped_less_than_sign;
873873+ emit_char t '<'
874874+ | Some '\x00' ->
875875+ error t "unexpected-null-character";
876876+ emit_str t "\xEF\xBF\xBD"
877877+ | Some c ->
878878+ emit_char t c
879879+ | None -> ()
880880+881881+ and state_script_data_double_escaped_dash () =
882882+ match Stream.consume t.stream with
883883+ | Some '-' ->
884884+ t.state <- State.Script_data_double_escaped_dash_dash;
885885+ emit_char t '-'
886886+ | Some '<' ->
887887+ t.state <- State.Script_data_double_escaped_less_than_sign;
888888+ emit_char t '<'
889889+ | Some '\x00' ->
890890+ error t "unexpected-null-character";
891891+ t.state <- State.Script_data_double_escaped;
892892+ emit_str t "\xEF\xBF\xBD"
893893+ | Some c ->
894894+ t.state <- State.Script_data_double_escaped;
895895+ emit_char t c
896896+ | None -> ()
897897+898898+ and state_script_data_double_escaped_dash_dash () =
899899+ match Stream.consume t.stream with
900900+ | Some '-' ->
901901+ emit_char t '-'
902902+ | Some '<' ->
903903+ t.state <- State.Script_data_double_escaped_less_than_sign;
904904+ emit_char t '<'
905905+ | Some '>' ->
906906+ t.state <- State.Script_data;
907907+ emit_char t '>'
908908+ | Some '\x00' ->
909909+ error t "unexpected-null-character";
910910+ t.state <- State.Script_data_double_escaped;
911911+ emit_str t "\xEF\xBF\xBD"
912912+ | Some c ->
913913+ t.state <- State.Script_data_double_escaped;
914914+ emit_char t c
915915+ | None -> ()
916916+917917+ and state_script_data_double_escaped_less_than_sign () =
918918+ match Stream.peek t.stream with
919919+ | Some '/' ->
920920+ Stream.advance t.stream;
921921+ Buffer.clear t.temp_buffer;
922922+ t.state <- State.Script_data_double_escape_end;
923923+ emit_char t '/'
924924+ | _ ->
925925+ t.state <- State.Script_data_double_escaped
926926+927927+ and state_script_data_double_escape_end () =
928928+ match Stream.peek t.stream with
929929+ | Some ('\t' | '\n' | '\x0C' | ' ' | '/' | '>') as c_opt ->
930930+ Stream.advance t.stream;
931931+ let c = Option.get c_opt in
932932+ if Buffer.contents t.temp_buffer = "script" then
933933+ t.state <- State.Script_data_escaped
934934+ else
935935+ t.state <- State.Script_data_double_escaped;
936936+ emit_char t c
937937+ | Some c when is_ascii_alpha c ->
938938+ Stream.advance t.stream;
939939+ Buffer.add_char t.temp_buffer (ascii_lower c);
940940+ emit_char t c
941941+ | _ ->
942942+ t.state <- State.Script_data_double_escaped
943943+944944+ and state_before_attribute_name () =
945945+ match Stream.peek t.stream with
946946+ | Some ('\t' | '\n' | '\x0C' | ' ') ->
947947+ Stream.advance t.stream
948948+ | Some '/' | Some '>' | None ->
949949+ t.state <- State.After_attribute_name
950950+ | Some '=' ->
951951+ Stream.advance t.stream;
952952+ error t "unexpected-equals-sign-before-attribute-name";
953953+ start_new_attribute t;
954954+ Buffer.add_char t.current_attr_name '=';
955955+ t.state <- State.Attribute_name
956956+ | Some _ ->
957957+ start_new_attribute t;
958958+ t.state <- State.Attribute_name
959959+960960+ and state_attribute_name () =
961961+ match Stream.peek t.stream with
962962+ | Some ('\t' | '\n' | '\x0C' | ' ') ->
963963+ Stream.advance t.stream;
964964+ t.state <- State.After_attribute_name
965965+ | Some '/' | Some '>' | None ->
966966+ t.state <- State.After_attribute_name
967967+ | Some '=' ->
968968+ Stream.advance t.stream;
969969+ t.state <- State.Before_attribute_value
970970+ | Some '\x00' ->
971971+ Stream.advance t.stream;
972972+ error t "unexpected-null-character";
973973+ Buffer.add_string t.current_attr_name "\xEF\xBF\xBD"
974974+ | Some ('"' | '\'' | '<') as c_opt ->
975975+ Stream.advance t.stream;
976976+ error t "unexpected-character-in-attribute-name";
977977+ Buffer.add_char t.current_attr_name (Option.get c_opt)
978978+ | Some c ->
979979+ Stream.advance t.stream;
980980+ Buffer.add_char t.current_attr_name (ascii_lower c)
981981+982982+ and state_after_attribute_name () =
983983+ match Stream.peek t.stream with
984984+ | Some ('\t' | '\n' | '\x0C' | ' ') ->
985985+ Stream.advance t.stream
986986+ | Some '/' ->
987987+ Stream.advance t.stream;
988988+ t.state <- State.Self_closing_start_tag
989989+ | Some '=' ->
990990+ Stream.advance t.stream;
991991+ t.state <- State.Before_attribute_value
992992+ | Some '>' ->
993993+ Stream.advance t.stream;
994994+ t.state <- State.Data;
995995+ emit_current_tag ()
996996+ | None -> ()
997997+ | Some _ ->
998998+ start_new_attribute t;
999999+ t.state <- State.Attribute_name
10001000+10011001+ and state_before_attribute_value () =
10021002+ match Stream.peek t.stream with
10031003+ | Some ('\t' | '\n' | '\x0C' | ' ') ->
10041004+ Stream.advance t.stream
10051005+ | Some '"' ->
10061006+ Stream.advance t.stream;
10071007+ t.state <- State.Attribute_value_double_quoted
10081008+ | Some '\'' ->
10091009+ Stream.advance t.stream;
10101010+ t.state <- State.Attribute_value_single_quoted
10111011+ | Some '>' ->
10121012+ Stream.advance t.stream;
10131013+ error t "missing-attribute-value";
10141014+ t.state <- State.Data;
10151015+ emit_current_tag ()
10161016+ | _ ->
10171017+ t.state <- State.Attribute_value_unquoted
10181018+10191019+ and state_attribute_value_double_quoted () =
10201020+ match Stream.consume t.stream with
10211021+ | Some '"' ->
10221022+ t.state <- State.After_attribute_value_quoted
10231023+ | Some '&' ->
10241024+ t.return_state <- State.Attribute_value_double_quoted;
10251025+ t.state <- State.Character_reference
10261026+ | Some '\x00' ->
10271027+ error t "unexpected-null-character";
10281028+ Buffer.add_string t.current_attr_value "\xEF\xBF\xBD"
10291029+ | Some c ->
10301030+ Buffer.add_char t.current_attr_value c
10311031+ | None -> ()
10321032+10331033+ and state_attribute_value_single_quoted () =
10341034+ match Stream.consume t.stream with
10351035+ | Some '\'' ->
10361036+ t.state <- State.After_attribute_value_quoted
10371037+ | Some '&' ->
10381038+ t.return_state <- State.Attribute_value_single_quoted;
10391039+ t.state <- State.Character_reference
10401040+ | Some '\x00' ->
10411041+ error t "unexpected-null-character";
10421042+ Buffer.add_string t.current_attr_value "\xEF\xBF\xBD"
10431043+ | Some c ->
10441044+ Buffer.add_char t.current_attr_value c
10451045+ | None -> ()
10461046+10471047+ and state_attribute_value_unquoted () =
10481048+ match Stream.peek t.stream with
10491049+ | Some ('\t' | '\n' | '\x0C' | ' ') ->
10501050+ Stream.advance t.stream;
10511051+ t.state <- State.Before_attribute_name
10521052+ | Some '&' ->
10531053+ Stream.advance t.stream;
10541054+ t.return_state <- State.Attribute_value_unquoted;
10551055+ t.state <- State.Character_reference
10561056+ | Some '>' ->
10571057+ Stream.advance t.stream;
10581058+ t.state <- State.Data;
10591059+ emit_current_tag ()
10601060+ | Some '\x00' ->
10611061+ Stream.advance t.stream;
10621062+ error t "unexpected-null-character";
10631063+ Buffer.add_string t.current_attr_value "\xEF\xBF\xBD"
10641064+ | Some ('"' | '\'' | '<' | '=' | '`') as c_opt ->
10651065+ Stream.advance t.stream;
10661066+ error t "unexpected-character-in-unquoted-attribute-value";
10671067+ Buffer.add_char t.current_attr_value (Option.get c_opt)
10681068+ | Some c ->
10691069+ Stream.advance t.stream;
10701070+ Buffer.add_char t.current_attr_value c
10711071+ | None -> ()
10721072+10731073+ and state_after_attribute_value_quoted () =
10741074+ match Stream.peek t.stream with
10751075+ | Some ('\t' | '\n' | '\x0C' | ' ') ->
10761076+ Stream.advance t.stream;
10771077+ t.state <- State.Before_attribute_name
10781078+ | Some '/' ->
10791079+ Stream.advance t.stream;
10801080+ t.state <- State.Self_closing_start_tag
10811081+ | Some '>' ->
10821082+ Stream.advance t.stream;
10831083+ t.state <- State.Data;
10841084+ emit_current_tag ()
10851085+ | None -> ()
10861086+ | Some _ ->
10871087+ error t "missing-whitespace-between-attributes";
10881088+ t.state <- State.Before_attribute_name
10891089+10901090+ and state_self_closing_start_tag () =
10911091+ match Stream.peek t.stream with
10921092+ | Some '>' ->
10931093+ Stream.advance t.stream;
10941094+ t.current_tag_self_closing <- true;
10951095+ t.state <- State.Data;
10961096+ emit_current_tag ()
10971097+ | None -> ()
10981098+ | Some _ ->
10991099+ error t "unexpected-solidus-in-tag";
11001100+ t.state <- State.Before_attribute_name
11011101+11021102+ and state_bogus_comment () =
11031103+ match Stream.consume t.stream with
11041104+ | Some '>' ->
11051105+ t.state <- State.Data;
11061106+ emit_current_comment ()
11071107+ | Some '\x00' ->
11081108+ error t "unexpected-null-character";
11091109+ Buffer.add_string t.current_comment "\xEF\xBF\xBD"
11101110+ | Some c ->
11111111+ Buffer.add_char t.current_comment c
11121112+ | None -> ()
11131113+11141114+ and state_markup_declaration_open () =
11151115+ if Stream.matches_ci t.stream "--" then begin
11161116+ ignore (Stream.consume_exact_ci t.stream "--");
11171117+ Buffer.clear t.current_comment;
11181118+ t.state <- State.Comment_start
11191119+ end else if Stream.matches_ci t.stream "DOCTYPE" then begin
11201120+ ignore (Stream.consume_exact_ci t.stream "DOCTYPE");
11211121+ t.state <- State.Doctype
11221122+ end else if Stream.matches_ci t.stream "[CDATA[" then begin
11231123+ ignore (Stream.consume_exact_ci t.stream "[CDATA[");
11241124+ (* CDATA only allowed in foreign content *)
11251125+ if S.adjusted_current_node_in_html_namespace t.sink then begin
11261126+ error t "cdata-in-html-content";
11271127+ Buffer.clear t.current_comment;
11281128+ Buffer.add_string t.current_comment "[CDATA[";
11291129+ t.state <- State.Bogus_comment
11301130+ end else
11311131+ t.state <- State.Cdata_section
11321132+ end else begin
11331133+ error t "incorrectly-opened-comment";
11341134+ Buffer.clear t.current_comment;
11351135+ t.state <- State.Bogus_comment
11361136+ end
11371137+11381138+ and state_comment_start () =
11391139+ match Stream.peek t.stream with
11401140+ | Some '-' ->
11411141+ Stream.advance t.stream;
11421142+ t.state <- State.Comment_start_dash
11431143+ | Some '>' ->
11441144+ Stream.advance t.stream;
11451145+ error t "abrupt-closing-of-empty-comment";
11461146+ t.state <- State.Data;
11471147+ emit_current_comment ()
11481148+ | _ ->
11491149+ t.state <- State.Comment
11501150+11511151+ and state_comment_start_dash () =
11521152+ match Stream.peek t.stream with
11531153+ | Some '-' ->
11541154+ Stream.advance t.stream;
11551155+ t.state <- State.Comment_end
11561156+ | Some '>' ->
11571157+ Stream.advance t.stream;
11581158+ error t "abrupt-closing-of-empty-comment";
11591159+ t.state <- State.Data;
11601160+ emit_current_comment ()
11611161+ | None -> ()
11621162+ | Some _ ->
11631163+ Buffer.add_char t.current_comment '-';
11641164+ t.state <- State.Comment
11651165+11661166+ and state_comment () =
11671167+ match Stream.consume t.stream with
11681168+ | Some '<' ->
11691169+ Buffer.add_char t.current_comment '<';
11701170+ t.state <- State.Comment_less_than_sign
11711171+ | Some '-' ->
11721172+ t.state <- State.Comment_end_dash
11731173+ | Some '\x00' ->
11741174+ error t "unexpected-null-character";
11751175+ Buffer.add_string t.current_comment "\xEF\xBF\xBD"
11761176+ | Some c ->
11771177+ Buffer.add_char t.current_comment c
11781178+ | None -> ()
11791179+11801180+ and state_comment_less_than_sign () =
11811181+ match Stream.peek t.stream with
11821182+ | Some '!' ->
11831183+ Stream.advance t.stream;
11841184+ Buffer.add_char t.current_comment '!';
11851185+ t.state <- State.Comment_less_than_sign_bang
11861186+ | Some '<' ->
11871187+ Stream.advance t.stream;
11881188+ Buffer.add_char t.current_comment '<'
11891189+ | _ ->
11901190+ t.state <- State.Comment
11911191+11921192+ and state_comment_less_than_sign_bang () =
11931193+ match Stream.peek t.stream with
11941194+ | Some '-' ->
11951195+ Stream.advance t.stream;
11961196+ t.state <- State.Comment_less_than_sign_bang_dash
11971197+ | _ ->
11981198+ t.state <- State.Comment
11991199+12001200+ and state_comment_less_than_sign_bang_dash () =
12011201+ match Stream.peek t.stream with
12021202+ | Some '-' ->
12031203+ Stream.advance t.stream;
12041204+ t.state <- State.Comment_less_than_sign_bang_dash_dash
12051205+ | _ ->
12061206+ t.state <- State.Comment_end_dash
12071207+12081208+ and state_comment_less_than_sign_bang_dash_dash () =
12091209+ match Stream.peek t.stream with
12101210+ | Some '>' | None ->
12111211+ t.state <- State.Comment_end
12121212+ | Some _ ->
12131213+ error t "nested-comment";
12141214+ t.state <- State.Comment_end
12151215+12161216+ and state_comment_end_dash () =
12171217+ match Stream.peek t.stream with
12181218+ | Some '-' ->
12191219+ Stream.advance t.stream;
12201220+ t.state <- State.Comment_end
12211221+ | None -> ()
12221222+ | Some _ ->
12231223+ Buffer.add_char t.current_comment '-';
12241224+ t.state <- State.Comment
12251225+12261226+ and state_comment_end () =
12271227+ match Stream.peek t.stream with
12281228+ | Some '>' ->
12291229+ Stream.advance t.stream;
12301230+ t.state <- State.Data;
12311231+ emit_current_comment ()
12321232+ | Some '!' ->
12331233+ Stream.advance t.stream;
12341234+ t.state <- State.Comment_end_bang
12351235+ | Some '-' ->
12361236+ Stream.advance t.stream;
12371237+ Buffer.add_char t.current_comment '-'
12381238+ | None -> ()
12391239+ | Some _ ->
12401240+ Buffer.add_string t.current_comment "--";
12411241+ t.state <- State.Comment
12421242+12431243+ and state_comment_end_bang () =
12441244+ match Stream.peek t.stream with
12451245+ | Some '-' ->
12461246+ Stream.advance t.stream;
12471247+ Buffer.add_string t.current_comment "--!";
12481248+ t.state <- State.Comment_end_dash
12491249+ | Some '>' ->
12501250+ Stream.advance t.stream;
12511251+ error t "incorrectly-closed-comment";
12521252+ t.state <- State.Data;
12531253+ emit_current_comment ()
12541254+ | None -> ()
12551255+ | Some _ ->
12561256+ Buffer.add_string t.current_comment "--!";
12571257+ t.state <- State.Comment
12581258+12591259+ and state_doctype () =
12601260+ match Stream.peek t.stream with
12611261+ | Some ('\t' | '\n' | '\x0C' | ' ') ->
12621262+ Stream.advance t.stream;
12631263+ t.state <- State.Before_doctype_name
12641264+ | Some '>' ->
12651265+ t.state <- State.Before_doctype_name
12661266+ | None -> ()
12671267+ | Some _ ->
12681268+ error t "missing-whitespace-before-doctype-name";
12691269+ t.state <- State.Before_doctype_name
12701270+12711271+ and state_before_doctype_name () =
12721272+ match Stream.peek t.stream with
12731273+ | Some ('\t' | '\n' | '\x0C' | ' ') ->
12741274+ Stream.advance t.stream
12751275+ | Some '\x00' ->
12761276+ Stream.advance t.stream;
12771277+ error t "unexpected-null-character";
12781278+ start_new_doctype t;
12791279+ t.current_doctype_name <- Some (Buffer.create 8);
12801280+ Buffer.add_string (Option.get t.current_doctype_name) "\xEF\xBF\xBD";
12811281+ t.state <- State.Doctype_name
12821282+ | Some '>' ->
12831283+ Stream.advance t.stream;
12841284+ error t "missing-doctype-name";
12851285+ start_new_doctype t;
12861286+ t.current_doctype_force_quirks <- true;
12871287+ t.state <- State.Data;
12881288+ emit_current_doctype ()
12891289+ | None -> ()
12901290+ | Some c ->
12911291+ Stream.advance t.stream;
12921292+ start_new_doctype t;
12931293+ t.current_doctype_name <- Some (Buffer.create 8);
12941294+ Buffer.add_char (Option.get t.current_doctype_name) (ascii_lower c);
12951295+ t.state <- State.Doctype_name
12961296+12971297+ and state_doctype_name () =
12981298+ match Stream.consume t.stream with
12991299+ | Some ('\t' | '\n' | '\x0C' | ' ') ->
13001300+ t.state <- State.After_doctype_name
13011301+ | Some '>' ->
13021302+ t.state <- State.Data;
13031303+ emit_current_doctype ()
13041304+ | Some '\x00' ->
13051305+ error t "unexpected-null-character";
13061306+ Buffer.add_string (Option.get t.current_doctype_name) "\xEF\xBF\xBD"
13071307+ | Some c ->
13081308+ Buffer.add_char (Option.get t.current_doctype_name) (ascii_lower c)
13091309+ | None -> ()
13101310+13111311+ and state_after_doctype_name () =
13121312+ match Stream.peek t.stream with
13131313+ | Some ('\t' | '\n' | '\x0C' | ' ') ->
13141314+ Stream.advance t.stream
13151315+ | Some '>' ->
13161316+ Stream.advance t.stream;
13171317+ t.state <- State.Data;
13181318+ emit_current_doctype ()
13191319+ | None -> ()
13201320+ | Some _ ->
13211321+ if Stream.matches_ci t.stream "PUBLIC" then begin
13221322+ ignore (Stream.consume_exact_ci t.stream "PUBLIC");
13231323+ t.state <- State.After_doctype_public_keyword
13241324+ end else if Stream.matches_ci t.stream "SYSTEM" then begin
13251325+ ignore (Stream.consume_exact_ci t.stream "SYSTEM");
13261326+ t.state <- State.After_doctype_system_keyword
13271327+ end else begin
13281328+ error t "invalid-character-sequence-after-doctype-name";
13291329+ t.current_doctype_force_quirks <- true;
13301330+ t.state <- State.Bogus_doctype
13311331+ end
13321332+13331333+ and state_after_doctype_public_keyword () =
13341334+ match Stream.peek t.stream with
13351335+ | Some ('\t' | '\n' | '\x0C' | ' ') ->
13361336+ Stream.advance t.stream;
13371337+ t.state <- State.Before_doctype_public_identifier
13381338+ | Some '"' ->
13391339+ Stream.advance t.stream;
13401340+ error t "missing-whitespace-after-doctype-public-keyword";
13411341+ t.current_doctype_public <- Some (Buffer.create 32);
13421342+ t.state <- State.Doctype_public_identifier_double_quoted
13431343+ | Some '\'' ->
13441344+ Stream.advance t.stream;
13451345+ error t "missing-whitespace-after-doctype-public-keyword";
13461346+ t.current_doctype_public <- Some (Buffer.create 32);
13471347+ t.state <- State.Doctype_public_identifier_single_quoted
13481348+ | Some '>' ->
13491349+ Stream.advance t.stream;
13501350+ error t "missing-doctype-public-identifier";
13511351+ t.current_doctype_force_quirks <- true;
13521352+ t.state <- State.Data;
13531353+ emit_current_doctype ()
13541354+ | None -> ()
13551355+ | Some _ ->
13561356+ error t "missing-quote-before-doctype-public-identifier";
13571357+ t.current_doctype_force_quirks <- true;
13581358+ t.state <- State.Bogus_doctype
13591359+13601360+ and state_before_doctype_public_identifier () =
13611361+ match Stream.peek t.stream with
13621362+ | Some ('\t' | '\n' | '\x0C' | ' ') ->
13631363+ Stream.advance t.stream
13641364+ | Some '"' ->
13651365+ Stream.advance t.stream;
13661366+ t.current_doctype_public <- Some (Buffer.create 32);
13671367+ t.state <- State.Doctype_public_identifier_double_quoted
13681368+ | Some '\'' ->
13691369+ Stream.advance t.stream;
13701370+ t.current_doctype_public <- Some (Buffer.create 32);
13711371+ t.state <- State.Doctype_public_identifier_single_quoted
13721372+ | Some '>' ->
13731373+ Stream.advance t.stream;
13741374+ error t "missing-doctype-public-identifier";
13751375+ t.current_doctype_force_quirks <- true;
13761376+ t.state <- State.Data;
13771377+ emit_current_doctype ()
13781378+ | None -> ()
13791379+ | Some _ ->
13801380+ error t "missing-quote-before-doctype-public-identifier";
13811381+ t.current_doctype_force_quirks <- true;
13821382+ t.state <- State.Bogus_doctype
13831383+13841384+ and state_doctype_public_identifier_double_quoted () =
13851385+ match Stream.consume t.stream with
13861386+ | Some '"' ->
13871387+ t.state <- State.After_doctype_public_identifier
13881388+ | Some '\x00' ->
13891389+ error t "unexpected-null-character";
13901390+ Buffer.add_string (Option.get t.current_doctype_public) "\xEF\xBF\xBD"
13911391+ | Some '>' ->
13921392+ error t "abrupt-doctype-public-identifier";
13931393+ t.current_doctype_force_quirks <- true;
13941394+ t.state <- State.Data;
13951395+ emit_current_doctype ()
13961396+ | Some c ->
13971397+ Buffer.add_char (Option.get t.current_doctype_public) c
13981398+ | None -> ()
13991399+14001400+ and state_doctype_public_identifier_single_quoted () =
14011401+ match Stream.consume t.stream with
14021402+ | Some '\'' ->
14031403+ t.state <- State.After_doctype_public_identifier
14041404+ | Some '\x00' ->
14051405+ error t "unexpected-null-character";
14061406+ Buffer.add_string (Option.get t.current_doctype_public) "\xEF\xBF\xBD"
14071407+ | Some '>' ->
14081408+ error t "abrupt-doctype-public-identifier";
14091409+ t.current_doctype_force_quirks <- true;
14101410+ t.state <- State.Data;
14111411+ emit_current_doctype ()
14121412+ | Some c ->
14131413+ Buffer.add_char (Option.get t.current_doctype_public) c
14141414+ | None -> ()
14151415+14161416+ and state_after_doctype_public_identifier () =
14171417+ match Stream.peek t.stream with
14181418+ | Some ('\t' | '\n' | '\x0C' | ' ') ->
14191419+ Stream.advance t.stream;
14201420+ t.state <- State.Between_doctype_public_and_system_identifiers
14211421+ | Some '>' ->
14221422+ Stream.advance t.stream;
14231423+ t.state <- State.Data;
14241424+ emit_current_doctype ()
14251425+ | Some '"' ->
14261426+ Stream.advance t.stream;
14271427+ error t "missing-whitespace-between-doctype-public-and-system-identifiers";
14281428+ t.current_doctype_system <- Some (Buffer.create 32);
14291429+ t.state <- State.Doctype_system_identifier_double_quoted
14301430+ | Some '\'' ->
14311431+ Stream.advance t.stream;
14321432+ error t "missing-whitespace-between-doctype-public-and-system-identifiers";
14331433+ t.current_doctype_system <- Some (Buffer.create 32);
14341434+ t.state <- State.Doctype_system_identifier_single_quoted
14351435+ | None -> ()
14361436+ | Some _ ->
14371437+ error t "missing-quote-before-doctype-system-identifier";
14381438+ t.current_doctype_force_quirks <- true;
14391439+ t.state <- State.Bogus_doctype
14401440+14411441+ and state_between_doctype_public_and_system_identifiers () =
14421442+ match Stream.peek t.stream with
14431443+ | Some ('\t' | '\n' | '\x0C' | ' ') ->
14441444+ Stream.advance t.stream
14451445+ | Some '>' ->
14461446+ Stream.advance t.stream;
14471447+ t.state <- State.Data;
14481448+ emit_current_doctype ()
14491449+ | Some '"' ->
14501450+ Stream.advance t.stream;
14511451+ t.current_doctype_system <- Some (Buffer.create 32);
14521452+ t.state <- State.Doctype_system_identifier_double_quoted
14531453+ | Some '\'' ->
14541454+ Stream.advance t.stream;
14551455+ t.current_doctype_system <- Some (Buffer.create 32);
14561456+ t.state <- State.Doctype_system_identifier_single_quoted
14571457+ | None -> ()
14581458+ | Some _ ->
14591459+ error t "missing-quote-before-doctype-system-identifier";
14601460+ t.current_doctype_force_quirks <- true;
14611461+ t.state <- State.Bogus_doctype
14621462+14631463+ and state_after_doctype_system_keyword () =
14641464+ match Stream.peek t.stream with
14651465+ | Some ('\t' | '\n' | '\x0C' | ' ') ->
14661466+ Stream.advance t.stream;
14671467+ t.state <- State.Before_doctype_system_identifier
14681468+ | Some '"' ->
14691469+ Stream.advance t.stream;
14701470+ error t "missing-whitespace-after-doctype-system-keyword";
14711471+ t.current_doctype_system <- Some (Buffer.create 32);
14721472+ t.state <- State.Doctype_system_identifier_double_quoted
14731473+ | Some '\'' ->
14741474+ Stream.advance t.stream;
14751475+ error t "missing-whitespace-after-doctype-system-keyword";
14761476+ t.current_doctype_system <- Some (Buffer.create 32);
14771477+ t.state <- State.Doctype_system_identifier_single_quoted
14781478+ | Some '>' ->
14791479+ Stream.advance t.stream;
14801480+ error t "missing-doctype-system-identifier";
14811481+ t.current_doctype_force_quirks <- true;
14821482+ t.state <- State.Data;
14831483+ emit_current_doctype ()
14841484+ | None -> ()
14851485+ | Some _ ->
14861486+ error t "missing-quote-before-doctype-system-identifier";
14871487+ t.current_doctype_force_quirks <- true;
14881488+ t.state <- State.Bogus_doctype
14891489+14901490+ and state_before_doctype_system_identifier () =
14911491+ match Stream.peek t.stream with
14921492+ | Some ('\t' | '\n' | '\x0C' | ' ') ->
14931493+ Stream.advance t.stream
14941494+ | Some '"' ->
14951495+ Stream.advance t.stream;
14961496+ t.current_doctype_system <- Some (Buffer.create 32);
14971497+ t.state <- State.Doctype_system_identifier_double_quoted
14981498+ | Some '\'' ->
14991499+ Stream.advance t.stream;
15001500+ t.current_doctype_system <- Some (Buffer.create 32);
15011501+ t.state <- State.Doctype_system_identifier_single_quoted
15021502+ | Some '>' ->
15031503+ Stream.advance t.stream;
15041504+ error t "missing-doctype-system-identifier";
15051505+ t.current_doctype_force_quirks <- true;
15061506+ t.state <- State.Data;
15071507+ emit_current_doctype ()
15081508+ | None -> ()
15091509+ | Some _ ->
15101510+ error t "missing-quote-before-doctype-system-identifier";
15111511+ t.current_doctype_force_quirks <- true;
15121512+ t.state <- State.Bogus_doctype
15131513+15141514+ and state_doctype_system_identifier_double_quoted () =
15151515+ match Stream.consume t.stream with
15161516+ | Some '"' ->
15171517+ t.state <- State.After_doctype_system_identifier
15181518+ | Some '\x00' ->
15191519+ error t "unexpected-null-character";
15201520+ Buffer.add_string (Option.get t.current_doctype_system) "\xEF\xBF\xBD"
15211521+ | Some '>' ->
15221522+ error t "abrupt-doctype-system-identifier";
15231523+ t.current_doctype_force_quirks <- true;
15241524+ t.state <- State.Data;
15251525+ emit_current_doctype ()
15261526+ | Some c ->
15271527+ Buffer.add_char (Option.get t.current_doctype_system) c
15281528+ | None -> ()
15291529+15301530+ and state_doctype_system_identifier_single_quoted () =
15311531+ match Stream.consume t.stream with
15321532+ | Some '\'' ->
15331533+ t.state <- State.After_doctype_system_identifier
15341534+ | Some '\x00' ->
15351535+ error t "unexpected-null-character";
15361536+ Buffer.add_string (Option.get t.current_doctype_system) "\xEF\xBF\xBD"
15371537+ | Some '>' ->
15381538+ error t "abrupt-doctype-system-identifier";
15391539+ t.current_doctype_force_quirks <- true;
15401540+ t.state <- State.Data;
15411541+ emit_current_doctype ()
15421542+ | Some c ->
15431543+ Buffer.add_char (Option.get t.current_doctype_system) c
15441544+ | None -> ()
15451545+15461546+ and state_after_doctype_system_identifier () =
15471547+ match Stream.peek t.stream with
15481548+ | Some ('\t' | '\n' | '\x0C' | ' ') ->
15491549+ Stream.advance t.stream
15501550+ | Some '>' ->
15511551+ Stream.advance t.stream;
15521552+ t.state <- State.Data;
15531553+ emit_current_doctype ()
15541554+ | None -> ()
15551555+ | Some _ ->
15561556+ error t "unexpected-character-after-doctype-system-identifier";
15571557+ t.state <- State.Bogus_doctype
15581558+15591559+ and state_bogus_doctype () =
15601560+ match Stream.consume t.stream with
15611561+ | Some '>' ->
15621562+ t.state <- State.Data;
15631563+ emit_current_doctype ()
15641564+ | Some '\x00' ->
15651565+ error t "unexpected-null-character"
15661566+ | Some _ -> ()
15671567+ | None -> ()
15681568+15691569+ and state_cdata_section () =
15701570+ match Stream.consume t.stream with
15711571+ | Some ']' ->
15721572+ t.state <- State.Cdata_section_bracket
15731573+ | Some '\x00' ->
15741574+ error t "unexpected-null-character";
15751575+ emit_str t "\xEF\xBF\xBD"
15761576+ | Some c ->
15771577+ emit_char t c
15781578+ | None -> ()
15791579+15801580+ and state_cdata_section_bracket () =
15811581+ match Stream.peek t.stream with
15821582+ | Some ']' ->
15831583+ Stream.advance t.stream;
15841584+ t.state <- State.Cdata_section_end
15851585+ | _ ->
15861586+ emit_char t ']';
15871587+ t.state <- State.Cdata_section
15881588+15891589+ and state_cdata_section_end () =
15901590+ match Stream.peek t.stream with
15911591+ | Some ']' ->
15921592+ Stream.advance t.stream;
15931593+ emit_char t ']'
15941594+ | Some '>' ->
15951595+ Stream.advance t.stream;
15961596+ t.state <- State.Data
15971597+ | _ ->
15981598+ emit_str t "]]";
15991599+ t.state <- State.Cdata_section
16001600+16011601+ and state_character_reference () =
16021602+ Buffer.clear t.temp_buffer;
16031603+ Buffer.add_char t.temp_buffer '&';
16041604+ match Stream.peek t.stream with
16051605+ | Some c when is_ascii_alnum c ->
16061606+ t.state <- State.Named_character_reference
16071607+ | Some '#' ->
16081608+ Stream.advance t.stream;
16091609+ Buffer.add_char t.temp_buffer '#';
16101610+ t.state <- State.Numeric_character_reference
16111611+ | _ ->
16121612+ flush_code_points_consumed_as_char_ref t;
16131613+ t.state <- t.return_state
16141614+16151615+ and state_named_character_reference () =
16161616+ (* Collect alphanumeric characters *)
16171617+ let rec collect () =
16181618+ match Stream.peek t.stream with
16191619+ | Some c when is_ascii_alnum c ->
16201620+ Stream.advance t.stream;
16211621+ Buffer.add_char t.temp_buffer c;
16221622+ collect ()
16231623+ | _ -> ()
16241624+ in
16251625+ collect ();
16261626+16271627+ let has_semicolon =
16281628+ match Stream.peek t.stream with
16291629+ | Some ';' -> Stream.advance t.stream; Buffer.add_char t.temp_buffer ';'; true
16301630+ | _ -> false
16311631+ in
16321632+16331633+ (* Try to match entity - buffer contains "&name" or "&name;" *)
16341634+ let buf_contents = Buffer.contents t.temp_buffer in
16351635+ let name_start = 1 in (* Skip '&' *)
16361636+ let name_end = String.length buf_contents - (if has_semicolon then 1 else 0) in
16371637+ let entity_name = String.sub buf_contents name_start (name_end - name_start) in
16381638+16391639+ (* Try progressively shorter matches *)
16401640+ (* Only match if:
16411641+ 1. Full match with semicolon, OR
16421642+ 2. Legacy entity (can be used without semicolon) *)
16431643+ let rec try_match len =
16441644+ if len <= 0 then None
16451645+ else
16461646+ let prefix = String.sub entity_name 0 len in
16471647+ let is_full = len = String.length entity_name in
16481648+ let would_have_semi = has_semicolon && is_full in
16491649+ (* Only use this match if it has semicolon or is a legacy entity *)
16501650+ if would_have_semi || Html5rw_entities.is_legacy prefix then
16511651+ match Html5rw_entities.lookup prefix with
16521652+ | Some decoded -> Some (decoded, len)
16531653+ | None -> try_match (len - 1)
16541654+ else
16551655+ try_match (len - 1)
16561656+ in
16571657+16581658+ match try_match (String.length entity_name) with
16591659+ | Some (decoded, matched_len) ->
16601660+ let full_match = matched_len = String.length entity_name in
16611661+ let ends_with_semi = has_semicolon && full_match in
16621662+16631663+ (* Check attribute context restrictions *)
16641664+ let in_attribute = match t.return_state with
16651665+ | State.Attribute_value_double_quoted
16661666+ | State.Attribute_value_single_quoted
16671667+ | State.Attribute_value_unquoted -> true
16681668+ | _ -> false
16691669+ in
16701670+16711671+ let next_char =
16721672+ if full_match && not has_semicolon then
16731673+ Stream.peek t.stream
16741674+ else if not full_match then
16751675+ Some entity_name.[matched_len]
16761676+ else None
16771677+ in
16781678+16791679+ let blocked = in_attribute && not ends_with_semi &&
16801680+ match next_char with
16811681+ | Some '=' -> true
16821682+ | Some c when is_ascii_alnum c -> true
16831683+ | _ -> false
16841684+ in
16851685+16861686+ if blocked then begin
16871687+ flush_code_points_consumed_as_char_ref t;
16881688+ t.state <- t.return_state
16891689+ end else begin
16901690+ if not ends_with_semi then
16911691+ error t "missing-semicolon-after-character-reference";
16921692+ Buffer.clear t.temp_buffer;
16931693+ Buffer.add_string t.temp_buffer decoded;
16941694+ flush_code_points_consumed_as_char_ref t;
16951695+ (* Emit unconsumed chars after partial match *)
16961696+ if not full_match then begin
16971697+ let unconsumed = String.sub entity_name matched_len (String.length entity_name - matched_len) in
16981698+ emit_str t unconsumed;
16991699+ (* If there was a semicolon in input but we didn't use the full match, emit the semicolon too *)
17001700+ if has_semicolon then
17011701+ emit_char t ';'
17021702+ end;
17031703+ t.state <- t.return_state
17041704+ end
17051705+ | None ->
17061706+ (* No match - check if we should report ambiguous ampersand *)
17071707+ if String.length entity_name > 0 then begin
17081708+ t.state <- State.Ambiguous_ampersand;
17091709+ (* Reset position - we need to emit the ampersand and chars *)
17101710+ flush_code_points_consumed_as_char_ref t
17111711+ end else begin
17121712+ flush_code_points_consumed_as_char_ref t;
17131713+ t.state <- t.return_state
17141714+ end
17151715+17161716+ and state_ambiguous_ampersand () =
17171717+ match Stream.peek t.stream with
17181718+ | Some c when is_ascii_alnum c ->
17191719+ Stream.advance t.stream;
17201720+ (match t.return_state with
17211721+ | State.Attribute_value_double_quoted
17221722+ | State.Attribute_value_single_quoted
17231723+ | State.Attribute_value_unquoted ->
17241724+ Buffer.add_char t.current_attr_value c
17251725+ | _ ->
17261726+ emit_char t c)
17271727+ | Some ';' ->
17281728+ error t "unknown-named-character-reference";
17291729+ t.state <- t.return_state
17301730+ | _ ->
17311731+ t.state <- t.return_state
17321732+17331733+ and state_numeric_character_reference () =
17341734+ t.char_ref_code <- 0;
17351735+ match Stream.peek t.stream with
17361736+ | Some (('x' | 'X') as c) ->
17371737+ Stream.advance t.stream;
17381738+ Buffer.add_char t.temp_buffer c;
17391739+ t.state <- State.Hexadecimal_character_reference_start
17401740+ | _ ->
17411741+ t.state <- State.Decimal_character_reference_start
17421742+17431743+ and state_hexadecimal_character_reference_start () =
17441744+ match Stream.peek t.stream with
17451745+ | Some c when is_ascii_hex c ->
17461746+ t.state <- State.Hexadecimal_character_reference
17471747+ | _ ->
17481748+ error t "absence-of-digits-in-numeric-character-reference";
17491749+ flush_code_points_consumed_as_char_ref t;
17501750+ t.state <- t.return_state
17511751+17521752+ and state_decimal_character_reference_start () =
17531753+ match Stream.peek t.stream with
17541754+ | Some c when is_ascii_digit c ->
17551755+ t.state <- State.Decimal_character_reference
17561756+ | _ ->
17571757+ error t "absence-of-digits-in-numeric-character-reference";
17581758+ flush_code_points_consumed_as_char_ref t;
17591759+ t.state <- t.return_state
17601760+17611761+ and state_hexadecimal_character_reference () =
17621762+ match Stream.peek t.stream with
17631763+ | Some c when is_ascii_digit c ->
17641764+ Stream.advance t.stream;
17651765+ t.char_ref_code <- t.char_ref_code * 16 + (Char.code c - Char.code '0');
17661766+ if t.char_ref_code > 0x10FFFF then t.char_ref_code <- 0x10FFFF + 1
17671767+ | Some c when c >= 'A' && c <= 'F' ->
17681768+ Stream.advance t.stream;
17691769+ t.char_ref_code <- t.char_ref_code * 16 + (Char.code c - Char.code 'A' + 10);
17701770+ if t.char_ref_code > 0x10FFFF then t.char_ref_code <- 0x10FFFF + 1
17711771+ | Some c when c >= 'a' && c <= 'f' ->
17721772+ Stream.advance t.stream;
17731773+ t.char_ref_code <- t.char_ref_code * 16 + (Char.code c - Char.code 'a' + 10);
17741774+ if t.char_ref_code > 0x10FFFF then t.char_ref_code <- 0x10FFFF + 1
17751775+ | Some ';' ->
17761776+ Stream.advance t.stream;
17771777+ t.state <- State.Numeric_character_reference_end
17781778+ | _ ->
17791779+ error t "missing-semicolon-after-character-reference";
17801780+ t.state <- State.Numeric_character_reference_end
17811781+17821782+ and state_decimal_character_reference () =
17831783+ match Stream.peek t.stream with
17841784+ | Some c when is_ascii_digit c ->
17851785+ Stream.advance t.stream;
17861786+ t.char_ref_code <- t.char_ref_code * 10 + (Char.code c - Char.code '0');
17871787+ if t.char_ref_code > 0x10FFFF then t.char_ref_code <- 0x10FFFF + 1
17881788+ | Some ';' ->
17891789+ Stream.advance t.stream;
17901790+ t.state <- State.Numeric_character_reference_end
17911791+ | _ ->
17921792+ error t "missing-semicolon-after-character-reference";
17931793+ t.state <- State.Numeric_character_reference_end
17941794+17951795+ and state_numeric_character_reference_end () =
17961796+ let code = t.char_ref_code in
17971797+ let replacement_char = "\xEF\xBF\xBD" in
17981798+17991799+ let result =
18001800+ if code = 0 then begin
18011801+ error t "null-character-reference";
18021802+ replacement_char
18031803+ end else if code > 0x10FFFF then begin
18041804+ error t "character-reference-outside-unicode-range";
18051805+ replacement_char
18061806+ end else if code >= 0xD800 && code <= 0xDFFF then begin
18071807+ error t "surrogate-character-reference";
18081808+ replacement_char
18091809+ end else if (code >= 0xFDD0 && code <= 0xFDEF) ||
18101810+ List.mem code [0xFFFE; 0xFFFF; 0x1FFFE; 0x1FFFF; 0x2FFFE; 0x2FFFF;
18111811+ 0x3FFFE; 0x3FFFF; 0x4FFFE; 0x4FFFF; 0x5FFFE; 0x5FFFF;
18121812+ 0x6FFFE; 0x6FFFF; 0x7FFFE; 0x7FFFF; 0x8FFFE; 0x8FFFF;
18131813+ 0x9FFFE; 0x9FFFF; 0xAFFFE; 0xAFFFF; 0xBFFFE; 0xBFFFF;
18141814+ 0xCFFFE; 0xCFFFF; 0xDFFFE; 0xDFFFF; 0xEFFFE; 0xEFFFF;
18151815+ 0xFFFFE; 0xFFFFF; 0x10FFFE; 0x10FFFF] then begin
18161816+ error t "noncharacter-character-reference";
18171817+ Html5rw_entities.Numeric_ref.codepoint_to_utf8 code
18181818+ end else if (code >= 0x01 && code <= 0x08) || code = 0x0B ||
18191819+ (code >= 0x0D && code <= 0x1F) ||
18201820+ (code >= 0x7F && code <= 0x9F) then begin
18211821+ error t "control-character-reference";
18221822+ (* Apply Windows-1252 replacement table for 0x80-0x9F *)
18231823+ match Html5rw_entities.Numeric_ref.find_replacement code with
18241824+ | Some replacement -> Html5rw_entities.Numeric_ref.codepoint_to_utf8 replacement
18251825+ | None -> Html5rw_entities.Numeric_ref.codepoint_to_utf8 code
18261826+ end else
18271827+ Html5rw_entities.Numeric_ref.codepoint_to_utf8 code
18281828+ in
18291829+18301830+ Buffer.clear t.temp_buffer;
18311831+ Buffer.add_string t.temp_buffer result;
18321832+ flush_code_points_consumed_as_char_ref t;
18331833+ t.state <- t.return_state
18341834+18351835+ in
18361836+ process_state ()
18371837+18381838+let get_errors t = List.rev t.errors
18391839+18401840+let set_state t state = t.state <- state
18411841+18421842+let set_last_start_tag t name = t.last_start_tag <- name
···11+open Bytesrw
22+33+module Parser = Html5rw_parser
44+module Dom = Html5rw_dom
55+66+let () =
77+ print_endline "=== Test: &AMp; ===";
88+ let input = "&AMp;" in
99+ print_endline ("Input: " ^ input);
1010+ let result = Parser.parse ~collect_errors:true (Bytes.Reader.of_string input) in
1111+ print_endline "Result:";
1212+ print_endline (Dom.to_test_format (Parser.root result));
1313+ print_endline "";
1414+1515+ print_endline "=== Test: & ===";
1616+ let input = "&" in
1717+ print_endline ("Input: " ^ input);
1818+ let result = Parser.parse ~collect_errors:true (Bytes.Reader.of_string input) in
1919+ print_endline "Result:";
2020+ print_endline (Dom.to_test_format (Parser.root result))
+28
test/entity_test.ml
···11+open Bytesrw
22+33+module Parser = Html5rw_parser
44+module Dom = Html5rw_dom
55+66+let () =
77+ print_endline "=== Test 1: Single & ===";
88+ let input = "&" in
99+ print_endline ("Input: " ^ input);
1010+ let result = Parser.parse ~collect_errors:true (Bytes.Reader.of_string input) in
1111+ print_endline "Result:";
1212+ print_endline (Dom.to_test_format (Parser.root result));
1313+ print_endline "";
1414+1515+ print_endline "=== Test 2: - (decimal ref) ===";
1616+ let input = "-" in
1717+ print_endline ("Input: " ^ input);
1818+ let result = Parser.parse ~collect_errors:true (Bytes.Reader.of_string input) in
1919+ print_endline "Result:";
2020+ print_endline (Dom.to_test_format (Parser.root result));
2121+ print_endline "";
2222+2323+ print_endline "=== Test 3: &#X (hex ref incomplete) ===";
2424+ let input = "&#X" in
2525+ print_endline ("Input: " ^ input);
2626+ let result = Parser.parse ~collect_errors:true (Bytes.Reader.of_string input) in
2727+ print_endline "Result:";
2828+ print_endline (Dom.to_test_format (Parser.root result))
+39
test/frag_debug.ml
···11+open Bytesrw
22+33+module Parser = Html5rw_parser
44+55+let () =
66+ (* Test 77 - template with adoption agency *)
77+ print_endline "=== Template test 77 ===";
88+ print_endline "Input: <body><template><i><menu>Foo</i>";
99+ let result = Html5rw_parser.parse (Bytes.Reader.of_string "<body><template><i><menu>Foo</i>") in
1010+ print_endline "Actual:";
1111+ print_endline (Html5rw_dom.to_test_format (Parser.root result));
1212+ print_newline ();
1313+1414+ (* Simpler test - just template with content *)
1515+ print_endline "=== Simpler template test ===";
1616+ print_endline "Input: <template><i>X</i></template>";
1717+ let result = Html5rw_parser.parse (Bytes.Reader.of_string "<template><i>X</i></template>") in
1818+ print_endline (Html5rw_dom.to_test_format (Parser.root result));
1919+ print_newline ();
2020+2121+ (* Test without template *)
2222+ print_endline "=== Without template ===";
2323+ print_endline "Input: <i><menu>Foo</i>";
2424+ let result = Html5rw_parser.parse (Bytes.Reader.of_string "<i><menu>Foo</i>") in
2525+ print_endline (Html5rw_dom.to_test_format (Parser.root result));
2626+ print_newline ();
2727+2828+ (* Test 31 - foreignObject/math *)
2929+ print_endline "=== Test 31 - foreignObject ===";
3030+ print_endline "Input: <div><svg><path><foreignObject><math></div>a";
3131+ let result = Html5rw_parser.parse (Bytes.Reader.of_string "<div><svg><path><foreignObject><math></div>a") in
3232+ print_endline (Html5rw_dom.to_test_format (Parser.root result));
3333+ print_newline ();
3434+3535+ (* namespace-sensitivity test *)
3636+ print_endline "=== Namespace sensitivity ===";
3737+ print_endline "Input: <body><table><tr><td><svg><td><foreignObject><span></td>Foo";
3838+ let result = Html5rw_parser.parse (Bytes.Reader.of_string "<body><table><tr><td><svg><td><foreignObject><span></td>Foo") in
3939+ print_endline (Html5rw_dom.to_test_format (Parser.root result))
+40
test/frag_debug2.ml
···11+open Bytesrw
22+33+module Parser = Html5rw_parser
44+55+let () =
66+ (* Test: svg end tag handling *)
77+ print_endline "=== Test: <div><svg></div> ===";
88+ let result = Html5rw_parser.parse (Bytes.Reader.of_string "<div><svg></div>") in
99+ print_endline (Html5rw_dom.to_test_format (Parser.root result));
1010+ print_newline ();
1111+1212+ (* Test: foreignObject text integration *)
1313+ print_endline "=== Test: <div><svg><foreignObject></div> ===";
1414+ let result = Html5rw_parser.parse (Bytes.Reader.of_string "<div><svg><foreignObject></div>") in
1515+ print_endline (Html5rw_dom.to_test_format (Parser.root result));
1616+ print_newline ();
1717+1818+ (* Test: math inside foreignObject with end tag *)
1919+ print_endline "=== Test: <div><svg><foreignObject><math></div>a ===";
2020+ let result = Html5rw_parser.parse (Bytes.Reader.of_string "<div><svg><foreignObject><math></div>a") in
2121+ print_endline (Html5rw_dom.to_test_format (Parser.root result));
2222+ print_newline ();
2323+2424+ (* Without path element *)
2525+ print_endline "=== Test: <div><svg><foreignObject><b></div>text ===";
2626+ let result = Html5rw_parser.parse (Bytes.Reader.of_string "<div><svg><foreignObject><b></div>text") in
2727+ print_endline (Html5rw_dom.to_test_format (Parser.root result));
2828+ print_newline ();
2929+3030+ (* Template adoption agency test *)
3131+ print_endline "=== Test: <template><b><menu>text</b> ===";
3232+ let result = Html5rw_parser.parse (Bytes.Reader.of_string "<template><b><menu>text</b>") in
3333+ print_endline (Html5rw_dom.to_test_format (Parser.root result));
3434+ print_newline ();
3535+3636+ (* Without template for comparison *)
3737+ print_endline "=== Test: <b><menu>text</b> (no template) ===";
3838+ let result = Html5rw_parser.parse (Bytes.Reader.of_string "<b><menu>text</b>") in
3939+ print_endline (Html5rw_dom.to_test_format (Parser.root result));
4040+ print_newline ()
+34
test/frag_debug3.ml
···11+open Bytesrw
22+33+module Parser = Html5rw_parser
44+55+let () =
66+ (* Simple svg with child *)
77+ print_endline "=== Test: <svg><path></path></svg>text ===";
88+ let result = Html5rw_parser.parse (Bytes.Reader.of_string "<svg><path></path></svg>text") in
99+ print_endline (Html5rw_dom.to_test_format (Parser.root result));
1010+ print_newline ();
1111+1212+ (* The failing test - foreignObject inside svg *)
1313+ print_endline "=== Test: <div><svg><path><foreignObject><math></div>a ===";
1414+ let result = Html5rw_parser.parse (Bytes.Reader.of_string "<div><svg><path><foreignObject><math></div>a") in
1515+ print_endline (Html5rw_dom.to_test_format (Parser.root result));
1616+ print_newline ();
1717+1818+ (* Expected output for test 31:
1919+ <html>
2020+ <head>
2121+ <body>
2222+ <div>
2323+ <svg svg>
2424+ <svg path>
2525+ <svg foreignObject>
2626+ <math math>
2727+ "a"
2828+ *)
2929+3030+ (* Simple svg structure *)
3131+ print_endline "=== Test: <svg><rect/><circle/></svg> ===";
3232+ let result = Html5rw_parser.parse (Bytes.Reader.of_string "<svg><rect/><circle/></svg>") in
3333+ print_endline (Html5rw_dom.to_test_format (Parser.root result));
3434+ print_newline ()
+10
test/html_frag_test.ml
···11+open Bytesrw
22+33+module Parser = Html5rw_parser
44+module Dom = Html5rw_dom
55+66+let () =
77+ let input = "Hello" in
88+ let context = Parser.make_fragment_context ~tag_name:"div" () in
99+ let result = Parser.parse ~collect_errors:true ~fragment_context:context (Bytes.Reader.of_string input) in
1010+ print_endline (Dom.to_test_format (Parser.root result))
+22
test/nobr_debug.ml
···11+open Bytesrw
22+33+module Parser = Html5rw_parser
44+module Dom = Html5rw_dom
55+66+let rec print_tree indent node =
77+ Printf.printf "%s%s (ns=%s, %d children)\n"
88+ indent
99+ node.Dom.name
1010+ (match node.Dom.namespace with Some s -> s | None -> "html")
1111+ (List.length node.Dom.children);
1212+ List.iter (print_tree (indent ^ " ")) node.Dom.children
1313+1414+let () =
1515+ let input = "<nobr>X" in
1616+ print_endline "Starting...";
1717+ let context = Parser.make_fragment_context ~tag_name:"path" ~namespace:(Some "svg") () in
1818+ let result = Parser.parse ~collect_errors:true ~fragment_context:context (Bytes.Reader.of_string input) in
1919+ print_endline "\nFinal tree structure:";
2020+ print_tree "" (Parser.root result);
2121+ print_endline "\nTest format:";
2222+ print_endline (Dom.to_test_format (Parser.root result))
+23
test/nobr_debug2.ml
···11+module Parser = Html5rw_parser
22+module Dom = Html5rw_dom
33+44+let rec print_tree indent node =
55+ Printf.printf "%s%s (ns=%s, %d children)\n"
66+ indent
77+ node.Dom.name
88+ (match node.Dom.namespace with Some s -> s | None -> "html")
99+ (List.length node.Dom.children);
1010+ List.iter (print_tree (indent ^ " ")) node.Dom.children
1111+1212+let () =
1313+ let input = "<nobr>X" in
1414+ print_endline "Starting...";
1515+ let context = { Parser.Tree_builder.tag_name = "path"; namespace = Some "svg" } in
1616+1717+ (* Create parser state directly for inspection *)
1818+ let t = Parser.Tree_builder.create ~collect_errors:true ~fragment_context:context input in
1919+ print_endline "\nInitial tree structure:";
2020+ print_tree "" t.Parser.Tree_builder.document;
2121+ print_endline "\nInitial stack size:";
2222+ Printf.printf "%d elements\n" (List.length t.Parser.Tree_builder.open_elements);
2323+ List.iter (fun n -> Printf.printf " - %s\n" n.Dom.name) t.Parser.Tree_builder.open_elements
+13
test/nobr_test.ml
···11+open Bytesrw
22+33+module Parser = Html5rw_parser
44+module Dom = Html5rw_dom
55+66+let () =
77+ let input = "<nobr>X" in
88+ print_endline "Starting...";
99+ let context = Parser.make_fragment_context ~tag_name:"path" ~namespace:(Some "svg") () in
1010+ print_endline "Created context";
1111+ let result = Parser.parse ~collect_errors:true ~fragment_context:context (Bytes.Reader.of_string input) in
1212+ print_endline "Parsed";
1313+ print_endline (Dom.to_test_format (Parser.root result))
+35
test/ns_sens_test.ml
···11+open Bytesrw
22+33+module Parser = Html5rw_parser
44+55+let () =
66+ print_endline "=== Test: <body><table><tr><td><svg><td><foreignObject><span></td>Foo ===";
77+ let result = Html5rw_parser.parse (Bytes.Reader.of_string "<body><table><tr><td><svg><td><foreignObject><span></td>Foo") in
88+ print_endline (Html5rw_dom.to_test_format (Parser.root result));
99+ print_newline ();
1010+1111+ (* Expected:
1212+ <html>
1313+ <head>
1414+ <body>
1515+ "Foo"
1616+ <table>
1717+ <tbody>
1818+ <tr>
1919+ <td>
2020+ <svg svg>
2121+ <svg td>
2222+ <svg foreignObject>
2323+ <span>
2424+ *)
2525+2626+ (* Let's also test simpler case *)
2727+ print_endline "=== Test: <table><td><svg><foreignObject></td>text ===";
2828+ let result = Html5rw_parser.parse (Bytes.Reader.of_string "<table><td><svg><foreignObject></td>text") in
2929+ print_endline (Html5rw_dom.to_test_format (Parser.root result));
3030+ print_newline ();
3131+3232+ print_endline "=== Test: <table><td></td>text ===";
3333+ let result = Html5rw_parser.parse (Bytes.Reader.of_string "<table><td></td>text") in
3434+ print_endline (Html5rw_dom.to_test_format (Parser.root result));
3535+ print_newline ()
+10
test/quick_test.ml
···11+open Bytesrw
22+33+module Parser = Html5rw_parser
44+module Dom = Html5rw_dom
55+66+let () =
77+ let input = "<nobr>X" in
88+ let context = Parser.make_fragment_context ~tag_name:"path" ~namespace:(Some "svg") () in
99+ let result = Parser.parse ~collect_errors:true ~fragment_context:context (Bytes.Reader.of_string input) in
1010+ print_endline (Dom.to_test_format (Parser.root result))
+22
test/script_attr_test.ml
···11+open Bytesrw
22+33+module Parser = Html5rw_parser
44+module Dom = Html5rw_dom
55+66+let () =
77+ (* Test incomplete script tag with attribute *)
88+ let input = "<!doctypehtml><scrIPt type=text/x-foobar;baz>X</SCRipt" in
99+ print_endline "=== Test: script tag with attribute at incomplete end ===";
1010+ print_endline ("Input: " ^ input);
1111+ let result = Parser.parse ~collect_errors:true (Bytes.Reader.of_string input) in
1212+ print_endline "Result:";
1313+ print_endline (Dom.to_test_format (Parser.root result));
1414+ print_endline "";
1515+1616+ (* Test simpler case *)
1717+ let input = "<script type=text>X</script>" in
1818+ print_endline "=== Test: Complete script tag with attribute ===";
1919+ print_endline ("Input: " ^ input);
2020+ let result = Parser.parse ~collect_errors:true (Bytes.Reader.of_string input) in
2121+ print_endline "Result:";
2222+ print_endline (Dom.to_test_format (Parser.root result))
+12
test/script_eof_test.ml
···11+open Bytesrw
22+33+module Parser = Html5rw_parser
44+module Dom = Html5rw_dom
55+66+let () =
77+ (* Test incomplete script tag *)
88+ let input = "<!doctype html><script><" in
99+ print_endline ("Input: " ^ input);
1010+ let result = Parser.parse ~collect_errors:true (Bytes.Reader.of_string input) in
1111+ print_endline "Result:";
1212+ print_endline (Dom.to_test_format (Parser.root result))
+13
test/select_debug.ml
···11+open Bytesrw
22+33+module Parser = Html5rw_parser
44+module Dom = Html5rw_dom
55+66+let () =
77+ let input = "<select><b><option><select><option></b></select>X" in
88+ print_endline "Input:";
99+ print_endline input;
1010+ print_endline "";
1111+ let result = Parser.parse ~collect_errors:true (Bytes.Reader.of_string input) in
1212+ print_endline "Result:";
1313+ print_endline (Dom.to_test_format (Parser.root result))
+9
test/simple_test.ml
···11+open Bytesrw
22+33+module Parser = Html5rw_parser
44+module Dom = Html5rw_dom
55+66+let () =
77+ let input = "<p>Hello</p>" in
88+ let result = Parser.parse ~collect_errors:true (Bytes.Reader.of_string input) in
99+ print_endline (Dom.to_test_format (Parser.root result))
+13
test/svg_frag_test.ml
···11+open Bytesrw
22+33+module Parser = Html5rw_parser
44+module Dom = Html5rw_dom
55+66+let () =
77+ let input = "Hello" in
88+ print_endline "Starting...";
99+ let context = Parser.make_fragment_context ~tag_name:"path" ~namespace:(Some "svg") () in
1010+ print_endline "Created context";
1111+ let result = Parser.parse ~collect_errors:true ~fragment_context:context (Bytes.Reader.of_string input) in
1212+ print_endline "Parsed";
1313+ print_endline (Dom.to_test_format (Parser.root result))
+21
test/template_debug.ml
···11+open Bytesrw
22+33+module Parser = Html5rw_parser
44+module Dom = Html5rw_dom
55+66+let () =
77+ (* Template test 45: div inside tr inside template *)
88+ let input1 = "<body><template><tr><div></div></tr></template>" in
99+ print_endline "=== Test 1 ===";
1010+ print_endline ("Input: " ^ input1);
1111+ let result1 = Parser.parse ~collect_errors:true (Bytes.Reader.of_string input1) in
1212+ print_endline "Result:";
1313+ print_endline (Dom.to_test_format (Parser.root result1));
1414+1515+ (* Template test 91: select inside tbody inside nested template *)
1616+ let input2 = "<template><template><tbody><select>" in
1717+ print_endline "\n=== Test 2 ===";
1818+ print_endline ("Input: " ^ input2);
1919+ let result2 = Parser.parse ~collect_errors:true (Bytes.Reader.of_string input2) in
2020+ print_endline "Result:";
2121+ print_endline (Dom.to_test_format (Parser.root result2))
+13
test/template_debug2.ml
···11+open Bytesrw
22+33+module Parser = Html5rw_parser
44+module Dom = Html5rw_dom
55+66+let () =
77+ (* Test i then menu in template *)
88+ let input = "<template><i><menu>Foo" in
99+ print_endline "=== Test: i then menu in template ===";
1010+ print_endline ("Input: " ^ input);
1111+ let result = Parser.parse ~collect_errors:true (Bytes.Reader.of_string input) in
1212+ print_endline "Result:";
1313+ print_endline (Dom.to_test_format (Parser.root result))
···11+open Bytesrw
22+33+module Parser = Html5rw_parser
44+55+let () =
66+ print_endline "=== Test: <template><svg><foo><template><foreignObject><div></template><div> ===";
77+ let result = Html5rw_parser.parse (Bytes.Reader.of_string "<template><svg><foo><template><foreignObject><div></template><div>") in
88+ print_endline (Html5rw_dom.to_test_format (Parser.root result));
99+ print_newline ();
1010+1111+ (* Expected:
1212+ <html>
1313+ <head>
1414+ <template>
1515+ content
1616+ <svg svg>
1717+ <svg foo>
1818+ <svg template>
1919+ <svg foreignObject>
2020+ <div>
2121+ <body>
2222+ <div>
2323+ *)
2424+2525+ (* Let's also test what happens with just the SVG template *)
2626+ print_endline "=== Test: <svg><template><foreignObject><div></template>text ===";
2727+ let result = Html5rw_parser.parse (Bytes.Reader.of_string "<svg><template><foreignObject><div></template>text") in
2828+ print_endline (Html5rw_dom.to_test_format (Parser.root result));
2929+ print_newline ()
+14
test/test_debug.ml
···11+open Bytesrw
22+33+module Parser = Html5rw_parser
44+module Dom = Html5rw_dom
55+66+let test input =
77+ print_endline ("Input: " ^ input);
88+ let result = Parser.parse ~collect_errors:true (Bytes.Reader.of_string input) in
99+ print_endline (Dom.to_test_format (Parser.root result));
1010+ print_endline ""
1111+1212+let () =
1313+ (* Frameset tests - exact test input *)
1414+ test "<frameset></frameset>\nfoo"
+15
test/test_frameset.ml
···11+open Bytesrw
22+33+module Parser = Html5rw_parser
44+module Dom = Html5rw_dom
55+66+let () =
77+ let input = "<param><frameset></frameset>" in
88+ print_endline ("Input: " ^ input);
99+ try
1010+ let result = Parser.parse ~collect_errors:true (Bytes.Reader.of_string input) in
1111+ print_endline "Tree:";
1212+ print_endline (Dom.to_test_format (Parser.root result))
1313+ with e ->
1414+ print_endline ("Exception: " ^ Printexc.to_string e);
1515+ Printexc.print_backtrace stdout
+201
test/test_html5lib.ml
···11+(* Test runner for html5lib-tests tree construction tests *)
22+33+open Bytesrw
44+55+module Parser = Html5rw_parser
66+module Dom = Html5rw_dom
77+88+type test_case = {
99+ input : string;
1010+ expected_tree : string;
1111+ expected_errors : string list;
1212+ script_on : bool;
1313+ fragment_context : string option;
1414+}
1515+1616+let _is_blank s = String.trim s = ""
1717+1818+(* Parse a single test case from lines *)
1919+let parse_test_case lines =
2020+ let rec parse acc = function
2121+ | [] -> acc
2222+ | line :: rest when String.length line > 0 && line.[0] = '#' ->
2323+ let section = String.trim line in
2424+ let content, remaining = collect_section rest in
2525+ parse ((section, content) :: acc) remaining
2626+ | _ :: rest -> parse acc rest
2727+ and collect_section lines =
2828+ let rec loop acc = function
2929+ | [] -> (List.rev acc, [])
3030+ | line :: rest when String.length line > 0 && line.[0] = '#' ->
3131+ (List.rev acc, line :: rest)
3232+ | line :: rest -> loop (line :: acc) rest
3333+ in
3434+ loop [] lines
3535+ in
3636+ let sections = parse [] lines in
3737+3838+ let get_section name =
3939+ match List.assoc_opt name sections with
4040+ | Some lines -> String.concat "\n" lines
4141+ | None -> ""
4242+ in
4343+4444+ let data = get_section "#data" in
4545+ let document = get_section "#document" in
4646+ let errors_text = get_section "#errors" in
4747+ let errors =
4848+ String.split_on_char '\n' errors_text
4949+ |> List.filter (fun s -> String.trim s <> "")
5050+ in
5151+ let script_on = List.mem_assoc "#script-on" sections in
5252+ let fragment =
5353+ if List.mem_assoc "#document-fragment" sections then
5454+ Some (get_section "#document-fragment" |> String.trim)
5555+ else None
5656+ in
5757+5858+ {
5959+ input = data;
6060+ expected_tree = document;
6161+ expected_errors = errors;
6262+ script_on;
6363+ fragment_context = fragment;
6464+ }
6565+6666+(* Parse a .dat file into test cases *)
6767+let parse_dat_file content =
6868+ let lines = String.split_on_char '\n' content in
6969+ (* Split on empty lines followed by #data *)
7070+ let rec split_tests current acc = function
7171+ | [] ->
7272+ if current = [] then List.rev acc
7373+ else List.rev (List.rev current :: acc)
7474+ | "" :: "#data" :: rest ->
7575+ (* End of current test, start new one *)
7676+ let new_acc = if current = [] then acc else (List.rev current :: acc) in
7777+ split_tests ["#data"] new_acc rest
7878+ | line :: rest ->
7979+ split_tests (line :: current) acc rest
8080+ in
8181+ let test_groups = split_tests [] [] lines in
8282+ List.filter_map (fun lines ->
8383+ if List.exists (fun l -> l = "#data") lines then
8484+ Some (parse_test_case lines)
8585+ else None
8686+ ) test_groups
8787+8888+(* Strip "| " prefix from each line *)
8989+let strip_tree_prefix s =
9090+ let lines = String.split_on_char '\n' s in
9191+ let stripped = List.filter_map (fun line ->
9292+ if String.length line >= 2 && String.sub line 0 2 = "| " then
9393+ Some (String.sub line 2 (String.length line - 2))
9494+ else if String.trim line = "" then None
9595+ else Some line
9696+ ) lines in
9797+ String.concat "\n" stripped
9898+9999+(* Normalize tree output for comparison *)
100100+let normalize_tree s =
101101+ let lines = String.split_on_char '\n' s in
102102+ let non_empty = List.filter (fun l -> String.trim l <> "") lines in
103103+ String.concat "\n" non_empty
104104+105105+let run_test test =
106106+ try
107107+ let result =
108108+ match test.fragment_context with
109109+ | Some ctx_str ->
110110+ (* Parse "namespace element" or just "element" *)
111111+ let (namespace, tag_name) =
112112+ match String.split_on_char ' ' ctx_str with
113113+ | [ns; tag] when ns = "svg" -> (Some "svg", tag)
114114+ | [ns; tag] when ns = "math" -> (Some "mathml", tag)
115115+ | [tag] -> (None, tag)
116116+ | _ -> (None, ctx_str)
117117+ in
118118+ let context = Parser.make_fragment_context ~tag_name ~namespace () in
119119+ let reader = Bytes.Reader.of_string test.input in
120120+ Parser.parse ~collect_errors:true ~fragment_context:context reader
121121+ | None ->
122122+ let reader = Bytes.Reader.of_string test.input in
123123+ Parser.parse ~collect_errors:true reader
124124+ in
125125+ let actual_tree = Dom.to_test_format (Parser.root result) in
126126+ let expected = normalize_tree (strip_tree_prefix test.expected_tree) in
127127+ let actual = normalize_tree (strip_tree_prefix actual_tree) in
128128+ (expected = actual, expected, actual, List.length (Parser.errors result), List.length test.expected_errors)
129129+ with e ->
130130+ let expected = normalize_tree (strip_tree_prefix test.expected_tree) in
131131+ (false, expected, Printf.sprintf "EXCEPTION: %s" (Printexc.to_string e), 0, 0)
132132+133133+let run_file path =
134134+ let ic = open_in path in
135135+ let content = really_input_string ic (in_channel_length ic) in
136136+ close_in ic;
137137+138138+ let tests = parse_dat_file content in
139139+ let filename = Filename.basename path in
140140+141141+ let passed = ref 0 in
142142+ let failed = ref 0 in
143143+ let errors = ref [] in
144144+145145+ List.iteri (fun i test ->
146146+ (* Skip script-on tests since we don't support scripting *)
147147+ if test.script_on then
148148+ () (* Skip this test *)
149149+ else begin
150150+ let (success, expected, actual, _actual_error_count, _expected_error_count) = run_test test in
151151+ if success then
152152+ incr passed
153153+ else begin
154154+ incr failed;
155155+ errors := (i + 1, test.input, expected, actual) :: !errors
156156+ end
157157+ end
158158+ ) tests;
159159+160160+ (!passed, !failed, List.rev !errors, filename)
161161+162162+let () =
163163+ let test_dir = Sys.argv.(1) in
164164+ let files = Sys.readdir test_dir |> Array.to_list in
165165+ let dat_files = List.filter (fun f ->
166166+ Filename.check_suffix f ".dat" &&
167167+ not (String.contains f '/') (* Skip subdirectories *)
168168+ ) files in
169169+170170+ let total_passed = ref 0 in
171171+ let total_failed = ref 0 in
172172+ let all_errors = ref [] in
173173+174174+ List.iter (fun file ->
175175+ let path = Filename.concat test_dir file in
176176+ if Sys.is_directory path then () else begin
177177+ let (passed, failed, errors, filename) = run_file path in
178178+ total_passed := !total_passed + passed;
179179+ total_failed := !total_failed + failed;
180180+ if errors <> [] then
181181+ all_errors := (filename, errors) :: !all_errors;
182182+ Printf.printf "%s: %d passed, %d failed\n" filename passed failed
183183+ end
184184+ ) (List.sort String.compare dat_files);
185185+186186+ Printf.printf "\n=== Summary ===\n";
187187+ Printf.printf "Total: %d passed, %d failed\n" !total_passed !total_failed;
188188+189189+ if !all_errors <> [] then begin
190190+ Printf.printf "\n=== First failures ===\n";
191191+ List.iter (fun (filename, errors) ->
192192+ List.iter (fun (test_num, input, expected, actual) ->
193193+ Printf.printf "\n--- %s test %d ---\n" filename test_num;
194194+ Printf.printf "Input: %s\n" (String.escaped input);
195195+ Printf.printf "Expected:\n%s\n" expected;
196196+ Printf.printf "Actual:\n%s\n" actual
197197+ ) (List.filteri (fun i _ -> i < 3) errors)
198198+ ) (List.filteri (fun i _ -> i < 10) !all_errors)
199199+ end;
200200+201201+ exit (if !total_failed > 0 then 1 else 0)
+11
test/test_mi.ml
···11+open Bytesrw
22+33+module Parser = Html5rw_parser
44+module Dom = Html5rw_dom
55+66+let () =
77+ let input = "<!doctype html><p><math><mi><p><h1>" in
88+ print_endline ("Input: " ^ input);
99+ let result = Parser.parse ~collect_errors:true (Bytes.Reader.of_string input) in
1010+ print_endline "Tree:";
1111+ print_endline (Dom.to_test_format (Parser.root result))
+9
test/test_table.ml
···11+open Bytesrw
22+33+module Parser = Html5rw_parser
44+module Dom = Html5rw_dom
55+let () =
66+ let input = "<b><em><foo><foo><aside></b>" in
77+ print_endline ("Input: " ^ input);
88+ let result = Parser.parse ~collect_errors:true (Bytes.Reader.of_string input) in
99+ print_endline (Dom.to_test_format (Parser.root result))
+11
test/test_table_svg.ml
···11+open Bytesrw
22+33+module Parser = Html5rw_parser
44+module Dom = Html5rw_dom
55+66+let () =
77+ let input = "<table><tr><td><svg><desc><td></desc><circle>" in
88+ print_endline ("Input: " ^ input);
99+ let result = Parser.parse ~collect_errors:true (Bytes.Reader.of_string input) in
1010+ print_endline "Tree:";
1111+ print_endline (Dom.to_test_format (Parser.root result))
+11
test/test_whitespace.ml
···11+open Bytesrw
22+33+module Parser = Html5rw_parser
44+module Dom = Html5rw_dom
55+66+let () =
77+ let input = "<style> <!-- </style> --> </style>x" in
88+ print_endline ("Input: " ^ input);
99+ let result = Parser.parse ~collect_errors:true (Bytes.Reader.of_string input) in
1010+ print_endline "Tree:";
1111+ print_endline (Dom.to_test_format (Parser.root result))