···66(* HTML5 DOM serialization *)
7788open Bytesrw
99-open Node
99+open Dom_node
10101111(* Void elements that don't have end tags *)
1212let void_elements = [
···66 Bytes.get data 0 = '\xEF' &&
77 Bytes.get data 1 = '\xBB' &&
88 Bytes.get data 2 = '\xBF' then
99- Some (Encoding.Utf8, 3)
99+ Some (Encoding_types.Utf8, 3)
1010 else if len >= 2 &&
1111 Bytes.get data 0 = '\xFF' &&
1212 Bytes.get data 1 = '\xFE' then
1313- Some (Encoding.Utf16le, 2)
1313+ Some (Encoding_types.Utf16le, 2)
1414 else if len >= 2 &&
1515 Bytes.get data 0 = '\xFE' &&
1616 Bytes.get data 1 = '\xFF' then
1717- Some (Encoding.Utf16be, 2)
1717+ Some (Encoding_types.Utf16be, 2)
1818 else
1919 None
···51515252let decode_with_encoding data enc ~bom_len =
5353 match enc with
5454- | Encoding.Utf8 ->
5454+ | Encoding_types.Utf8 ->
5555 (* UTF-8: Just validate and replace errors with replacement character *)
5656 let len = Bytes.length data in
5757 let buf = Buffer.create len in
···7474 loop ();
7575 Buffer.contents buf
76767777- | Encoding.Utf16le -> decode_utf16 data ~is_le:true ~bom_len
7878- | Encoding.Utf16be -> decode_utf16 data ~is_le:false ~bom_len
7777+ | Encoding_types.Utf16le -> decode_utf16 data ~is_le:true ~bom_len
7878+ | Encoding_types.Utf16be -> decode_utf16 data ~is_le:false ~bom_len
79798080- | Encoding.Windows_1252 ->
8080+ | Encoding_types.Windows_1252 ->
8181 (* Windows-1252 mapping table for 0x80-0x9F range *)
8282 let len = Bytes.length data in
8383 let buf = Buffer.create len in
···9898 done;
9999 Buffer.contents buf
100100101101- | Encoding.Iso_8859_2 ->
101101+ | Encoding_types.Iso_8859_2 ->
102102	 (* Use uutf for ISO-8859-2 decoding *)
103103 let len = Bytes.length data in
104104 let buf = Buffer.create len in
···109109 ) () s;
110110 Buffer.contents buf
111111112112- | Encoding.Euc_jp ->
112112+ | Encoding_types.Euc_jp ->
113113 (* For EUC-JP, use uutf with best effort *)
114114 let len = Bytes.length data in
115115 let buf = Buffer.create len in
···126126127127let decode data ?transport_encoding () =
128128 (* Step 1: Check for BOM *)
129129- let bom_result = Bom.sniff data in
129129+ let bom_result = Encoding_bom.sniff data in
130130 match bom_result with
131131 | Some (enc, bom_len) ->
132132 (decode_with_encoding data enc ~bom_len, enc)
···134134 (* Step 2: Check transport encoding (e.g., HTTP Content-Type) *)
135135 let enc_from_transport =
136136 match transport_encoding with
137137- | Some te -> Labels.normalize_label te
137137+ | Some te -> Encoding_labels.normalize_label te
138138 | None -> None
139139 in
140140 match enc_from_transport with
141141 | Some enc -> (decode_with_encoding data enc ~bom_len:0, enc)
142142 | None ->
143143 (* Step 3: Prescan for meta charset *)
144144- match Prescan.prescan_for_meta_charset data with
144144+ match Encoding_prescan.prescan_for_meta_charset data with
145145 | Some enc -> (decode_with_encoding data enc ~bom_len:0, enc)
146146 | None ->
147147 (* Default to Windows-1252 per HTML5 spec when no encoding detected *)
148148- (decode_with_encoding data Encoding.Windows_1252 ~bom_len:0, Encoding.Windows_1252)
148148+ (decode_with_encoding data Encoding_types.Windows_1252 ~bom_len:0, Encoding_types.Windows_1252)
···88 else
99 (* Security: never allow utf-7 *)
1010 if s = "utf-7" || s = "utf7" || s = "x-utf-7" then
1111- Some Encoding.Windows_1252
1111+ Some Encoding_types.Windows_1252
1212 else if s = "utf-8" || s = "utf8" then
1313- Some Encoding.Utf8
1313+ Some Encoding_types.Utf8
1414 (* HTML treats latin-1 labels as windows-1252 *)
1515 else if s = "iso-8859-1" || s = "iso8859-1" || s = "latin1" ||
1616 s = "latin-1" || s = "l1" || s = "cp819" || s = "ibm819" then
1717- Some Encoding.Windows_1252
1717+ Some Encoding_types.Windows_1252
1818 else if s = "windows-1252" || s = "windows1252" || s = "cp1252" || s = "x-cp1252" then
1919- Some Encoding.Windows_1252
1919+ Some Encoding_types.Windows_1252
2020 else if s = "iso-8859-2" || s = "iso8859-2" || s = "latin2" || s = "latin-2" then
2121- Some Encoding.Iso_8859_2
2121+ Some Encoding_types.Iso_8859_2
2222 else if s = "euc-jp" || s = "eucjp" then
2323- Some Encoding.Euc_jp
2323+ Some Encoding_types.Euc_jp
2424 else if s = "utf-16" || s = "utf16" then
2525- Some Encoding.Utf16le (* Default to LE for ambiguous utf-16 *)
2525+ Some Encoding_types.Utf16le (* Default to LE for ambiguous utf-16 *)
2626 else if s = "utf-16le" || s = "utf16le" then
2727- Some Encoding.Utf16le
2727+ Some Encoding_types.Utf16le
2828 else if s = "utf-16be" || s = "utf16be" then
2929- Some Encoding.Utf16be
2929+ Some Encoding_types.Utf16be
3030 else
3131 None
3232···3737 (* Per HTML meta charset handling: ignore UTF-16/UTF-32 declarations and
3838 treat them as UTF-8 *)
3939 match enc with
4040- | Encoding.Utf16le | Encoding.Utf16be -> Some Encoding.Utf8
4040+ | Encoding_types.Utf16le | Encoding_types.Utf16be -> Some Encoding_types.Utf8
4141 | other -> Some other
···229229 (* Check for charset *)
230230 (match !charset with
231231 | Some cs ->
232232- (match Labels.normalize_meta_declared cs with
232232+ (match Encoding_labels.normalize_meta_declared cs with
233233 | Some enc -> result := Some enc
234234 | None -> ())
235235 | None -> ());
···241241 | Some he, Some ct when String.lowercase_ascii he = "content-type" ->
242242 (match extract_charset_from_content ct with
243243 | Some extracted ->
244244- (match Labels.normalize_meta_declared extracted with
244244+ (match Encoding_labels.normalize_meta_declared extracted with
245245 | Some enc -> result := Some enc
246246 | None -> ())
247247 | None -> ())
···5454 let digit_text = String.sub text digit_start (!j - digit_start) in
55555656 if String.length digit_text > 0 then begin
5757- match Numeric_ref.decode digit_text ~is_hex with
5757+ match Entities_numeric_ref.decode digit_text ~is_hex with
5858 | Some decoded ->
5959 Buffer.add_string buf decoded;
6060 i := if has_semicolon then !j + 1 else !j
···8484 (* Try exact match first (with semicolon expected) *)
8585 let decoded =
8686 if has_semicolon then
8787- Entity_table.lookup entity_name
8787+ Entities_entity_table.lookup entity_name
8888 else
8989 None
9090 in
···101101 if k <= 0 then None
102102 else
103103 let prefix = String.sub entity_name 0 k in
104104- if Entity_table.is_legacy prefix then
105105- match Entity_table.lookup prefix with
104104+ if Entities_entity_table.is_legacy prefix then
105105+ match Entities_entity_table.lookup prefix with
106106 | Some value -> Some (value, k)
107107 | None -> try_prefix (k - 1)
108108 else
···118118 i := !j + 1
119119 end else if not has_semicolon then begin
120120 (* Try without semicolon for legacy compatibility *)
121121- if Entity_table.is_legacy entity_name then
122122- match Entity_table.lookup entity_name with
121121+ if Entities_entity_table.is_legacy entity_name then
122122+ match Entities_entity_table.lookup entity_name with
123123 | Some value ->
124124 (* Legacy entities without semicolon have strict rules in attributes *)
125125 let next_char = if !j < len then Some text.[!j] else None in
···145145 if k <= 0 then None
146146 else
147147 let prefix = String.sub entity_name 0 k in
148148- if Entity_table.is_legacy prefix then
149149- match Entity_table.lookup prefix with
148148+ if Entities_entity_table.is_legacy prefix then
149149+ match Entities_entity_table.lookup prefix with
150150 | Some value -> Some (value, k)
151151 | None -> try_prefix (k - 1)
152152 else
···7788open Bytesrw
991010-module Dom = Html5rw_dom
1111-module Tokenizer = Html5rw_tokenizer
1212-module Encoding = Html5rw_encoding
1010+module Dom = Dom
1111+module Tokenizer = Tokenizer
1212+module Encoding = Encoding
13131414-type parse_error = Tree_builder.parse_error
1414+type parse_error = Parser_tree_builder.parse_error
15151616-type fragment_context = Tree_builder.fragment_context
1616+type fragment_context = Parser_tree_builder.fragment_context
17171818type t = {
1919 root : Dom.node;
···23232424(* Token sink that feeds tokens to tree builder *)
2525module TreeBuilderSink = struct
2626- type t = Tree_builder.t
2626+ type t = Parser_tree_builder.t
27272828 let process tb token =
2929- Tree_builder.process_token tb token;
2929+ Parser_tree_builder.process_token tb token;
3030 (* Check if we need to switch tokenizer state based on current element *)
3131 (* Only switch for HTML namespace elements - SVG/MathML use different rules *)
3232- match Tree_builder.current_node tb with
3232+ match Parser_tree_builder.current_node tb with
3333 | Some node when node.Dom.namespace = None || node.Dom.namespace = Some "html" ->
3434 let name = node.Dom.name in
3535 if List.mem name ["textarea"; "title"] then
3636- `SwitchTo Tokenizer.State.Rcdata
3636+ `SwitchTo Tokenizer_state.Rcdata
3737 else if List.mem name ["style"; "xmp"; "iframe"; "noembed"; "noframes"] then
3838- `SwitchTo Tokenizer.State.Rawtext
3838+ `SwitchTo Tokenizer_state.Rawtext
3939 else if name = "script" then
4040- `SwitchTo Tokenizer.State.Script_data
4040+ `SwitchTo Tokenizer_state.Script_data
4141 else if name = "plaintext" then
4242- `SwitchTo Tokenizer.State.Plaintext
4242+ `SwitchTo Tokenizer_state.Plaintext
4343 else
4444 `Continue
4545 | _ -> `Continue
46464747 let adjusted_current_node_in_html_namespace tb =
4848- Tree_builder.adjusted_current_node_in_html_namespace tb
4848+ Parser_tree_builder.adjusted_current_node_in_html_namespace tb
4949end
50505151(* Core parsing function that takes a Bytes.Reader.t *)
5252let parse ?(collect_errors=false) ?fragment_context (reader : Bytes.Reader.t) =
5353- let tb = Tree_builder.create ~collect_errors ?fragment_context () in
5353+ let tb = Parser_tree_builder.create ~collect_errors ?fragment_context () in
5454 let tokenizer = Tokenizer.create (module TreeBuilderSink) tb ~collect_errors () in
55555656 (* Set tokenizer state for fragment parsing *)
···6262 | Some ctx when ctx.namespace = None || ctx.namespace = Some "html" ->
6363 let name = String.lowercase_ascii ctx.tag_name in
6464 if List.mem name ["title"; "textarea"] then
6565- Tokenizer.set_state tokenizer Tokenizer.State.Rcdata
6565+ Tokenizer.set_state tokenizer Tokenizer_state.Rcdata
6666 else if List.mem name ["style"; "xmp"; "iframe"; "noembed"; "noframes"] then
6767- Tokenizer.set_state tokenizer Tokenizer.State.Rawtext
6767+ Tokenizer.set_state tokenizer Tokenizer_state.Rawtext
6868 else if name = "script" then
6969- Tokenizer.set_state tokenizer Tokenizer.State.Script_data
6969+ Tokenizer.set_state tokenizer Tokenizer_state.Script_data
7070 else if name = "plaintext" then
7171- Tokenizer.set_state tokenizer Tokenizer.State.Plaintext
7171+ Tokenizer.set_state tokenizer Tokenizer_state.Plaintext
7272 | _ -> ());
73737474 Tokenizer.run tokenizer (module TreeBuilderSink) reader;
75757676- let root = Tree_builder.finish tb in
7676+ let root = Parser_tree_builder.finish tb in
7777 let tokenizer_errors = Tokenizer.get_errors tokenizer in
7878- let tree_errors = Tree_builder.get_errors tb in
7878+ let tree_errors = Parser_tree_builder.get_errors tb in
7979 let all_errors = List.map (fun e ->
8080- { Tree_builder.code = e.Tokenizer.Errors.code;
8080+ { Parser_tree_builder.code = e.Tokenizer.Errors.code;
8181 line = e.Tokenizer.Errors.line;
8282 column = e.Tokenizer.Errors.column }
8383 ) tokenizer_errors @ tree_errors in
···9292 { result with encoding = Some enc }
93939494let query t selector =
9595- Html5rw_selector.query t.root selector
9595+ Selector.query t.root selector
96969797(* Serialize to a Bytes.Writer.t *)
9898let to_writer ?(pretty=true) ?(indent_size=2) t (writer : Bytes.Writer.t) =
···131131 @raise Selector_error if the selector is malformed.
132132*)
133133134134-val query : Html5rw_dom.node -> string -> Html5rw_dom.node list
134134+val query : Dom.node -> string -> Dom.node list
135135(** Query the DOM tree with a CSS selector.
136136137137 Returns all nodes matching the selector in document order.
···143143 ]}
144144*)
145145146146-val matches : Html5rw_dom.node -> string -> bool
146146+val matches : Dom.node -> string -> bool
147147(** Check if a node matches a CSS selector.
148148149149 @raise Selector_error if the selector is malformed.
···15151616(** Token types produced by the tokenizer. *)
1717module Token : sig
1818- type tag_kind = Token.tag_kind = Start | End
1818+ type tag_kind = Tokenizer_token.tag_kind = Start | End
19192020- type doctype = Token.doctype = {
2020+ type doctype = Tokenizer_token.doctype = {
2121 name : string option;
2222 public_id : string option;
2323 system_id : string option;
2424 force_quirks : bool;
2525 }
26262727- type tag = Token.tag = {
2727+ type tag = Tokenizer_token.tag = {
2828 kind : tag_kind;
2929 name : string;
3030 attrs : (string * string) list;
3131 self_closing : bool;
3232 }
33333434- type t = Token.t =
3434+ type t = Tokenizer_token.t =
3535 | Tag of tag
3636 | Character of string
3737 | Comment of string
···54545555(** Tokenizer states. *)
5656module State : sig
5757- type t = State.t =
5757+ type t = Tokenizer_state.t =
5858 | Data
5959 | Rcdata
6060 | Rawtext
···139139140140(** Parse error types. *)
141141module Errors : sig
142142- type t = Errors.t = {
142142+ type t = Tokenizer_errors.t = {
143143 code : string;
144144 line : int;
145145 column : int;
···151151152152(** Input stream with position tracking. *)
153153module Stream : sig
154154- type t = Stream.t
154154+ type t = Tokenizer_stream.t
155155156156 val create : string -> t
157157 val create_from_reader : Bytesrw.Bytes.Reader.t -> t
···170170*)
171171module type SINK = sig
172172 type t
173173- val process : t -> Token.t -> [ `Continue | `SwitchTo of State.t ]
173173+ val process : t -> Tokenizer_token.t -> [ `Continue | `SwitchTo of Tokenizer_state.t ]
174174 val adjusted_current_node_in_html_namespace : t -> bool
175175end
176176···204204 function for each token until EOF is reached.
205205*)
206206207207-val get_errors : 'sink t -> Errors.t list
207207+val get_errors : 'sink t -> Tokenizer_errors.t list
208208(** Get the list of parse errors encountered during tokenization.
209209210210 Only populated if [collect_errors:true] was passed to {!create}.
211211*)
212212213213-val set_state : 'sink t -> State.t -> unit
213213+val set_state : 'sink t -> Tokenizer_state.t -> unit
214214(** Set the tokenizer state.
215215216216 Used by the tree builder to switch states for raw text elements.