···186186 match Prescan.prescan_for_meta_charset data with
187187 | Some enc -> (decode_with_encoding data enc ~bom_len:0, enc)
188188 | None ->
189189- (* Default to UTF-8 *)
190190- (decode_with_encoding data Encoding.Utf8 ~bom_len:0, Encoding.Utf8)
189189+ (* Default to Windows-1252 per HTML5 spec when no encoding detected *)
190190+ (decode_with_encoding data Encoding.Windows_1252 ~bom_len:0, Encoding.Windows_1252)
+8-5
lib/encoding/prescan.ml
···9797 if !j + 2 < len then
9898 i := !j + 3
9999 else
100100- result := None (* Unclosed comment, stop scanning *)
100100+ i := len (* Unclosed comment - stop scanning *)
101101 end
102102 (* Check for end tag - skip it *)
103103 else if !i + 1 < len && Bytes.get data (!i + 1) = '/' then begin
104104 let j = ref (!i + 2) in
105105 let in_quote = ref None in
106106- while !j < len && !j < max_total && !non_comment < max_non_comment do
106106+ let done_tag = ref false in
107107+ while not !done_tag && !j < len && !j < max_total && !non_comment < max_non_comment do
107108 let c = Bytes.get data !j in
108109 match !in_quote with
109110 | None ->
···114115 end else if c = '>' then begin
115116 incr j;
116117 incr non_comment;
117117- j := len (* Exit loop *)
118118+ done_tag := true
118119 end else begin
119120 incr j;
120121 incr non_comment
···138139 if tag_name <> "meta" then begin
139140 (* Skip non-meta tag *)
140141 let in_quote = ref None in
141141- while !j < len && !j < max_total && !non_comment < max_non_comment do
142142+ let done_tag = ref false in
143143+ while not !done_tag && !j < len && !j < max_total && !non_comment < max_non_comment do
142144 let c = Bytes.get data !j in
143145 match !in_quote with
144146 | None ->
···149151 end else if c = '>' then begin
150152 incr j;
151153 incr non_comment;
152152- j := len
154154+ done_tag := true
153155 end else begin
154156 incr j;
155157 incr non_comment
···240242 | None -> ());
241243242244 (* Check for http-equiv="content-type" with content *)
245245+ (* Note: http-equiv value must be exactly "content-type" (case-insensitive) *)
243246 if !result = None then
244247 (match !http_equiv, !content with
245248 | Some he, Some ct when String.lowercase_ascii he = "content-type" ->
+108
lib/tokenizer/stream.ml
···2222 mutable last_was_cr : bool;
2323 (* Track if we need to skip the next LF from raw stream (set after peek of CR) *)
2424 mutable skip_next_lf : bool;
2525+ (* Error callback for surrogate/noncharacter detection *)
2626+ mutable error_callback : (string -> unit) option;
2527}
26282729(* Create a stream from a Bytes.Reader.t *)
···3638 column = 0;
3739 last_was_cr = false;
3840 skip_next_lf = false;
4141+ error_callback = None;
3942 }
40434444+let set_error_callback t cb =
4545+ t.error_callback <- Some cb
4646+4747+(* Check if a Unicode codepoint is a surrogate *)
4848+let is_surrogate cp = cp >= 0xD800 && cp <= 0xDFFF
4949+5050+(* Check if a Unicode codepoint is a noncharacter *)
5151+let is_noncharacter cp =
5252+ (* U+FDD0 to U+FDEF *)
5353+ (cp >= 0xFDD0 && cp <= 0xFDEF) ||
5454+ (* U+FFFE and U+FFFF in each plane (0-16) *)
5555+ ((cp land 0xFFFF) = 0xFFFE || (cp land 0xFFFF) = 0xFFFF)
5656+4157(* Create a stream from a string - discouraged, prefer create_from_reader *)
4258let create input =
4359 create_from_reader (Bytes.Reader.of_string input)
···7894let push_back_char t c =
7995 t.lookahead <- c :: t.lookahead
80969797+(* Check for surrogates and noncharacters in UTF-8 sequences.
9898+ Called after reading a lead byte, peeks continuation bytes to decode codepoint. *)
9999+let check_utf8_codepoint t lead_byte =
100100+ let b0 = Char.code lead_byte in
101101+ if b0 < 0x80 then
102102+ (* ASCII - no surrogates or noncharacters possible in this range except control chars *)
103103+ ()
104104+ else if b0 >= 0xC2 && b0 <= 0xDF then begin
105105+ (* 2-byte sequence: 110xxxxx 10xxxxxx -> U+0080 to U+07FF *)
106106+ (* Check for C1 control characters U+0080-U+009F *)
107107+ match read_raw_char t with
108108+ | Some c1 when (Char.code c1 land 0xC0) = 0x80 ->
109109+ let b1 = Char.code c1 in
110110+ let cp = ((b0 land 0x1F) lsl 6) lor (b1 land 0x3F) in
111111+ push_back_char t c1;
112112+ (* C1 controls: U+0080 to U+009F *)
113113+ if cp >= 0x80 && cp <= 0x9F then
114114+ (match t.error_callback with
115115+ | Some cb -> cb "control-character-in-input-stream"
116116+ | None -> ())
117117+ | Some c1 ->
118118+ push_back_char t c1
119119+ | None -> ()
120120+ end else if b0 >= 0xE0 && b0 <= 0xEF then begin
121121+ (* 3-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx -> U+0800 to U+FFFF *)
122122+ (* Need to peek 2 continuation bytes *)
123123+ match read_raw_char t with
124124+ | Some c1 when (Char.code c1 land 0xC0) = 0x80 ->
125125+ let b1 = Char.code c1 in
126126+ (match read_raw_char t with
127127+ | Some c2 when (Char.code c2 land 0xC0) = 0x80 ->
128128+ let b2 = Char.code c2 in
129129+ let cp = ((b0 land 0x0F) lsl 12) lor ((b1 land 0x3F) lsl 6) lor (b2 land 0x3F) in
130130+ push_back_char t c2;
131131+ push_back_char t c1;
132132+ (* Check for surrogates and noncharacters *)
133133+ (match t.error_callback with
134134+ | Some cb ->
135135+ if is_surrogate cp then cb "surrogate-in-input-stream"
136136+ else if is_noncharacter cp then cb "noncharacter-in-input-stream"
137137+ | None -> ())
138138+ | Some c2 ->
139139+ push_back_char t c2;
140140+ push_back_char t c1
141141+ | None ->
142142+ push_back_char t c1)
143143+ | Some c1 ->
144144+ push_back_char t c1
145145+ | None -> ()
146146+ end else if b0 >= 0xF0 && b0 <= 0xF4 then begin
147147+ (* 4-byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx -> U+10000 to U+10FFFF *)
148148+ match read_raw_char t with
149149+ | Some c1 when (Char.code c1 land 0xC0) = 0x80 ->
150150+ let b1 = Char.code c1 in
151151+ (match read_raw_char t with
152152+ | Some c2 when (Char.code c2 land 0xC0) = 0x80 ->
153153+ let b2 = Char.code c2 in
154154+ (match read_raw_char t with
155155+ | Some c3 when (Char.code c3 land 0xC0) = 0x80 ->
156156+ let b3 = Char.code c3 in
157157+ let cp = ((b0 land 0x07) lsl 18) lor ((b1 land 0x3F) lsl 12) lor
158158+ ((b2 land 0x3F) lsl 6) lor (b3 land 0x3F) in
159159+ push_back_char t c3;
160160+ push_back_char t c2;
161161+ push_back_char t c1;
162162+ (* Check for noncharacters (no surrogates in 4-byte range) *)
163163+ (match t.error_callback with
164164+ | Some cb ->
165165+ if is_noncharacter cp then cb "noncharacter-in-input-stream"
166166+ | None -> ())
167167+ | Some c3 ->
168168+ push_back_char t c3;
169169+ push_back_char t c2;
170170+ push_back_char t c1
171171+ | None ->
172172+ push_back_char t c2;
173173+ push_back_char t c1)
174174+ | Some c2 ->
175175+ push_back_char t c2;
176176+ push_back_char t c1
177177+ | None ->
178178+ push_back_char t c1)
179179+ | Some c1 ->
180180+ push_back_char t c1
181181+ | None -> ()
182182+ end
183183+81184(* Read next char with CR/LF normalization *)
82185let rec read_normalized_char t =
186186+ (* Track if we're reading from lookahead - if so, we've already checked this byte *)
187187+ let from_lookahead = t.lookahead <> [] in
83188 match read_raw_char t with
84189 | None ->
85190 t.last_was_cr <- false;
···98203 read_normalized_char t
99204 | Some c ->
100205 t.last_was_cr <- false;
206206+ (* Only check for surrogates/noncharacters when reading fresh from stream,
207207+ not when re-reading from lookahead (to avoid duplicate errors) *)
208208+ if not from_lookahead then check_utf8_codepoint t c;
101209 Some c
102210103211let is_eof t =
+105-9
lib/tokenizer/tokenizer.ml
···3939 mutable pending_chars : Buffer.t;
4040 mutable errors : Errors.t list;
4141 collect_errors : bool;
4242+ xml_mode : bool; (* XML violation mode: transform chars for XML compatibility *)
4243}
43444444-let create (type s) (module S : SINK with type t = s) sink ?(collect_errors=false) () = {
4545+let create (type s) (module S : SINK with type t = s) sink ?(collect_errors=false) ?(xml_mode=false) () = {
4546 stream = Stream.create "";
4647 sink;
4748 state = State.Data;
···6364 pending_chars = Buffer.create 256;
6465 errors = [];
6566 collect_errors;
6767+ xml_mode;
6668}
67696870let error t code =
···73757476(* emit functions are defined locally inside run *)
75777878+(* XML mode character transformation: form feed → space *)
7679let emit_char t c =
7777- Buffer.add_char t.pending_chars c
8080+ if t.xml_mode && c = '\x0C' then
8181+ Buffer.add_char t.pending_chars ' '
8282+ else
8383+ Buffer.add_char t.pending_chars c
78848585+(* XML mode string transformation: U+FFFF → U+FFFD, form feed → space *)
7986let emit_str t s =
8080- Buffer.add_string t.pending_chars s
8787+ if t.xml_mode then begin
8888+ (* Transform: \xEF\xBF\xBF (U+FFFF) → \xEF\xBF\xBD (U+FFFD), \x0C → space *)
8989+ let len = String.length s in
9090+ let i = ref 0 in
9191+ while !i < len do
9292+ let c = s.[!i] in
9393+ if c = '\x0C' then begin
9494+ Buffer.add_char t.pending_chars ' ';
9595+ incr i
9696+ end else if c = '\xEF' && !i + 2 < len && s.[!i+1] = '\xBF' && s.[!i+2] = '\xBF' then begin
9797+ (* U+FFFF → U+FFFD *)
9898+ Buffer.add_string t.pending_chars "\xEF\xBF\xBD";
9999+ i := !i + 3
100100+ end else begin
101101+ Buffer.add_char t.pending_chars c;
102102+ incr i
103103+ end
104104+ done
105105+ end else
106106+ Buffer.add_string t.pending_chars s
8110782108let start_new_tag t kind =
83109 Buffer.clear t.current_tag_name;
···130156let run (type s) t (module S : SINK with type t = s) (reader : Bytes.Reader.t) =
131157 t.stream <- Stream.create_from_reader reader;
132158 t.errors <- [];
159159+ (* Set up error callback for surrogate/noncharacter detection in stream *)
160160+ (* In XML mode, we don't report noncharacter errors - we transform them instead *)
161161+ if not t.xml_mode then
162162+ Stream.set_error_callback t.stream (fun code -> error t code);
163163+164164+ (* XML mode transformation for pending chars: U+FFFF → U+FFFD *)
165165+ let transform_xml_chars data =
166166+ let len = String.length data in
167167+ let buf = Buffer.create len in
168168+ let i = ref 0 in
169169+ while !i < len do
170170+ let c = data.[!i] in
171171+ if c = '\xEF' && !i + 2 < len && data.[!i+1] = '\xBF' && data.[!i+2] = '\xBF' then begin
172172+ (* U+FFFF → U+FFFD *)
173173+ Buffer.add_string buf "\xEF\xBF\xBD";
174174+ i := !i + 3
175175+ end else begin
176176+ Buffer.add_char buf c;
177177+ incr i
178178+ end
179179+ done;
180180+ Buffer.contents buf
181181+ in
133182134183 (* Local emit functions with access to S *)
135184 let emit_pending_chars () =
136185 if Buffer.length t.pending_chars > 0 then begin
137186 let data = Buffer.contents t.pending_chars in
138187 Buffer.clear t.pending_chars;
188188+ let data = if t.xml_mode then transform_xml_chars data else data in
139189 ignore (S.process t.sink (Token.Character data))
140190 end
141191 in
···180230 in
181231182232 let emit_current_comment () =
183183- emit (Token.Comment (Buffer.contents t.current_comment))
233233+ let content = Buffer.contents t.current_comment in
234234+ let content =
235235+ if t.xml_mode then begin
236236+ (* XML mode: transform -- to - - in comments *)
237237+ let buf = Buffer.create (String.length content + 10) in
238238+ let len = String.length content in
239239+ let i = ref 0 in
240240+ while !i < len do
241241+ if !i + 1 < len && content.[!i] = '-' && content.[!i+1] = '-' then begin
242242+ Buffer.add_string buf "- -";
243243+ i := !i + 2
244244+ end else begin
245245+ Buffer.add_char buf content.[!i];
246246+ incr i
247247+ end
248248+ done;
249249+ Buffer.contents buf
250250+ end else content
251251+ in
252252+ emit (Token.Comment content)
184253 in
185254186255 (* Check for control characters and emit error if needed *)
256256+ (* Only checks ASCII control chars; C1 controls (U+0080-U+009F) are 2-byte in UTF-8 *)
187257 let check_control_char c =
188258 let code = Char.code c in
189189- (* Control chars: U+0001-U+0008, U+000B, U+000E-U+001F, U+007F-U+009F *)
259259+ (* Control chars: U+0001-U+0008, U+000B, U+000E-U+001F, U+007F *)
190260 (* Allowed: U+0009 (tab), U+000A (LF), U+000C (FF), U+000D (CR) *)
261261+ (* Note: U+0080-U+009F (C1 controls) are 2-byte UTF-8 sequences starting with 0xC2 *)
262262+ (* Note: We only check single-byte control chars here; multi-byte checks are TODO *)
191263 if (code >= 0x01 && code <= 0x08) ||
192264 code = 0x0B ||
193265 (code >= 0x0E && code <= 0x1F) ||
194194- (code >= 0x7F && code <= 0x9F) then
266266+ code = 0x7F then
195267 error t "control-character-in-input-stream"
196268 in
269269+197270198271 (* Emit char with control character check *)
199272 let emit_char_checked c =
···294367 | State.Script_data_escaped
295368 | State.Script_data_escaped_dash
296369 | State.Script_data_escaped_dash_dash ->
370370+ error t "eof-in-script-html-comment-like-text";
297371 emit_pending_chars ();
298372 ignore (S.process t.sink Token.EOF)
299373 | State.Script_data_escaped_less_than_sign ->
···313387 | State.Script_data_double_escaped
314388 | State.Script_data_double_escaped_dash
315389 | State.Script_data_double_escaped_dash_dash ->
390390+ error t "eof-in-script-html-comment-like-text";
316391 emit_pending_chars ();
317392 ignore (S.process t.sink Token.EOF)
318393 | State.Script_data_double_escaped_less_than_sign ->
···647722 error t "unexpected-null-character";
648723 Buffer.add_string t.current_tag_name "\xEF\xBF\xBD"
649724 | Some c ->
725725+ check_control_char c;
650726 Buffer.add_char t.current_tag_name (ascii_lower c)
651727 | None -> ()
652728···10151091 Buffer.add_char t.current_attr_name (Option.get c_opt)
10161092 | Some c ->
10171093 Stream.advance t.stream;
10941094+ check_control_char c;
10181095 Buffer.add_char t.current_attr_name (ascii_lower c)
1019109610201097 and state_after_attribute_name () =
···10651142 error t "unexpected-null-character";
10661143 Buffer.add_string t.current_attr_value "\xEF\xBF\xBD"
10671144 | Some c ->
11451145+ check_control_char c;
10681146 Buffer.add_char t.current_attr_value c
10691147 | None -> ()
10701148···10791157 error t "unexpected-null-character";
10801158 Buffer.add_string t.current_attr_value "\xEF\xBF\xBD"
10811159 | Some c ->
11601160+ check_control_char c;
10821161 Buffer.add_char t.current_attr_value c
10831162 | None -> ()
10841163···11051184 Buffer.add_char t.current_attr_value (Option.get c_opt)
11061185 | Some c ->
11071186 Stream.advance t.stream;
11871187+ check_control_char c;
11081188 Buffer.add_char t.current_attr_value c
11091189 | None -> ()
11101190···11461226 error t "unexpected-null-character";
11471227 Buffer.add_string t.current_comment "\xEF\xBF\xBD"
11481228 | Some c ->
12291229+ check_control_char c;
11491230 Buffer.add_char t.current_comment c
11501231 | None -> ()
11511232···12121293 error t "unexpected-null-character";
12131294 Buffer.add_string t.current_comment "\xEF\xBF\xBD"
12141295 | Some c ->
12961296+ check_control_char c;
12151297 Buffer.add_char t.current_comment c
12161298 | None -> ()
12171299···13271409 | None -> ()
13281410 | Some c ->
13291411 Stream.advance t.stream;
14121412+ check_control_char c;
13301413 start_new_doctype t;
13311414 t.current_doctype_name <- Some (Buffer.create 8);
13321415 Buffer.add_char (Option.get t.current_doctype_name) (ascii_lower c);
···13431426 error t "unexpected-null-character";
13441427 Buffer.add_string (Option.get t.current_doctype_name) "\xEF\xBF\xBD"
13451428 | Some c ->
14291429+ check_control_char c;
13461430 Buffer.add_char (Option.get t.current_doctype_name) (ascii_lower c)
13471431 | None -> ()
13481432···13561440 emit_current_doctype ()
13571441 | None -> ()
13581442 | Some _ ->
14431443+ (* Don't check control char here - bogus_doctype will check when it consumes *)
13591444 if Stream.matches_ci t.stream "PUBLIC" then begin
13601445 ignore (Stream.consume_exact_ci t.stream "PUBLIC");
13611446 t.state <- State.After_doctype_public_keyword
···13911476 emit_current_doctype ()
13921477 | None -> ()
13931478 | Some _ ->
14791479+ (* Don't check control char here - bogus_doctype will check when it consumes *)
13941480 error t "missing-quote-before-doctype-public-identifier";
13951481 t.current_doctype_force_quirks <- true;
13961482 t.state <- State.Bogus_doctype
···14321518 t.state <- State.Data;
14331519 emit_current_doctype ()
14341520 | Some c ->
15211521+ check_control_char c;
14351522 Buffer.add_char (Option.get t.current_doctype_public) c
14361523 | None -> ()
14371524···14481535 t.state <- State.Data;
14491536 emit_current_doctype ()
14501537 | Some c ->
15381538+ check_control_char c;
14511539 Buffer.add_char (Option.get t.current_doctype_public) c
14521540 | None -> ()
14531541···14721560 t.state <- State.Doctype_system_identifier_single_quoted
14731561 | None -> ()
14741562 | Some _ ->
15631563+ (* Don't check control char here - bogus_doctype will check when it consumes *)
14751564 error t "missing-quote-before-doctype-system-identifier";
14761565 t.current_doctype_force_quirks <- true;
14771566 t.state <- State.Bogus_doctype
···14941583 t.state <- State.Doctype_system_identifier_single_quoted
14951584 | None -> ()
14961585 | Some _ ->
15861586+ (* Don't check control char here - bogus_doctype will check when it consumes *)
14971587 error t "missing-quote-before-doctype-system-identifier";
14981588 t.current_doctype_force_quirks <- true;
14991589 t.state <- State.Bogus_doctype
···15211611 emit_current_doctype ()
15221612 | None -> ()
15231613 | Some _ ->
16141614+ (* Don't check control char here - bogus_doctype will check when it consumes *)
15241615 error t "missing-quote-before-doctype-system-identifier";
15251616 t.current_doctype_force_quirks <- true;
15261617 t.state <- State.Bogus_doctype
···15451636 emit_current_doctype ()
15461637 | None -> ()
15471638 | Some _ ->
16391639+ (* Don't check control char here - bogus_doctype will check when it consumes *)
15481640 error t "missing-quote-before-doctype-system-identifier";
15491641 t.current_doctype_force_quirks <- true;
15501642 t.state <- State.Bogus_doctype
···15621654 t.state <- State.Data;
15631655 emit_current_doctype ()
15641656 | Some c ->
16571657+ check_control_char c;
15651658 Buffer.add_char (Option.get t.current_doctype_system) c
15661659 | None -> ()
15671660···15781671 t.state <- State.Data;
15791672 emit_current_doctype ()
15801673 | Some c ->
16741674+ check_control_char c;
15811675 Buffer.add_char (Option.get t.current_doctype_system) c
15821676 | None -> ()
15831677···15911685 emit_current_doctype ()
15921686 | None -> ()
15931687 | Some _ ->
16881688+ (* Don't check control char here - bogus_doctype will check when it consumes *)
15941689 error t "unexpected-character-after-doctype-system-identifier";
15951690 t.state <- State.Bogus_doctype
15961691···16011696 emit_current_doctype ()
16021697 | Some '\x00' ->
16031698 error t "unexpected-null-character"
16041604- | Some _ -> ()
16991699+ | Some c ->
17001700+ check_control_char c (* Check all chars in bogus doctype *)
16051701 | None -> ()
1606170216071703 and state_cdata_section () =
···16091705 | Some ']' ->
16101706 t.state <- State.Cdata_section_bracket
16111707 | Some c ->
16121612- (* CDATA section emits all characters as-is, including NUL *)
16131613- emit_char t c
17081708+ (* CDATA section emits all characters as-is, including NUL, but still check for control chars *)
17091709+ emit_char_checked c
16141710 | None -> ()
1615171116161712 and state_cdata_section_bracket () =
+577-66
test/test_serializer.ml
···1212 | Jsont.String (s, _) -> Some s
1313 | _ -> failwith "Expected string or null"
14141515+let json_bool = function
1616+ | Jsont.Bool (b, _) -> b
1717+ | _ -> failwith "Expected bool"
1818+1519let json_array = function
1620 | Jsont.Array (arr, _) -> arr
1721 | _ -> failwith "Expected array"
···3034 | Some v -> v
3135 | None -> failwith ("Missing member: " ^ name)
32363737+(* Serialization options *)
3838+type serialize_options = {
3939+ quote_char : char;
4040+ quote_char_explicit : bool; (* Was quote_char explicitly set? *)
4141+ minimize_boolean_attributes : bool;
4242+ use_trailing_solidus : bool;
4343+ escape_lt_in_attrs : bool;
4444+ escape_rcdata : bool;
4545+ strip_whitespace : bool;
4646+ inject_meta_charset : bool;
4747+ encoding : string option;
4848+ omit_optional_tags : bool;
4949+}
5050+5151+let default_options = {
5252+ quote_char = '"';
5353+ quote_char_explicit = false;
5454+ minimize_boolean_attributes = true;
5555+ use_trailing_solidus = false;
5656+ escape_lt_in_attrs = false;
5757+ escape_rcdata = false;
5858+ strip_whitespace = false;
5959+ inject_meta_charset = false;
6060+ encoding = None;
6161+ omit_optional_tags = true; (* HTML5 default *)
6262+}
6363+6464+(* Parse options from JSON *)
6565+let parse_options json_opt =
6666+ match json_opt with
6767+ | None -> default_options
6868+ | Some json ->
6969+ let obj = json_object json in
7070+ let get_bool name default =
7171+ match json_mem name obj with
7272+ | Some j -> (try json_bool j with _ -> default)
7373+ | None -> default
7474+ in
7575+ let get_string name =
7676+ match json_mem name obj with
7777+ | Some (Jsont.String (s, _)) -> Some s
7878+ | _ -> None
7979+ in
8080+ let quote_char_opt =
8181+ match json_mem "quote_char" obj with
8282+ | Some (Jsont.String (s, _)) when String.length s = 1 -> Some s.[0]
8383+ | _ -> None
8484+ in
8585+ {
8686+ quote_char = Option.value ~default:'"' quote_char_opt;
8787+ quote_char_explicit = Option.is_some quote_char_opt;
8888+ minimize_boolean_attributes = get_bool "minimize_boolean_attributes" (get_bool "quote_attr_values" true);
8989+ use_trailing_solidus = get_bool "use_trailing_solidus" false;
9090+ escape_lt_in_attrs = get_bool "escape_lt_in_attrs" false;
9191+ escape_rcdata = get_bool "escape_rcdata" false;
9292+ strip_whitespace = get_bool "strip_whitespace" false;
9393+ inject_meta_charset = get_bool "inject_meta_charset" false;
9494+ encoding = get_string "encoding";
9595+ omit_optional_tags = get_bool "omit_optional_tags" true;
9696+ }
9797+3398(* Test case *)
3499type test_case = {
35100 description : string;
36101 input : Jsont.json list;
37102 expected : string list;
103103+ options : serialize_options;
38104}
3910540106let parse_test_case json =
···42108 let description = json_string (json_mem_exn "description" obj) in
43109 let input = json_array (json_mem_exn "input" obj) in
44110 let expected = List.map json_string (json_array (json_mem_exn "expected" obj)) in
4545- { description; input; expected }
111111+ let options = parse_options (json_mem "options" obj) in
112112+ { description; input; expected; options }
113113+114114+(* Parse attrs that can be either array [{name, value}] or object {name: value} or empty {} *)
115115+let parse_attrs attrs_json =
116116+ match attrs_json with
117117+ | Jsont.Array (arr, _) ->
118118+ List.map (fun attr_json ->
119119+ let attr_obj = json_object attr_json in
120120+ let attr_name = json_string (json_mem_exn "name" attr_obj) in
121121+ let value = json_string (json_mem_exn "value" attr_obj) in
122122+ (attr_name, value)
123123+ ) arr
124124+ | Jsont.Object (obj, _) ->
125125+ List.map (fun ((n, _), v) -> (n, json_string v)) obj
126126+ | _ -> []
461274747-(* Build a DOM node from test input token *)
4848-let build_node_from_token token =
128128+(* Void elements that don't need end tags *)
129129+let is_void_element name =
130130+ List.mem (String.lowercase_ascii name)
131131+ ["area"; "base"; "br"; "col"; "embed"; "hr"; "img"; "input";
132132+ "link"; "meta"; "param"; "source"; "track"; "wbr"]
133133+134134+(* Raw text elements whose content should not be escaped *)
135135+let is_raw_text_element name =
136136+ List.mem (String.lowercase_ascii name) ["script"; "style"]
137137+138138+(* Elements where whitespace should be preserved *)
139139+let is_whitespace_preserving_element name =
140140+ List.mem (String.lowercase_ascii name) ["pre"; "textarea"; "script"; "style"]
141141+142142+(* Block elements that close a p tag *)
143143+let p_closing_elements = [
144144+ "address"; "article"; "aside"; "blockquote"; "datagrid"; "dialog"; "dir";
145145+ "div"; "dl"; "fieldset"; "footer"; "form"; "h1"; "h2"; "h3"; "h4"; "h5"; "h6";
146146+ "header"; "hgroup"; "hr"; "main"; "menu"; "nav"; "ol"; "p"; "pre"; "section";
147147+ "table"; "ul"
148148+]
149149+150150+let is_p_closing_element name =
151151+ List.mem (String.lowercase_ascii name) p_closing_elements
152152+153153+(* Collapse runs of whitespace to single space *)
154154+let collapse_whitespace text =
155155+ let len = String.length text in
156156+ let buf = Buffer.create len in
157157+ let in_whitespace = ref false in
158158+ for i = 0 to len - 1 do
159159+ let c = text.[i] in
160160+ if c = '\t' || c = '\r' || c = '\n' || c = '\x0C' || c = ' ' then begin
161161+ if not !in_whitespace then begin
162162+ Buffer.add_char buf ' ';
163163+ in_whitespace := true
164164+ end
165165+ end else begin
166166+ Buffer.add_char buf c;
167167+ in_whitespace := false
168168+ end
169169+ done;
170170+ Buffer.contents buf
171171+172172+(* Token types for stream-based serialization *)
173173+type token_type =
174174+ | StartTag of string * (string * string) list (* name, attrs *)
175175+ | EndTag of string (* name *)
176176+ | EmptyTag of string * (string * string) list (* name, attrs *)
177177+ | TextNode of string
178178+ | CommentNode of string
179179+ | DoctypeNode of Dom.node
180180+181181+type token_info = {
182182+ token : token_type option;
183183+ node : Dom.node option; (* Legacy for compatibility *)
184184+ tag_name : string option;
185185+ is_empty_tag : bool;
186186+}
187187+188188+let build_token_info token =
49189 let arr = json_array token in
50190 match arr with
5151- | [] -> None
191191+ | [] -> { token = None; node = None; tag_name = None; is_empty_tag = false }
52192 | type_json :: rest ->
5353- let token_type = json_string type_json in
5454- match token_type, rest with
193193+ let token_type_str = json_string type_json in
194194+ match token_type_str, rest with
55195 | "StartTag", [_ns_json; name_json; attrs_json] ->
56196 let name = json_string name_json in
5757- let attrs_list = json_array attrs_json in
5858- let attrs = List.map (fun attr_json ->
5959- let attr_obj = json_object attr_json in
6060- let attr_name = json_string (json_mem_exn "name" attr_obj) in
6161- let value = json_string (json_mem_exn "value" attr_obj) in
6262- (attr_name, value)
6363- ) attrs_list in
6464- Some (Dom.create_element name ~attrs ())
197197+ let attrs = parse_attrs attrs_json in
198198+ { token = Some (StartTag (name, attrs));
199199+ node = Some (Dom.create_element name ~attrs ());
200200+ tag_name = Some name;
201201+ is_empty_tag = false }
6520266203 | "StartTag", [name_json; attrs_json] ->
67204 let name = json_string name_json in
6868- let attrs_obj = json_object attrs_json in
6969- let attrs = List.map (fun ((n, _), v) -> (n, json_string v)) attrs_obj in
7070- Some (Dom.create_element name ~attrs ())
205205+ let attrs = parse_attrs attrs_json in
206206+ { token = Some (StartTag (name, attrs));
207207+ node = Some (Dom.create_element name ~attrs ());
208208+ tag_name = Some name;
209209+ is_empty_tag = false }
7121072211 | "EmptyTag", [name_json; attrs_json] ->
73212 let name = json_string name_json in
7474- let attrs_obj = json_object attrs_json in
7575- let attrs = List.map (fun ((n, _), v) -> (n, json_string v)) attrs_obj in
7676- Some (Dom.create_element name ~attrs ())
213213+ let attrs = parse_attrs attrs_json in
214214+ { token = Some (EmptyTag (name, attrs));
215215+ node = Some (Dom.create_element name ~attrs ());
216216+ tag_name = Some name;
217217+ is_empty_tag = true }
218218+219219+ | "EndTag", [_ns_json; name_json] ->
220220+ let name = json_string name_json in
221221+ { token = Some (EndTag name);
222222+ node = None;
223223+ tag_name = Some name;
224224+ is_empty_tag = false }
225225+226226+ | "EndTag", [name_json] ->
227227+ let name = json_string name_json in
228228+ { token = Some (EndTag name);
229229+ node = None;
230230+ tag_name = Some name;
231231+ is_empty_tag = false }
7723278233 | "Characters", [text_json] ->
79234 let text = json_string text_json in
8080- Some (Dom.create_text text)
235235+ { token = Some (TextNode text);
236236+ node = Some (Dom.create_text text);
237237+ tag_name = None;
238238+ is_empty_tag = false }
8123982240 | "Comment", [text_json] ->
83241 let text = json_string text_json in
8484- Some (Dom.create_comment text)
242242+ { token = Some (CommentNode text);
243243+ node = Some (Dom.create_comment text);
244244+ tag_name = None;
245245+ is_empty_tag = false }
8524686247 | "Doctype", [name_json] ->
87248 let name = json_string name_json in
8888- Some (Dom.create_doctype ~name ())
249249+ let node = Dom.create_doctype ~name () in
250250+ { token = Some (DoctypeNode node);
251251+ node = Some node;
252252+ tag_name = None;
253253+ is_empty_tag = false }
8925490255 | "Doctype", [name_json; public_json] ->
91256 let name = json_string name_json in
92257 let public_id = json_string_opt public_json in
9393- (match public_id with
9494- | Some pub -> Some (Dom.create_doctype ~name ~public_id:pub ())
9595- | None -> Some (Dom.create_doctype ~name ()))
258258+ let node = match public_id with
259259+ | Some pub -> Dom.create_doctype ~name ~public_id:pub ()
260260+ | None -> Dom.create_doctype ~name ()
261261+ in
262262+ { token = Some (DoctypeNode node);
263263+ node = Some node;
264264+ tag_name = None;
265265+ is_empty_tag = false }
9626697267 | "Doctype", [name_json; public_json; system_json] ->
98268 let name = json_string name_json in
99269 let public_id = json_string_opt public_json in
100270 let system_id = json_string_opt system_json in
101101- (match public_id, system_id with
102102- | Some pub, Some sys -> Some (Dom.create_doctype ~name ~public_id:pub ~system_id:sys ())
103103- | Some pub, None -> Some (Dom.create_doctype ~name ~public_id:pub ())
104104- | None, Some sys -> Some (Dom.create_doctype ~name ~system_id:sys ())
105105- | None, None -> Some (Dom.create_doctype ~name ()))
271271+ let node = match public_id, system_id with
272272+ | Some pub, Some sys -> Dom.create_doctype ~name ~public_id:pub ~system_id:sys ()
273273+ | Some pub, None -> Dom.create_doctype ~name ~public_id:pub ()
274274+ | None, Some sys -> Dom.create_doctype ~name ~system_id:sys ()
275275+ | None, None -> Dom.create_doctype ~name ()
276276+ in
277277+ { token = Some (DoctypeNode node);
278278+ node = Some node;
279279+ tag_name = None;
280280+ is_empty_tag = false }
106281107107- | _ -> None
282282+ | _ -> { token = None; node = None; tag_name = None; is_empty_tag = false }
108283109109-(* Serialize a single node to HTML (simplified, matches test expectations) *)
284284+(* Serialize a single node to HTML with options *)
110285let escape_text text =
111286 let buf = Buffer.create (String.length text) in
112287 String.iter (fun c ->
···129304 ) value;
130305 !valid
131306132132-let choose_quote value =
133133- if String.contains value '"' && not (String.contains value '\'') then '\''
134134- else '"'
135135-136136-let escape_attr_value value quote_char =
307307+let escape_attr_value value quote_char escape_lt =
137308 let buf = Buffer.create (String.length value) in
138309 String.iter (fun c ->
139310 match c with
140311 | '&' -> Buffer.add_string buf "&"
141312 | '"' when quote_char = '"' -> Buffer.add_string buf """
313313+ | '\'' when quote_char = '\'' -> Buffer.add_string buf "'"
314314+ | '<' when escape_lt -> Buffer.add_string buf "<"
142315 | c -> Buffer.add_char buf c
143316 ) value;
144317 Buffer.contents buf
145318146146-let serialize_node node =
319319+let serialize_node opts ~in_raw_text node =
147320 match node.Dom.name with
148321 | "#text" ->
149149- (* Check if parent is a raw text element *)
150150- escape_text node.Dom.data
322322+ if in_raw_text && not opts.escape_rcdata then
323323+ node.Dom.data
324324+ else
325325+ escape_text node.Dom.data
151326 | "#comment" ->
152327 "<!--" ^ node.Dom.data ^ "-->"
153328 | "!doctype" ->
···177352 | None -> Buffer.add_string buf "html");
178353 Buffer.add_char buf '>';
179354 Buffer.contents buf
180180- | _ ->
181181- (* Element *)
182182- let buf = Buffer.create 64 in
183183- Buffer.add_char buf '<';
184184- Buffer.add_string buf node.Dom.name;
185185- List.iter (fun (key, value) ->
186186- Buffer.add_char buf ' ';
187187- Buffer.add_string buf key;
188188- if can_unquote_attr_value value then begin
189189- Buffer.add_char buf '=';
190190- Buffer.add_string buf value
191191- end else begin
192192- let quote = choose_quote value in
193193- Buffer.add_char buf '=';
194194- Buffer.add_char buf quote;
195195- Buffer.add_string buf (escape_attr_value value quote);
196196- Buffer.add_char buf quote
197197- end
198198- ) node.Dom.attrs;
199199- Buffer.add_char buf '>';
200200- Buffer.contents buf
355355+ | _ -> failwith "serialize_node called with element"
356356+357357+let choose_quote value default_quote explicit =
358358+ (* If quote_char was explicitly set, always use it *)
359359+ if explicit then default_quote
360360+ else
361361+ (* Otherwise, if value contains the default quote but not the other, use the other *)
362362+ let has_double = String.contains value '"' in
363363+ let has_single = String.contains value '\'' in
364364+ if has_double && not has_single then '\''
365365+ else if has_single && not has_double then '"'
366366+ else default_quote
367367+368368+(* Serialize an element tag (start tag) *)
369369+let serialize_start_tag opts ~is_empty_tag name attrs =
370370+ let buf = Buffer.create 64 in
371371+ Buffer.add_char buf '<';
372372+ Buffer.add_string buf name;
373373+ (* Sort attributes alphabetically for consistent output *)
374374+ let sorted_attrs = List.sort (fun (a, _) (b, _) -> String.compare a b) attrs in
375375+ List.iter (fun (key, value) ->
376376+ Buffer.add_char buf ' ';
377377+ Buffer.add_string buf key;
378378+ let should_minimize =
379379+ opts.minimize_boolean_attributes &&
380380+ String.lowercase_ascii key = String.lowercase_ascii value
381381+ in
382382+ if should_minimize then
383383+ ()
384384+ else if String.length value = 0 then begin
385385+ Buffer.add_char buf '=';
386386+ Buffer.add_char buf opts.quote_char;
387387+ Buffer.add_char buf opts.quote_char
388388+ end else if can_unquote_attr_value value then begin
389389+ Buffer.add_char buf '=';
390390+ Buffer.add_string buf value
391391+ end else begin
392392+ let quote = choose_quote value opts.quote_char opts.quote_char_explicit in
393393+ Buffer.add_char buf '=';
394394+ Buffer.add_char buf quote;
395395+ Buffer.add_string buf (escape_attr_value value quote opts.escape_lt_in_attrs);
396396+ Buffer.add_char buf quote
397397+ end
398398+ ) sorted_attrs;
399399+ if opts.use_trailing_solidus && (is_empty_tag || is_void_element name) then
400400+ Buffer.add_string buf " /";
401401+ Buffer.add_char buf '>';
402402+ Buffer.contents buf
403403+404404+(* Check if text starts with ASCII whitespace *)
405405+let text_starts_with_space text =
406406+ String.length text > 0 &&
407407+ let c = text.[0] in
408408+ c = '\t' || c = '\n' || c = '\x0C' || c = '\r' || c = ' '
409409+410410+(* Optional tag omission helpers *)
411411+type next_token =
412412+ | NTComment
413413+ | NTSpace (* Text starting with space *)
414414+ | NTText (* Text not starting with space *)
415415+ | NTStartTag of string
416416+ | NTEmptyTag of string
417417+ | NTEndTag of string
418418+ | NTEOF
419419+420420+let classify_next tokens idx =
421421+ if idx >= Array.length tokens then NTEOF
422422+ else match tokens.(idx).token with
423423+ | None -> NTEOF
424424+ | Some (CommentNode _) -> NTComment
425425+ | Some (TextNode text) ->
426426+ if text_starts_with_space text then NTSpace else NTText
427427+ | Some (StartTag (name, _)) -> NTStartTag (String.lowercase_ascii name)
428428+ | Some (EmptyTag (name, _)) -> NTEmptyTag (String.lowercase_ascii name)
429429+ | Some (EndTag name) -> NTEndTag (String.lowercase_ascii name)
430430+ | Some (DoctypeNode _) -> NTEOF (* Treat doctype as if nothing follows *)
431431+432432+(* Should we omit a start tag? *)
433433+let should_omit_start_tag opts name attrs next =
434434+ if not opts.omit_optional_tags then false
435435+ else
436436+ let name = String.lowercase_ascii name in
437437+ match name, next with
438438+ (* html start tag: omit if not followed by comment or space, AND has no attributes *)
439439+ | "html", NTComment -> false
440440+ | "html", NTSpace -> false
441441+ | "html", _ -> attrs = [] (* only omit if no attributes *)
442442+ (* head start tag: omit if followed by element (start/empty tag) *)
443443+ | "head", NTStartTag _ -> true
444444+ | "head", NTEmptyTag _ -> true
445445+ | "head", NTEndTag "head" -> true (* empty head *)
446446+ | "head", NTEOF -> true
447447+ | "head", _ -> false
448448+ (* body start tag: omit if not followed by comment or space, AND has no attributes *)
449449+ | "body", NTComment -> false
450450+ | "body", NTSpace -> false
451451+ | "body", _ -> attrs = [] (* only omit if no attributes *)
452452+ (* colgroup start tag: omit if followed by col element *)
453453+ | "colgroup", NTStartTag "col" -> true
454454+ | "colgroup", NTEmptyTag "col" -> true
455455+ | "colgroup", _ -> false
456456+ (* tbody start tag: omit if first child is tr *)
457457+ | "tbody", NTStartTag "tr" -> true
458458+ | "tbody", _ -> false
459459+ | _ -> false
460460+461461+(* Should we omit an end tag? *)
462462+let should_omit_end_tag opts name next =
463463+ if not opts.omit_optional_tags then false
464464+ else
465465+ let name = String.lowercase_ascii name in
466466+ match name, next with
467467+ (* html end tag: omit if not followed by comment or space *)
468468+ | "html", NTComment -> false
469469+ | "html", NTSpace -> false
470470+ | "html", _ -> true
471471+ (* head end tag: omit if not followed by comment or space *)
472472+ | "head", NTComment -> false
473473+ | "head", NTSpace -> false
474474+ | "head", _ -> true
475475+ (* body end tag: omit if not followed by comment or space *)
476476+ | "body", NTComment -> false
477477+ | "body", NTSpace -> false
478478+ | "body", _ -> true
479479+ (* li end tag: omit if followed by li start tag or parent end tag *)
480480+ | "li", NTStartTag "li" -> true
481481+ | "li", NTEndTag _ -> true
482482+ | "li", NTEOF -> true
483483+ | "li", _ -> false
484484+ (* dt end tag: omit if followed by dt or dd start tag (NOT end tag or EOF!) *)
485485+ | "dt", NTStartTag "dt" -> true
486486+ | "dt", NTStartTag "dd" -> true
487487+ | "dt", _ -> false
488488+ (* dd end tag: omit if followed by dd or dt start tag, or end tag, or EOF *)
489489+ | "dd", NTStartTag "dd" -> true
490490+ | "dd", NTStartTag "dt" -> true
491491+ | "dd", NTEndTag _ -> true
492492+ | "dd", NTEOF -> true
493493+ | "dd", _ -> false
494494+ (* p end tag: omit if followed by block element (start or empty tag), end tag, or EOF *)
495495+ | "p", NTStartTag next_name when is_p_closing_element next_name -> true
496496+ | "p", NTEmptyTag next_name when is_p_closing_element next_name -> true
497497+ | "p", NTEndTag _ -> true
498498+ | "p", NTEOF -> true
499499+ | "p", _ -> false
500500+ (* optgroup end tag: omit if followed by optgroup start tag, end tag, or EOF *)
501501+ | "optgroup", NTStartTag "optgroup" -> true
502502+ | "optgroup", NTEndTag _ -> true
503503+ | "optgroup", NTEOF -> true
504504+ | "optgroup", _ -> false
505505+ (* option end tag: omit if followed by option/optgroup start tag, end tag, or EOF *)
506506+ | "option", NTStartTag "option" -> true
507507+ | "option", NTStartTag "optgroup" -> true
508508+ | "option", NTEndTag _ -> true
509509+ | "option", NTEOF -> true
510510+ | "option", _ -> false
511511+ (* colgroup end tag: omit if not followed by comment, space, or another colgroup *)
512512+ | "colgroup", NTComment -> false
513513+ | "colgroup", NTSpace -> false
514514+ | "colgroup", NTStartTag "colgroup" -> false (* keep end tag when another colgroup follows *)
515515+ | "colgroup", _ -> true
516516+ (* thead end tag: omit if followed by tbody or tfoot start tag *)
517517+ | "thead", NTStartTag "tbody" -> true
518518+ | "thead", NTStartTag "tfoot" -> true
519519+ | "thead", _ -> false
520520+ (* tbody end tag: omit if followed by tbody/tfoot start tag, end tag, or EOF *)
521521+ | "tbody", NTStartTag "tbody" -> true
522522+ | "tbody", NTStartTag "tfoot" -> true
523523+ | "tbody", NTEndTag _ -> true
524524+ | "tbody", NTEOF -> true
525525+ | "tbody", _ -> false
526526+ (* tfoot end tag: omit if followed by tbody start tag, end tag, or EOF *)
527527+ | "tfoot", NTStartTag "tbody" -> true
528528+ | "tfoot", NTEndTag _ -> true
529529+ | "tfoot", NTEOF -> true
530530+ | "tfoot", _ -> false
531531+ (* tr end tag: omit if followed by tr start tag, end tag, or EOF *)
532532+ | "tr", NTStartTag "tr" -> true
533533+ | "tr", NTEndTag _ -> true
534534+ | "tr", NTEOF -> true
535535+ | "tr", _ -> false
536536+ (* td end tag: omit if followed by td/th start tag, end tag, or EOF *)
537537+ | "td", NTStartTag "td" -> true
538538+ | "td", NTStartTag "th" -> true
539539+ | "td", NTEndTag _ -> true
540540+ | "td", NTEOF -> true
541541+ | "td", _ -> false
542542+ (* th end tag: omit if followed by th/td start tag, end tag, or EOF *)
543543+ | "th", NTStartTag "th" -> true
544544+ | "th", NTStartTag "td" -> true
545545+ | "th", NTEndTag _ -> true
546546+ | "th", NTEOF -> true
547547+ | "th", _ -> false
548548+ | _ -> false
201549202550(* Run a single test *)
203551let run_test test =
204552 try
205205- (* Build nodes from input tokens *)
206206- let nodes = List.filter_map build_node_from_token test.input in
553553+ (* Build token infos from input *)
554554+ let token_infos = Array.of_list (List.map build_token_info test.input) in
555555+ let num_tokens = Array.length token_infos in
556556+557557+ (* Handle inject_meta_charset option *)
558558+ let inject_meta = test.options.inject_meta_charset in
559559+ let encoding = test.options.encoding in
560560+561561+ (* Serialize with context tracking *)
562562+ let buf = Buffer.create 256 in
563563+ let in_raw_text = ref false in
564564+ let preserve_whitespace = ref false in
565565+ let element_stack : string list ref = ref [] in
566566+ let in_head = ref false in
567567+ let meta_charset_injected = ref false in
568568+ let prev_was_section_end = ref false in (* Track if prev token was thead/tbody/tfoot end *)
207569208208- (* Serialize *)
209209- let serialized = String.concat "" (List.map serialize_node nodes) in
570570+ for i = 0 to num_tokens - 1 do
571571+ let info = token_infos.(i) in
572572+ let next = classify_next token_infos (i + 1) in
573573+574574+ match info.token with
575575+ | None -> ()
576576+577577+ | Some (StartTag (name, attrs)) ->
578578+ let name_lower = String.lowercase_ascii name in
579579+580580+ (* Track head element *)
581581+ if name_lower = "head" then in_head := true;
582582+583583+ (* For inject_meta_charset, we need to check if there's any charset meta coming up *)
584584+ (* If yes, don't inject at head start; if no, inject at head start *)
585585+ let should_inject_at_head =
586586+ if not inject_meta || name_lower <> "head" then false
587587+ else match encoding with
588588+ | None -> false
589589+ | Some _ ->
590590+ (* Look ahead to see if there's a charset meta or http-equiv content-type *)
591591+ let has_charset_meta = ref false in
592592+ for j = i + 1 to num_tokens - 1 do
593593+ match token_infos.(j).token with
594594+ | Some (EmptyTag (n, a)) when String.lowercase_ascii n = "meta" ->
595595+ let has_charset = List.exists (fun (k, _) -> String.lowercase_ascii k = "charset") a in
596596+ let has_http_equiv_ct = List.exists (fun (k, v) ->
597597+ String.lowercase_ascii k = "http-equiv" && String.lowercase_ascii v = "content-type") a in
598598+ if has_charset || has_http_equiv_ct then has_charset_meta := true
599599+ | Some (EndTag n) when String.lowercase_ascii n = "head" -> ()
600600+ | _ -> ()
601601+ done;
602602+ not !has_charset_meta
603603+ in
604604+605605+ (* Special case: tbody start tag cannot be omitted if preceded by section end tag *)
606606+ let can_omit_start =
607607+ if name_lower = "tbody" && !prev_was_section_end then false
608608+ else should_omit_start_tag test.options name attrs next
609609+ in
610610+ prev_was_section_end := false; (* Reset for next iteration *)
611611+612612+ if should_inject_at_head then begin
613613+ meta_charset_injected := true;
614614+ (* Don't output head start tag if we should omit it *)
615615+ if not can_omit_start then
616616+ Buffer.add_string buf (serialize_start_tag test.options ~is_empty_tag:false name attrs);
617617+ Buffer.add_string buf (Printf.sprintf "<meta charset=%s>" (Option.get encoding));
618618+ element_stack := name :: !element_stack;
619619+ if is_raw_text_element name then in_raw_text := true;
620620+ if is_whitespace_preserving_element name then preserve_whitespace := true
621621+ end else if not can_omit_start then begin
622622+ Buffer.add_string buf (serialize_start_tag test.options ~is_empty_tag:false name attrs);
623623+ element_stack := name :: !element_stack;
624624+ if is_raw_text_element name then in_raw_text := true;
625625+ if is_whitespace_preserving_element name then preserve_whitespace := true
626626+ end else begin
627627+ element_stack := name :: !element_stack;
628628+ if is_raw_text_element name then in_raw_text := true;
629629+ if is_whitespace_preserving_element name then preserve_whitespace := true
630630+ end
631631+632632+ | Some (EmptyTag (name, attrs)) ->
633633+ let name_lower = String.lowercase_ascii name in
634634+ prev_was_section_end := false; (* Reset for next iteration *)
635635+636636+ (* Handle meta charset replacement *)
637637+ if inject_meta && !in_head && name_lower = "meta" then begin
638638+ let has_charset = List.exists (fun (k, _) -> String.lowercase_ascii k = "charset") attrs in
639639+ let has_http_equiv_ct =
640640+ List.exists (fun (k, v) ->
641641+ String.lowercase_ascii k = "http-equiv" &&
642642+ String.lowercase_ascii v = "content-type"
643643+ ) attrs
644644+ in
645645+ if has_charset then begin
646646+ (* Replace charset value *)
647647+ match encoding with
648648+ | Some enc ->
649649+ Buffer.add_string buf (Printf.sprintf "<meta charset=%s>" enc)
650650+ | None -> () (* No encoding, skip the meta tag *)
651651+ end else if has_http_equiv_ct then begin
652652+ (* Replace charset in content value *)
653653+ match encoding with
654654+ | Some enc ->
655655+ let new_attrs = List.map (fun (k, v) ->
656656+ if String.lowercase_ascii k = "content" then
657657+ let new_content = Printf.sprintf "text/html; charset=%s" enc in
658658+ (k, new_content)
659659+ else (k, v)
660660+ ) attrs in
661661+ Buffer.add_string buf (serialize_start_tag test.options ~is_empty_tag:true name new_attrs)
662662+ | None ->
663663+ Buffer.add_string buf (serialize_start_tag test.options ~is_empty_tag:true name attrs)
664664+ end else begin
665665+ (* Regular meta tag, output as normal *)
666666+ Buffer.add_string buf (serialize_start_tag test.options ~is_empty_tag:true name attrs)
667667+ end
668668+ end else
669669+ Buffer.add_string buf (serialize_start_tag test.options ~is_empty_tag:true name attrs)
670670+671671+ | Some (EndTag name) ->
672672+ let name_lower = String.lowercase_ascii name in
673673+674674+ (* Track head element *)
675675+ if name_lower = "head" then in_head := false;
676676+677677+ (* Pop from element stack *)
678678+ (match !element_stack with
679679+ | top :: rest when String.lowercase_ascii top = name_lower ->
680680+ element_stack := rest;
681681+ if is_raw_text_element name then in_raw_text := false;
682682+ if is_whitespace_preserving_element name then preserve_whitespace := false
683683+ | _ -> ());
684684+685685+ let is_section_end = List.mem name_lower ["thead"; "tbody"; "tfoot"] in
686686+ let omit = should_omit_end_tag test.options name next in
687687+688688+ if not omit then begin
689689+ Buffer.add_string buf "</";
690690+ Buffer.add_string buf name;
691691+ Buffer.add_char buf '>'
692692+ end;
693693+694694+ (* Track if we omitted a section end tag - next tbody can't be omitted *)
695695+ prev_was_section_end := is_section_end && omit
696696+697697+ | Some (TextNode text) ->
698698+ prev_was_section_end := false;
699699+ let processed_text =
700700+ if !in_raw_text && not test.options.escape_rcdata then
701701+ text
702702+ else if test.options.strip_whitespace && not !preserve_whitespace then
703703+ escape_text (collapse_whitespace text)
704704+ else
705705+ escape_text text
706706+ in
707707+ Buffer.add_string buf processed_text
708708+709709+ | Some (CommentNode text) ->
710710+ prev_was_section_end := false;
711711+ Buffer.add_string buf "<!--";
712712+ Buffer.add_string buf text;
713713+ Buffer.add_string buf "-->"
714714+715715+ | Some (DoctypeNode node) ->
716716+ prev_was_section_end := false;
717717+ Buffer.add_string buf (serialize_node test.options ~in_raw_text:false node)
718718+ done;
719719+720720+ let serialized = Buffer.contents buf in
210721211722 (* Check if it matches any expected output *)
212723 let matches = List.exists (fun exp -> serialized = exp) test.expected in
+18-17
test/test_tokenizer.ml
···3636 initial_states : string list;
3737 last_start_tag : string option;
3838 double_escaped : bool;
3939+ xml_mode : bool;
3940}
40414142(* Unescape double-escaped strings from tests *)
···118119 }
119120120121(* Parse a single test case from JSON *)
121121-let parse_test_case json =
122122+let parse_test_case ~xml_mode json =
122123 let obj = json_object json in
123124 let description = json_string (json_mem_exn "description" obj) in
124125 let input = json_string (json_mem_exn "input" obj) in
···139140 | Some b -> json_bool b
140141 | None -> false
141142 in
142142- { description; input; output; errors; initial_states; last_start_tag; double_escaped }
143143+ { description; input; output; errors; initial_states; last_start_tag; double_escaped; xml_mode }
143144144145(* Convert state name to State.t *)
145146let state_of_string = function
···222223 let input = if test.double_escaped then unescape_double test.input else test.input in
223224224225 let collector = TokenCollector.create () in
225225- let tokenizer = Tokenizer.create (module TokenCollector) collector ~collect_errors:true () in
226226+ let tokenizer = Tokenizer.create (module TokenCollector) collector ~collect_errors:true ~xml_mode:test.xml_mode () in
226227227228 (* Set initial state *)
228229 Tokenizer.set_state tokenizer initial_state;
···305306306307 let obj = json_object json in
307308308308- (* Handle both {"tests": [...]} and {"xmlViolationTests": [...], "tests": [...]} formats *)
309309- let test_arrays =
310310- let tests = match json_mem "tests" obj with
311311- | Some t -> json_array t
312312- | None -> []
313313- in
314314- let xml_tests = match json_mem "xmlViolationTests" obj with
315315- | Some t -> json_array t
316316- | None -> []
317317- in
318318- tests @ xml_tests
309309+ (* Handle both {"tests": [...]} and {"xmlViolationTests": [...]} formats *)
310310+ let regular_tests =
311311+ match json_mem "tests" obj with
312312+ | Some t -> List.map (parse_test_case ~xml_mode:false) (json_array t)
313313+ | None -> []
314314+ in
315315+ let xml_tests =
316316+ match json_mem "xmlViolationTests" obj with
317317+ | Some t -> List.map (parse_test_case ~xml_mode:true) (json_array t)
318318+ | None -> []
319319 in
320320+ let all_tests = regular_tests @ xml_tests in
320321321322 let filename = Filename.basename path in
322323 let passed = ref 0 in
323324 let failed = ref 0 in
324325 let first_failures = ref [] in
325326326326- List.iteri (fun i test_json ->
327327- let test = parse_test_case test_json in
327327+ List.iteri (fun i test ->
328328+ (* test is already parsed *)
328329329330 (* Run for each initial state *)
330331 List.iter (fun state_name ->
···345346 first_failures := (i + 1, test.description, state_name, [], [], [], []) :: !first_failures;
346347 Printf.eprintf "Exception in test %d (%s): %s\n" (i + 1) test.description (Printexc.to_string e)
347348 ) test.initial_states
348348- ) test_arrays;
349349+ ) all_tests;
349350350351 (!passed, !failed, List.rev !first_failures, filename)
351352