···2312231223132313and process_foreign_content t token =
23142314 match token with
23152315- | Token.Character "\x00" ->
23152315+ | Token.Character data when String.contains data '\x00' ->
23162316+ (* Replace NUL characters with U+FFFD replacement character *)
23162317 parse_error t "unexpected-null-character";
23172317- insert_character t "\xEF\xBF\xBD"
23182318+ let buf = Buffer.create (String.length data) in
23192319+ let has_non_ws_non_nul = ref false in
23202320+ String.iter (fun c ->
23212321+ if c = '\x00' then Buffer.add_string buf "\xEF\xBF\xBD"
23222322+ else begin
23232323+ Buffer.add_char buf c;
23242324+ if not (c = ' ' || c = '\t' || c = '\n' || c = '\x0C' || c = '\r') then
23252325+ has_non_ws_non_nul := true
23262326+ end
23272327+ ) data;
23282328+ let replaced = Buffer.contents buf in
23292329+ insert_character t replaced;
23302330+ (* Only set frameset_ok to false if there's actual non-whitespace non-NUL content *)
23312331+ if !has_non_ws_non_nul then t.frameset_ok <- false
23182332 | Token.Character data when is_whitespace data ->
23192333 insert_character t data
23202334 | Token.Character data ->
+13-3
lib/tokenizer/stream.ml
···2020 mutable column : int;
2121 (* Track if we just saw CR (for CR/LF normalization) *)
2222 mutable last_was_cr : bool;
2323+ (* Track if we need to skip the next LF from raw stream (set after peek of CR) *)
2424+ mutable skip_next_lf : bool;
2325}
24262527(* Create a stream from a Bytes.Reader.t *)
···3335 line = 1;
3436 column = 0;
3537 last_was_cr = false;
3838+ skip_next_lf = false;
3639 }
37403841(* Create a stream from a string - discouraged, prefer create_from_reader *)
···8386 None
8487 | Some '\r' ->
8588 t.last_was_cr <- true;
8686- Some '\n' (* CR becomes LF *)
8989+ (* Immediately consume following LF if present (CRLF -> single LF) *)
9090+ (match read_raw_char t with
9191+ | Some '\n' -> () (* Consume the LF that follows CR *)
9292+ | Some c -> push_back_char t c (* Put non-LF char back *)
9393+ | None -> ());
9494+ Some '\n' (* CR (or CRLF) becomes single LF *)
8795 | Some '\n' when t.last_was_cr ->
8896 (* Skip LF after CR - it was already converted *)
8997 t.last_was_cr <- false;
···102110 Bytes.Slice.is_eod next)))
(* [peek t] returns the next normalized character of [t] without consuming it,
   or [None] when [read_normalized_char] yields nothing.

   The character is obtained with [read_normalized_char] and handed straight
   back with [push_back_char]. Because reading may change [t.last_was_cr], the
   flag is captured first and written back after the push-back, so a peek
   leaves the CR/LF tracking flag as it found it. When nothing is read, nothing
   is pushed back and the flag is left as the read left it. *)
let peek t =
  let cr_flag_before = t.last_was_cr in
  let next = read_normalized_char t in
  (match next with
   | Some ch ->
     push_back_char t ch;
     (* Undo whatever the read did to the CR tracking flag. *)
     t.last_was_cr <- cr_flag_before
   | None -> ());
  next
112122113123(* Read n characters into a list, returns (chars_read, all_read_successfully) *)
+62-23
lib/tokenizer/tokenizer.ml
···150150 let emit_current_tag () =
151151 finish_attribute t;
152152 let name = Buffer.contents t.current_tag_name in
153153+ let attrs = List.rev t.current_attrs in
154154+ (* Check for end tag with attributes or self-closing flag *)
155155+ if t.current_tag_kind = Token.End then begin
156156+ if attrs <> [] then
157157+ error t "end-tag-with-attributes";
158158+ if t.current_tag_self_closing then
159159+ error t "end-tag-with-trailing-solidus"
160160+ end;
153161 let tag = {
154162 Token.kind = t.current_tag_kind;
155163 name;
156156- attrs = List.rev t.current_attrs;
164164+ attrs;
157165 self_closing = t.current_tag_self_closing;
158166 } in
159167 if t.current_tag_kind = Token.Start then
(* Emit the text accumulated in [t.current_comment] as a [Token.Comment]. *)
let emit_current_comment () =
  let text = Buffer.contents t.current_comment in
  emit (Token.Comment text)
in
(* Report "control-character-in-input-stream" for control code points other
   than the permitted whitespace controls: tab (U+0009), LF (U+000A),
   FF (U+000C) and CR (U+000D). NUL is not reported here.

   [c] is a single byte of the input, so only U+0001-U+001F and U+007F can be
   recognised from one byte alone. The C1 controls U+0080-U+009F are encoded
   in UTF-8 as the lead byte 0xC2 followed by 0x80-0x9F, while bytes 0x80-0x9F
   are also continuation bytes of unrelated characters (for example U+2014 is
   E2 80 94). A byte in 0x80-0x9F is therefore reported only when the
   previously checked byte was 0xC2.

   NOTE(review): this assumes the bytes of one character reach this function
   consecutively and that this closure persists across those calls - confirm
   against the enclosing function. If the state is lost, the only effect is a
   missed C1 report, never a spurious one. *)
let check_control_char =
  (* True when the last byte passed to this function was the 0xC2 lead. *)
  let after_c2_lead = ref false in
  fun c ->
    let code = Char.code c in
    let is_c0_or_del =
      (code >= 0x01 && code <= 0x08)
      || code = 0x0B
      || (code >= 0x0E && code <= 0x1F)
      || code = 0x7F
    in
    let is_c1 = !after_c2_lead && code >= 0x80 && code <= 0x9F in
    (* Remember the lead byte for the next call before reporting. *)
    after_c2_lead := (code = 0xC2);
    if is_c0_or_del || is_c1 then
      error t "control-character-in-input-stream"
in
(* Run the control-character check on [c], then emit it with [emit_char]. *)
let emit_char_checked c =
  let () = check_control_char c in
  emit_char t c
in
177203178204 let rec process_state () =
···374400 flush_code_points_consumed_as_char_ref t;
375401 t.state <- t.return_state;
376402 handle_eof ()
377377- | State.Named_character_reference
378378- | State.Numeric_character_reference
403403+ | State.Named_character_reference ->
404404+ flush_code_points_consumed_as_char_ref t;
405405+ t.state <- t.return_state;
406406+ handle_eof ()
407407+ | State.Numeric_character_reference ->
408408+ (* At EOF with just "&#" - no digits follow *)
409409+ error t "absence-of-digits-in-numeric-character-reference";
410410+ flush_code_points_consumed_as_char_ref t;
411411+ t.state <- t.return_state;
412412+ handle_eof ()
379413 | State.Hexadecimal_character_reference_start
380380- | State.Decimal_character_reference_start
381381- | State.Numeric_character_reference_end ->
414414+ | State.Decimal_character_reference_start ->
415415+ error t "absence-of-digits-in-numeric-character-reference";
382416 flush_code_points_consumed_as_char_ref t;
383417 t.state <- t.return_state;
418418+ handle_eof ()
419419+ | State.Numeric_character_reference_end ->
420420+ (* We have collected digits, just need to finalize the character reference *)
421421+ step ();
384422 handle_eof ()
385423 | State.Ambiguous_ampersand ->
386424 (* Buffer was already flushed when entering this state, just transition *)
···508546 error t "unexpected-null-character";
509547 ignore (S.process t.sink (Token.Character "\x00"))
510548 | Some c ->
511511- emit_char t c
549549+ emit_char_checked c
512550 | None -> ()
513551514552 and state_rcdata () =
···522560 error t "unexpected-null-character";
523561 emit_str t "\xEF\xBF\xBD"
524562 | Some c ->
525525- emit_char t c
563563+ emit_char_checked c
526564 | None -> ()
527565528566 and state_rawtext () =
···533571 error t "unexpected-null-character";
534572 emit_str t "\xEF\xBF\xBD"
535573 | Some c ->
536536- emit_char t c
574574+ emit_char_checked c
537575 | None -> ()
538576539577 and state_script_data () =
···544582 error t "unexpected-null-character";
545583 emit_str t "\xEF\xBF\xBD"
546584 | Some c ->
547547- emit_char t c
585585+ emit_char_checked c
548586 | None -> ()
549587550588 and state_plaintext () =
···553591 error t "unexpected-null-character";
554592 emit_str t "\xEF\xBF\xBD"
555593 | Some c ->
556556- emit_char t c
594594+ emit_char_checked c
557595 | None -> ()
558596559597 and state_tag_open () =
···765803 error t "unexpected-null-character";
766804 emit_str t "\xEF\xBF\xBD"
767805 | Some c ->
768768- emit_char t c
806806+ emit_char_checked c
769807 | None -> ()
770808771809 and state_script_data_escaped_dash () =
···781819 emit_str t "\xEF\xBF\xBD"
782820 | Some c ->
783821 t.state <- State.Script_data_escaped;
784784- emit_char t c
822822+ emit_char_checked c
785823 | None -> ()
786824787825 and state_script_data_escaped_dash_dash () =
···799837 emit_str t "\xEF\xBF\xBD"
800838 | Some c ->
801839 t.state <- State.Script_data_escaped;
802802- emit_char t c
840840+ emit_char_checked c
803841 | None -> ()
804842805843 and state_script_data_escaped_less_than_sign () =
···875913 error t "unexpected-null-character";
876914 emit_str t "\xEF\xBF\xBD"
877915 | Some c ->
878878- emit_char t c
916916+ emit_char_checked c
879917 | None -> ()
880918881919 and state_script_data_double_escaped_dash () =
···892930 emit_str t "\xEF\xBF\xBD"
893931 | Some c ->
894932 t.state <- State.Script_data_double_escaped;
895895- emit_char t c
933933+ emit_char_checked c
896934 | None -> ()
897935898936 and state_script_data_double_escaped_dash_dash () =
···911949 emit_str t "\xEF\xBF\xBD"
912950 | Some c ->
913951 t.state <- State.Script_data_double_escaped;
914914- emit_char t c
952952+ emit_char_checked c
915953 | None -> ()
916954917955 and state_script_data_double_escaped_less_than_sign () =
···15701608 match Stream.consume t.stream with
15711609 | Some ']' ->
15721610 t.state <- State.Cdata_section_bracket
15731573- | Some '\x00' ->
15741574- error t "unexpected-null-character";
15751575- emit_str t "\xEF\xBF\xBD"
15761611 | Some c ->
16121612+ (* CDATA section emits all characters as-is, including NUL *)
15771613 emit_char t c
15781614 | None -> ()
15791615···17031739 t.state <- t.return_state
17041740 end
17051741 | None ->
17061706- (* No match - check if we should report ambiguous ampersand *)
17421742+ (* No match - check if we should report unknown-named-character-reference *)
17071743 if String.length entity_name > 0 then begin
17081708- t.state <- State.Ambiguous_ampersand;
17091709- (* Reset position - we need to emit the ampersand and chars *)
17101710- flush_code_points_consumed_as_char_ref t
17441744+ (* If we have a semicolon, it's definitely an unknown named character reference *)
17451745+ if has_semicolon then
17461746+ error t "unknown-named-character-reference";
17471747+ (* Emit all the chars we consumed *)
17481748+ flush_code_points_consumed_as_char_ref t;
17491749+ t.state <- t.return_state
17111750 end else begin
17121751 flush_code_points_consumed_as_char_ref t;
17131752 t.state <- t.return_state