OCaml HTML5 parser/serialiser based on Python's JustHTML

fixes

+25 -77
+12 -45
test/test_serializer.ml
··· 180 180 181 181 type token_info = { 182 182 token : token_type option; 183 - node : Dom.node option; (* Legacy for compatibility *) 184 - tag_name : string option; 185 - is_empty_tag : bool; 186 183 } 187 184 188 185 let build_token_info token = 189 186 let arr = json_array token in 190 187 match arr with 191 - | [] -> { token = None; node = None; tag_name = None; is_empty_tag = false } 188 + | [] -> { token = None } 192 189 | type_json :: rest -> 193 190 let token_type_str = json_string type_json in 194 191 match token_type_str, rest with 195 192 | "StartTag", [_ns_json; name_json; attrs_json] -> 196 193 let name = json_string name_json in 197 194 let attrs = parse_attrs attrs_json in 198 - { token = Some (StartTag (name, attrs)); 199 - node = Some (Dom.create_element name ~attrs ()); 200 - tag_name = Some name; 201 - is_empty_tag = false } 195 + { token = Some (StartTag (name, attrs)) } 202 196 203 197 | "StartTag", [name_json; attrs_json] -> 204 198 let name = json_string name_json in 205 199 let attrs = parse_attrs attrs_json in 206 - { token = Some (StartTag (name, attrs)); 207 - node = Some (Dom.create_element name ~attrs ()); 208 - tag_name = Some name; 209 - is_empty_tag = false } 200 + { token = Some (StartTag (name, attrs)) } 210 201 211 202 | "EmptyTag", [name_json; attrs_json] -> 212 203 let name = json_string name_json in 213 204 let attrs = parse_attrs attrs_json in 214 - { token = Some (EmptyTag (name, attrs)); 215 - node = Some (Dom.create_element name ~attrs ()); 216 - tag_name = Some name; 217 - is_empty_tag = true } 205 + { token = Some (EmptyTag (name, attrs)) } 218 206 219 207 | "EndTag", [_ns_json; name_json] -> 220 208 let name = json_string name_json in 221 - { token = Some (EndTag name); 222 - node = None; 223 - tag_name = Some name; 224 - is_empty_tag = false } 209 + { token = Some (EndTag name) } 225 210 226 211 | "EndTag", [name_json] -> 227 212 let name = json_string name_json in 228 - { token = Some (EndTag name); 229 - node = None; 230 - tag_name = Some name; 231 - is_empty_tag = false } 213 + { token = Some (EndTag name) } 232 214 233 215 | "Characters", [text_json] -> 234 216 let text = json_string text_json in 235 - { token = Some (TextNode text); 236 - node = Some (Dom.create_text text); 237 - tag_name = None; 238 - is_empty_tag = false } 217 + { token = Some (TextNode text) } 239 218 240 219 | "Comment", [text_json] -> 241 220 let text = json_string text_json in 242 - { token = Some (CommentNode text); 243 - node = Some (Dom.create_comment text); 244 - tag_name = None; 245 - is_empty_tag = false } 221 + { token = Some (CommentNode text) } 246 222 247 223 | "Doctype", [name_json] -> 248 224 let name = json_string name_json in 249 225 let node = Dom.create_doctype ~name () in 250 - { token = Some (DoctypeNode node); 251 - node = Some node; 252 - tag_name = None; 253 - is_empty_tag = false } 226 + { token = Some (DoctypeNode node) } 254 227 255 228 | "Doctype", [name_json; public_json] -> 256 229 let name = json_string name_json in ··· 259 232 | Some pub -> Dom.create_doctype ~name ~public_id:pub () 260 233 | None -> Dom.create_doctype ~name () 261 234 in 262 - { token = Some (DoctypeNode node); 263 - node = Some node; 264 - tag_name = None; 265 - is_empty_tag = false } 235 + { token = Some (DoctypeNode node) } 266 236 267 237 | "Doctype", [name_json; public_json; system_json] -> 268 238 let name = json_string name_json in ··· 274 244 | None, Some sys -> Dom.create_doctype ~name ~system_id:sys () 275 245 | None, None -> Dom.create_doctype ~name () 276 246 in 277 - { token = Some (DoctypeNode node); 278 - node = Some node; 279 - tag_name = None; 280 - is_empty_tag = false } 247 + { token = Some (DoctypeNode node) } 281 248 282 - | _ -> { token = None; node = None; tag_name = None; is_empty_tag = false } 249 + | _ -> { token = None } 283 250 284 251 (* Serialize a single node to HTML with options *) 285 252 let escape_text text =
+13 -32
test/test_tokenizer.ml
··· 22 22 end 23 23 24 24 (* Test case representation *) 25 - type test_error = { 26 - code : string; 27 - line : int; 28 - col : int; 29 - } 30 - 31 25 type test_case = { 32 26 description : string; 33 27 input : string; 34 28 output : Jsont.json list; 35 - errors : test_error list; 29 + expected_error_count : int; 36 30 initial_states : string list; 37 31 last_start_tag : string option; 38 32 double_escaped : bool; ··· 87 81 | Jsont.Bool (b, _) -> b 88 82 | _ -> failwith "Expected bool" 89 83 90 - let json_int = function 91 - | Jsont.Number (n, _) -> int_of_float n 92 - | _ -> failwith "Expected number" 93 - 94 84 let json_array = function 95 85 | Jsont.Array (arr, _) -> arr 96 86 | _ -> failwith "Expected array" ··· 109 99 | Some v -> v 110 100 | None -> failwith ("Missing member: " ^ name) 111 101 112 - (* Parse test error from JSON *) 113 - let parse_test_error json = 114 - let obj = json_object json in 115 - { 116 - code = json_string (json_mem_exn "code" obj); 117 - line = json_int (json_mem_exn "line" obj); 118 - col = json_int (json_mem_exn "col" obj); 119 - } 120 - 121 102 (* Parse a single test case from JSON *) 122 103 let parse_test_case ~xml_mode json = 123 104 let obj = json_object json in 124 105 let description = json_string (json_mem_exn "description" obj) in 125 106 let input = json_string (json_mem_exn "input" obj) in 126 107 let output = json_array (json_mem_exn "output" obj) in 127 - let errors = match json_mem "errors" obj with 128 - | Some e -> List.map parse_test_error (json_array e) 129 - | None -> [] 108 + let expected_error_count = match json_mem "errors" obj with 109 + | Some e -> List.length (json_array e) 110 + | None -> 0 130 111 in 131 112 let initial_states = match json_mem "initialStates" obj with 132 113 | Some s -> List.map json_string (json_array s) ··· 140 121 | Some b -> json_bool b 141 122 | None -> false 142 123 in 143 - { description; input; output; errors; initial_states; last_start_tag; double_escaped; xml_mode } 124 + { description; input; output; expected_error_count; initial_states; last_start_tag; double_escaped; xml_mode } 144 125 145 126 (* Convert state name to State.t *) 146 127 let state_of_string = function ··· 272 253 List.for_all2 json_equal actual_tokens expected 273 254 in 274 255 275 - let actual_errors = Tokenizer.get_errors tokenizer in 276 - let errors_count_match = List.length actual_errors = List.length test.errors in 256 + let actual_error_count = List.length (Tokenizer.get_errors tokenizer) in 257 + let errors_count_match = actual_error_count = test.expected_error_count in 277 258 278 - (tokens_match && errors_count_match, actual_tokens, expected, actual_errors, test.errors) 259 + (tokens_match && errors_count_match, actual_tokens, expected, actual_error_count, test.expected_error_count) 279 260 280 261 (* Format JSON for display *) 281 262 let rec json_to_string = function ··· 331 312 List.iter (fun state_name -> 332 313 try 333 314 let state = state_of_string state_name in 334 - let (success, actual, expected, actual_errors, expected_errors) = run_test test state in 315 + let (success, actual, expected, actual_err_count, expected_err_count) = run_test test state in 335 316 336 317 if success then 337 318 incr passed 338 319 else begin 339 320 incr failed; 340 321 if List.length !first_failures < 3 then 341 - first_failures := (i + 1, test.description, state_name, actual, expected, actual_errors, expected_errors) :: !first_failures 322 + first_failures := (i + 1, test.description, state_name, actual, expected, actual_err_count, expected_err_count) :: !first_failures 342 323 end 343 324 with e -> 344 325 incr failed; 345 326 if List.length !first_failures < 3 then 346 - first_failures := (i + 1, test.description, state_name, [], [], [], []) :: !first_failures; 327 + first_failures := (i + 1, test.description, state_name, [], [], 0, 0) :: !first_failures; 347 328 Printf.eprintf "Exception in test %d (%s): %s\n" (i + 1) test.description (Printexc.to_string e) 348 329 ) test.initial_states 349 330 ) all_tests; ··· 375 356 if !all_failures <> [] then begin 376 357 Printf.printf "\n=== First failures ===\n"; 377 358 List.iter (fun (filename, failures) -> 378 - List.iter (fun (test_num, desc, state, actual, expected, actual_errs, expected_errs) -> 359 + List.iter (fun (test_num, desc, state, actual, expected, actual_err_count, expected_err_count) -> 379 360 Printf.printf "\n--- %s test %d (%s) in %s ---\n" filename test_num state desc; 380 361 Printf.printf "Expected tokens: [%s]\n" (String.concat "; " (List.map json_to_string expected)); 381 362 Printf.printf "Actual tokens: [%s]\n" (String.concat "; " (List.map json_to_string actual)); 382 - Printf.printf "Expected %d errors, got %d\n" (List.length expected_errs) (List.length actual_errs) 363 + Printf.printf "Expected %d errors, got %d\n" expected_err_count actual_err_count 383 364 ) failures 384 365 ) (List.rev !all_failures) 385 366 end;