Pure OCaml YAML 1.2 reader and writer using Bytesrw

Fix crash on invalid Unicode escape sequences in double-quoted strings

The decode_hex function didn't check that the Unicode codepoints produced by
\u and \U escapes are valid before encoding them to UTF-8. Invalid codepoints
(> 0x10FFFF or surrogates 0xD800-0xDFFF) caused Char.chr to raise an uncaught
Invalid_argument exception instead of a proper Yamlrw_error.

Use Uchar.is_valid to validate codepoints, which correctly rejects both
out-of-range values and surrogate codepoints.

Found by AFL fuzzing.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

+41 -1
+3
lib/scanner.ml
··· 534 534 | _ -> Error.raise_at start (Invalid_hex_escape (Buffer.contents buf)) 535 535 done; 536 536 let code = int_of_string ("0x" ^ Buffer.contents buf) in 537 + (* Validate Unicode scalar value (0x0000-0x10FFFF, excluding surrogates) *) 538 + if not (Uchar.is_valid code) then 539 + Error.raise_at start (Invalid_unicode_escape (Buffer.contents buf)); 537 540 if code <= 0x7F then String.make 1 (Char.chr code) 538 541 else if code <= 0x7FF then 539 542 let b1 = 0xC0 lor (code lsr 6) in
+38 -1
tests/test_yamlrw.ml
··· 348 348 Alcotest.fail "expected error" 349 349 with Yamlrw_error e -> Alcotest.(check bool) "has span" true (e.span <> None) 350 350 351 - let error_tests = [ ("error position", `Quick, test_error_position) ] 351 + let test_invalid_unicode_escape () = 352 + (* Unicode scalar values must be 0x0000-0x10FFFF, excluding surrogates *) 353 + (* Test \U with value > 0x10FFFF (maximum valid Unicode codepoint) *) 354 + (try 355 + let _ = of_string "\"\\U88888888\"" in 356 + Alcotest.fail "expected Invalid_unicode_escape error for out-of-range" 357 + with Yamlrw_error e -> ( 358 + match e.Error.kind with 359 + | Error.Invalid_unicode_escape _ -> () 360 + | _ -> 361 + Alcotest.fail 362 + ("expected Invalid_unicode_escape error, got: " 363 + ^ Error.kind_to_string e.Error.kind))); 364 + (* Test \u with surrogate codepoint (should error) *) 365 + (try 366 + let _ = of_string "\"\\uD800\"" in 367 + Alcotest.fail "expected Invalid_unicode_escape error for surrogate" 368 + with Yamlrw_error e -> ( 369 + match e.Error.kind with 370 + | Error.Invalid_unicode_escape _ -> () 371 + | _ -> 372 + Alcotest.fail 373 + ("expected Invalid_unicode_escape error, got: " 374 + ^ Error.kind_to_string e.Error.kind))); 375 + (* Test \u with valid value (should work) *) 376 + let v = of_string "\"\\u0041\"" in 377 + Alcotest.(check string) "valid \\u escape" "A" (Value.to_string v); 378 + (* Test \U with valid value at max boundary (should work) *) 379 + let v2 = of_string "\"\\U0010FFFF\"" in 380 + Alcotest.(check bool) 381 + "valid \\U at max boundary" true 382 + (String.length (Value.to_string v2) > 0) 383 + 384 + let error_tests = 385 + [ 386 + ("error position", `Quick, test_error_position); 387 + ("invalid unicode escape", `Quick, test_invalid_unicode_escape); 388 + ] 352 389 353 390 (** Alias expansion limit tests (billion laughs protection) *) 354 391