OCaml implementation of the Mozilla Public Suffix service

merge

+366 -366
+30 -28
bin/main.ml
··· 6 6 open Cmdliner 7 7 8 8 let psl = lazy (Publicsuffix.create ()) 9 - 10 9 let psl () = Lazy.force psl 11 10 12 11 (* Helper functions for printing results *) 13 12 14 - let print_error e = 15 - Printf.printf "ERROR: %s\n" (Publicsuffix.error_to_string e) 16 - 17 - let print_result = function 18 - | Ok s -> print_endline s 19 - | Error e -> print_error e 13 + let print_error e = Printf.printf "ERROR: %s\n" (Publicsuffix.error_to_string e) 14 + let print_result = function Ok s -> print_endline s | Error e -> print_error e 20 15 21 16 let print_bool_result = function 22 17 | Ok b -> print_endline (string_of_bool b) ··· 24 19 25 20 let print_result_with_section = function 26 21 | Ok (s, sec) -> 27 - let sec_str = match sec with 22 + let sec_str = 23 + match sec with 28 24 | Publicsuffix.ICANN -> "ICANN" 29 25 | Publicsuffix.Private -> "PRIVATE" 30 26 in ··· 59 55 let doc = "Check if a domain is a registrable domain" in 60 56 let info = Cmd.info "is_registrable" ~doc in 61 57 let term = 62 - Term.(const print_bool_result $ Publicsuffix_cmd.is_registrable_term (psl ())) 58 + Term.( 59 + const print_bool_result $ Publicsuffix_cmd.is_registrable_term (psl ())) 63 60 in 64 61 Cmd.v info term 65 62 ··· 67 64 let doc = "Get the registrable domain with section information" in 68 65 let info = Cmd.info "registrable_section" ~doc in 69 66 let term = 70 - Term.(const print_result_with_section 67 + Term.( 68 + const print_result_with_section 71 69 $ Publicsuffix_cmd.registrable_section_term (psl ())) 72 70 in 73 71 Cmd.v info term ··· 76 74 let doc = "Get the public suffix with section information" in 77 75 let info = Cmd.info "suffix_section" ~doc in 78 76 let term = 79 - Term.(const print_result_with_section 77 + Term.( 78 + const print_result_with_section 80 79 $ Publicsuffix_cmd.suffix_section_term (psl ())) 81 80 in 82 81 Cmd.v info term ··· 85 84 let doc = "Print statistics about the Public Suffix List" in 86 85 let info = Cmd.info "stats" ~doc in 87 86 let term = 88 - Term.(const (fun (total, icann, private_rules) -> 89 - Printf.printf "Total rules: %d\n" total; 90 - Printf.printf "ICANN rules: %d\n" icann; 91 - Printf.printf "Private rules: %d\n" private_rules) 87 + Term.( 88 + const (fun (total, icann, private_rules) -> 89 + Printf.printf "Total rules: %d\n" total; 90 + Printf.printf "ICANN rules: %d\n" icann; 91 + Printf.printf "Private rules: %d\n" private_rules) 92 92 $ Publicsuffix_cmd.stats_term (psl ())) 93 93 in 94 94 Cmd.v info term ··· 97 97 let doc = "Print version information about the Public Suffix List data" in 98 98 let info = Cmd.info "version" ~doc in 99 99 let term = 100 - Term.(const (fun (version, commit) -> 101 - Printf.printf "Version: %s\n" version; 102 - Printf.printf "Commit: %s\n" commit) 100 + Term.( 101 + const (fun (version, commit) -> 102 + Printf.printf "Version: %s\n" version; 103 + Printf.printf "Commit: %s\n" commit) 103 104 $ Publicsuffix_cmd.version_term (psl ())) 104 105 in 105 106 Cmd.v info term ··· 108 109 let doc = "Query the Public Suffix List" in 109 110 let sdocs = Manpage.s_common_options in 110 111 let info = Cmd.info "publicsuffix" ~version:"%%VERSION%%" ~doc ~sdocs in 111 - Cmd.group info [ 112 - registrable_cmd; 113 - suffix_cmd; 114 - is_suffix_cmd; 115 - is_registrable_cmd; 116 - registrable_section_cmd; 117 - suffix_section_cmd; 118 - stats_cmd; 119 - version_cmd; 120 - ] 112 + Cmd.group info 113 + [ 114 + registrable_cmd; 115 + suffix_cmd; 116 + is_suffix_cmd; 117 + is_registrable_cmd; 118 + registrable_section_cmd; 119 + suffix_section_cmd; 120 + stats_cmd; 121 + version_cmd; 122 + ] 121 123 122 124 let () = exit (Cmd.eval default_cmd)
+1
dune
··· 1 1 ; Root dune file 2 2 3 3 ; Ignore third_party directory (for fetched dependency sources) 4 + 4 5 (data_only_dirs third_party)
+75 -65
gen/gen_psl.ml
··· 26 26 (** Rule types *) 27 27 type rule_type = Normal | Wildcard | Exception 28 28 29 - (** A parsed rule *) 30 29 type rule = { 31 - labels : string list; (* Labels in reverse order: ["uk"; "co"] for co.uk *) 30 + labels : string list; (* Labels in reverse order: ["uk"; "co"] for co.uk *) 32 31 rule_type : rule_type; 33 32 section : section; 34 33 } 34 + (** A parsed rule *) 35 35 36 - (** Trie node for efficient lookup *) 37 36 type trie_node = { 38 37 id : int; (* Unique identifier for this node *) 39 38 mutable rule : (rule_type * section) option; 40 39 mutable children : (string * trie_node) list; 41 40 mutable wildcard_child : trie_node option; 42 41 } 42 + (** Trie node for efficient lookup *) 43 43 44 44 let node_id_counter = ref 0 45 45 ··· 53 53 (* Strip comments (looking for //) *) 54 54 let line = 55 55 match String.index_opt line '/' with 56 - | Some i when i > 0 && line.[i-1] = '/' -> String.sub line 0 (i-1) 56 + | Some i when i > 0 && line.[i - 1] = '/' -> String.sub line 0 (i - 1) 57 57 | Some 0 -> "" 58 58 | _ -> line 59 59 in 60 60 (* Take only up to first whitespace and trim *) 61 61 let line = 62 - String.trim line 63 - |> fun s -> 64 - match String.index_from_opt s 0 ' ', String.index_from_opt s 0 '\t' with 62 + String.trim line |> fun s -> 63 + match (String.index_from_opt s 0 ' ', String.index_from_opt s 0 '\t') with 65 64 | Some i, Some j -> String.sub s 0 (min i j) 66 65 | Some i, None | None, Some i -> String.sub s 0 i 67 66 | None, None -> s ··· 75 74 (Exception, String.sub line 1 (String.length line - 1)) 76 75 else if String.length line > 2 && line.[0] = '*' && line.[1] = '.' then 77 76 (Wildcard, String.sub line 2 (String.length line - 2)) 78 - else 79 - (Normal, line) 77 + else (Normal, line) 80 78 in 81 79 (* Process labels: split, reverse, filter, and encode *) 82 80 let labels = ··· 88 86 | Ok encoded -> String.lowercase_ascii encoded 89 87 | Error _ -> String.lowercase_ascii label) 90 88 in 91 - if labels = [] then None 92 - else Some { labels; rule_type; section } 89 + if labels = [] then None else Some { labels; rule_type; section } 93 90 94 91 (** Insert a rule into the trie *) 95 92 let insert_rule trie rule = ··· 108 105 c 109 106 in 110 107 child.rule <- Some (Wildcard, rule.section) 111 - end else 112 - node.rule <- Some (rule.rule_type, rule.section) 108 + end 109 + else node.rule <- Some (rule.rule_type, rule.section) 113 110 | label :: rest -> 114 111 (* Find or create child for this label *) 115 112 let child = ··· 144 141 (* Helper to extract value after "KEY: " pattern *) 145 142 let extract_value line prefix = 146 143 let prefix_len = String.length prefix in 147 - if String.length line > prefix_len && 148 - String.sub line 0 prefix_len = prefix then 149 - Some (String.trim (String.sub line prefix_len (String.length line - prefix_len))) 150 - else 151 - None 144 + if String.length line > prefix_len && String.sub line 0 prefix_len = prefix 145 + then 146 + Some 147 + (String.trim 148 + (String.sub line prefix_len (String.length line - prefix_len))) 149 + else None 152 150 in 153 151 try 154 152 while true do 155 153 let line = input_line ic in 156 154 (* Check for version and commit info *) 157 - if !version = None then 158 - version := extract_value line "// VERSION: "; 159 - if !commit = None then 160 - commit := extract_value line "// COMMIT: "; 155 + if !version = None then version := extract_value line "// VERSION: "; 156 + if !commit = None then commit := extract_value line "// COMMIT: "; 161 157 (* Check for section markers *) 162 158 if contains_substring line "===BEGIN ICANN DOMAINS===" then 163 159 current_section := ICANN 164 160 else if contains_substring line "===BEGIN PRIVATE DOMAINS===" then 165 161 current_section := Private 166 162 else 167 - Option.iter (fun rule -> 163 + Option.iter 164 + (fun rule -> 168 165 insert_rule trie rule; 169 166 incr rule_count; 170 167 if rule.section = ICANN then incr icann_count 171 - else incr private_count 172 - ) (parse_line !current_section line) 168 + else incr private_count) 169 + (parse_line !current_section line) 173 170 done; 174 171 (trie, !rule_count, !icann_count, !private_count, !version, !commit) 175 172 with End_of_file -> ··· 179 176 (** Escape a string for OCaml source code *) 180 177 let escape_string s = 181 178 let b = Buffer.create (String.length s * 2) in 182 - String.iter (fun c -> 183 - match c with 184 - | '"' -> Buffer.add_string b "\\\"" 185 - | '\\' -> Buffer.add_string b "\\\\" 186 - | '\n' -> Buffer.add_string b "\\n" 187 - | '\r' -> Buffer.add_string b "\\r" 188 - | '\t' -> Buffer.add_string b "\\t" 189 - | c when Char.code c < 32 || Char.code c > 126 -> 190 - (* For non-ASCII, we keep the UTF-8 bytes as-is since OCaml handles UTF-8 strings *) 191 - Buffer.add_char b c 192 - | c -> Buffer.add_char b c 193 - ) s; 179 + String.iter 180 + (fun c -> 181 + match c with 182 + | '"' -> Buffer.add_string b "\\\"" 183 + | '\\' -> Buffer.add_string b "\\\\" 184 + | '\n' -> Buffer.add_string b "\\n" 185 + | '\r' -> Buffer.add_string b "\\r" 186 + | '\t' -> Buffer.add_string b "\\t" 187 + | c when Char.code c < 32 || Char.code c > 126 -> 188 + (* For non-ASCII, we keep the UTF-8 bytes as-is since OCaml handles UTF-8 strings *) 189 + Buffer.add_char b c 190 + | c -> Buffer.add_char b c) 191 + s; 194 192 Buffer.contents b 195 193 196 194 (** Generate OCaml code for the trie *) 197 195 let generate_code trie rule_count icann_count private_count version commit = 198 196 (* Print header *) 199 - print_string {|(* Auto-generated from public_suffix_list.dat - DO NOT EDIT *) 197 + print_string 198 + {|(* Auto-generated from public_suffix_list.dat - DO NOT EDIT *) 200 199 (* This file contains the parsed Public Suffix List as OCaml data structures *) 201 200 202 201 (** Section of the PSL where a rule originates *) ··· 239 238 240 239 let rec generate_node node = 241 240 let node_id = node.id in 242 - if Hashtbl.mem generated node_id then 243 - Hashtbl.find node_names node_id 241 + if Hashtbl.mem generated node_id then Hashtbl.find node_names node_id 244 242 else begin 245 243 (* First generate all children *) 246 244 List.iter (fun (_, child) -> ignore (generate_node child)) node.children; 247 - Option.iter (fun child -> ignore (generate_node child)) node.wildcard_child; 245 + Option.iter 246 + (fun child -> ignore (generate_node child)) 247 + node.wildcard_child; 248 248 249 249 let name = Hashtbl.find node_names node_id in 250 250 ··· 253 253 254 254 (* Rule field *) 255 255 (match node.rule with 256 - | None -> Buffer.add_string output_buffer " rule = None;\n" 257 - | Some (rt, sec) -> 258 - let rt_str = match rt with 259 - | Normal -> "Normal" 260 - | Wildcard -> "Wildcard" 261 - | Exception -> "Exception" 262 - in 263 - let sec_str = match sec with ICANN -> "ICANN" | Private -> "Private" in 264 - Buffer.add_string output_buffer 265 - (Printf.sprintf " rule = Some (%s, %s);\n" rt_str sec_str)); 256 + | None -> Buffer.add_string output_buffer " rule = None;\n" 257 + | Some (rt, sec) -> 258 + let rt_str = 259 + match rt with 260 + | Normal -> "Normal" 261 + | Wildcard -> "Wildcard" 262 + | Exception -> "Exception" 263 + in 264 + let sec_str = 265 + match sec with ICANN -> "ICANN" | Private -> "Private" 266 + in 267 + Buffer.add_string output_buffer 268 + (Printf.sprintf " rule = Some (%s, %s);\n" rt_str sec_str)); 266 269 267 270 (* Children field *) 268 271 if node.children = [] then 269 272 Buffer.add_string output_buffer " children = [];\n" 270 273 else begin 271 274 Buffer.add_string output_buffer " children = [\n"; 272 - List.iter (fun (label, child) -> 273 - let child_name = Hashtbl.find node_names child.id in 274 - Buffer.add_string output_buffer 275 - (Printf.sprintf " (\"%s\", %s);\n" (escape_string label) child_name) 276 - ) node.children; 275 + List.iter 276 + (fun (label, child) -> 277 + let child_name = Hashtbl.find node_names child.id in 278 + Buffer.add_string output_buffer 279 + (Printf.sprintf " (\"%s\", %s);\n" (escape_string label) 280 + child_name)) 281 + node.children; 277 282 Buffer.add_string output_buffer " ];\n" 278 283 end; 279 284 280 285 (* Wildcard child field *) 281 286 (match node.wildcard_child with 282 - | None -> Buffer.add_string output_buffer " wildcard_child = None;\n" 283 - | Some child -> 284 - let child_name = Hashtbl.find node_names child.id in 285 - Buffer.add_string output_buffer 286 - (Printf.sprintf " wildcard_child = Some %s;\n" child_name)); 287 + | None -> Buffer.add_string output_buffer " wildcard_child = None;\n" 288 + | Some child -> 289 + let child_name = Hashtbl.find node_names child.id in 290 + Buffer.add_string output_buffer 291 + (Printf.sprintf " wildcard_child = Some %s;\n" child_name)); 287 292 288 293 Buffer.add_string output_buffer "}\n\n"; 289 294 ··· 297 302 Printf.printf "let root = %s\n" root_name; 298 303 299 304 (* Generate helper to get the root *) 300 - print_string {| 305 + print_string 306 + {| 301 307 (** Get the root of the suffix trie *) 302 308 let get_root () = root 303 309 ··· 316 322 exit 1 317 323 end; 318 324 let filename = Sys.argv.(1) in 319 - let trie, rule_count, icann_count, private_count, version, commit = parse_file filename in 325 + let trie, rule_count, icann_count, private_count, version, commit = 326 + parse_file filename 327 + in 320 328 (* Ensure version and commit are present *) 321 - let version = match version with 329 + let version = 330 + match version with 322 331 | Some v -> v 323 332 | None -> 324 333 Printf.eprintf "ERROR: VERSION not found in %s\n" filename; 325 334 exit 1 326 335 in 327 - let commit = match commit with 336 + let commit = 337 + match commit with 328 338 | Some c -> c 329 339 | None -> 330 340 Printf.eprintf "ERROR: COMMIT not found in %s\n" filename;
+20 -16
lib/cmd/publicsuffix_cmd.ml
··· 14 14 (* Term functions *) 15 15 16 16 let registrable_term psl = 17 - Term.(const (fun domain -> Publicsuffix.registrable_domain psl domain) 17 + Term.( 18 + const (fun domain -> Publicsuffix.registrable_domain psl domain) 18 19 $ domain_arg) 19 20 20 21 let suffix_term psl = 21 - Term.(const (fun domain -> Publicsuffix.public_suffix psl domain) 22 - $ domain_arg) 22 + Term.( 23 + const (fun domain -> Publicsuffix.public_suffix psl domain) $ domain_arg) 23 24 24 25 let is_suffix_term psl = 25 - Term.(const (fun domain -> Publicsuffix.is_public_suffix psl domain) 26 - $ domain_arg) 26 + Term.( 27 + const (fun domain -> Publicsuffix.is_public_suffix psl domain) $ domain_arg) 27 28 28 29 let is_registrable_term psl = 29 - Term.(const (fun domain -> Publicsuffix.is_registrable_domain psl domain) 30 + Term.( 31 + const (fun domain -> Publicsuffix.is_registrable_domain psl domain) 30 32 $ domain_arg) 31 33 32 34 let registrable_section_term psl = 33 - Term.(const (fun domain -> 34 - Publicsuffix.registrable_domain_with_section psl domain) 35 + Term.( 36 + const (fun domain -> 37 + Publicsuffix.registrable_domain_with_section psl domain) 35 38 $ domain_arg) 36 39 37 40 let suffix_section_term psl = 38 - Term.(const (fun domain -> Publicsuffix.public_suffix_with_section psl domain) 41 + Term.( 42 + const (fun domain -> Publicsuffix.public_suffix_with_section psl domain) 39 43 $ domain_arg) 40 44 41 45 let stats_term psl = 42 - Term.(const (fun () -> 43 - (Publicsuffix.rule_count psl, 44 - Publicsuffix.icann_rule_count psl, 45 - Publicsuffix.private_rule_count psl)) 46 + Term.( 47 + const (fun () -> 48 + ( Publicsuffix.rule_count psl, 49 + Publicsuffix.icann_rule_count psl, 50 + Publicsuffix.private_rule_count psl )) 46 51 $ const ()) 47 52 48 53 let version_term psl = 49 - Term.(const (fun () -> 50 - (Publicsuffix.version psl, 51 - Publicsuffix.commit psl)) 54 + Term.( 55 + const (fun () -> (Publicsuffix.version psl, Publicsuffix.commit psl)) 52 56 $ const ())
+4 -3
lib/cmd/publicsuffix_cmd.mli
··· 6 6 (** Reusable Cmdliner terms for the publicsuffix library. 7 7 8 8 This module provides argument parsers and term functions that can be 9 - composed to build command-line tools that work with the Public Suffix List. *) 9 + composed to build command-line tools that work with the Public Suffix List. 10 + *) 10 11 11 12 (** {1 Argument terms} *) 12 13 ··· 46 47 (total_rules, icann_rules, private_rules). *) 47 48 48 49 val version_term : Publicsuffix.t -> (string * string) Cmdliner.Term.t 49 - (** Term that returns version information about the Public Suffix List as a tuple of 50 - (version, commit). *) 50 + (** Term that returns version information about the Public Suffix List as a 51 + tuple of (version, commit). *)
+59 -61
lib/publicsuffix.ml
··· 24 24 (* Bring the trie_node type and its fields into scope *) 25 25 open Publicsuffix_data 26 26 27 - type t = { 28 - root : trie_node; 29 - } 27 + type t = { root : trie_node } 30 28 31 29 type error = 32 30 | Empty_domain ··· 42 40 | Leading_dot -> Format.fprintf fmt "Domain has a leading dot" 43 41 | Punycode_error s -> Format.fprintf fmt "Punycode conversion error: %s" s 44 42 | No_public_suffix -> Format.fprintf fmt "No public suffix found" 45 - | Domain_is_public_suffix -> Format.fprintf fmt "Domain is itself a public suffix" 43 + | Domain_is_public_suffix -> 44 + Format.fprintf fmt "Domain is itself a public suffix" 46 45 47 - let error_to_string err = 48 - Format.asprintf "%a" pp_error err 49 - 50 - let create () = 51 - { root = Publicsuffix_data.get_root () } 46 + let error_to_string err = Format.asprintf "%a" pp_error err 47 + let create () = { root = Publicsuffix_data.get_root () } 52 48 53 49 (* Find a child node by label (case-insensitive) *) 54 50 let find_child (node : trie_node) label = 55 51 let label_lower = String.lowercase_ascii label in 56 - List.find_opt (fun (l, _) -> String.lowercase_ascii l = label_lower) node.children 52 + List.find_opt 53 + (fun (l, _) -> String.lowercase_ascii l = label_lower) 54 + node.children 57 55 |> Option.map snd 58 56 59 - (** Result of matching a domain against the trie *) 60 57 type match_result = { 61 - matched_labels : int; (* Number of labels matched *) 62 - section : section; (* Section of the rule *) 63 - is_exception : bool; (* Whether this is an exception rule *) 58 + matched_labels : int; (* Number of labels matched *) 59 + section : section; (* Section of the rule *) 60 + is_exception : bool; (* Whether this is an exception rule *) 64 61 } 62 + (** Result of matching a domain against the trie *) 65 63 66 - (** Find all matching rules for a domain. 67 - Labels should be in reverse order (TLD first). *) 64 + (** Find all matching rules for a domain. Labels should be in reverse order (TLD 65 + first). *) 68 66 let find_matches (root : trie_node) labels = 69 67 let matches = ref [] in 70 68 71 69 (* Track whether we matched the implicit * rule *) 72 - let implicit_match = { 73 - matched_labels = 1; 74 - section = ICANN; (* Implicit rule is considered ICANN *) 75 - is_exception = false; 76 - } in 70 + let implicit_match = 71 + { 72 + matched_labels = 1; 73 + section = ICANN; 74 + (* Implicit rule is considered ICANN *) 75 + is_exception = false; 76 + } 77 + in 77 78 78 79 let rec traverse (node : trie_node) depth remaining_labels = 79 80 (* Check if current node has a rule *) 80 - Option.iter (fun (rt, sec) -> 81 - let m = { 82 - matched_labels = depth; 83 - section = sec; 84 - is_exception = (rt = Exception); 85 - } in 86 - matches := m :: !matches 87 - ) node.rule; 81 + Option.iter 82 + (fun (rt, sec) -> 83 + let m = 84 + { 85 + matched_labels = depth; 86 + section = sec; 87 + is_exception = rt = Exception; 88 + } 89 + in 90 + matches := m :: !matches) 91 + node.rule; 88 92 89 93 (* Continue traversing if we have more labels *) 90 94 match remaining_labels with ··· 93 97 (* Check for wildcard match *) 94 98 node.wildcard_child 95 99 |> Option.iter (fun wc -> 96 - Option.iter (fun (rt, sec) -> 97 - let m = { 98 - matched_labels = depth + 1; 99 - section = sec; 100 - is_exception = (rt = Exception); 101 - } in 102 - matches := m :: !matches 103 - ) wc.rule); 100 + Option.iter 101 + (fun (rt, sec) -> 102 + let m = 103 + { 104 + matched_labels = depth + 1; 105 + section = sec; 106 + is_exception = rt = Exception; 107 + } 108 + in 109 + matches := m :: !matches) 110 + wc.rule); 104 111 105 112 (* Check for exact label match *) 106 113 find_child node label ··· 110 117 traverse root 0 labels; 111 118 112 119 (* If no matches, return the implicit * rule *) 113 - if !matches = [] then [implicit_match] 114 - else !matches 120 + if !matches = [] then [ implicit_match ] else !matches 115 121 116 - (** Select the prevailing rule from a list of matches. 117 - Per the algorithm: 118 - 1. Exception rules take priority 119 - 2. Otherwise, the rule with the most labels wins 120 - *) 122 + (** Select the prevailing rule from a list of matches. Per the algorithm: 1. 123 + Exception rules take priority 2. Otherwise, the rule with the most labels 124 + wins *) 121 125 let select_prevailing_rule matches = 122 126 match List.find_opt (fun m -> m.is_exception) matches with 123 - | Some ex -> ex (* Exception rules take priority *) 127 + | Some ex -> ex (* Exception rules take priority *) 124 128 | None -> 125 129 (* Find the rule with the most labels *) 126 - List.fold_left (fun best m -> 127 - if m.matched_labels > best.matched_labels then m else best 128 - ) (List.hd matches) matches 130 + List.fold_left 131 + (fun best m -> 132 + if m.matched_labels > best.matched_labels then m else best) 133 + (List.hd matches) matches 129 134 130 135 (** Normalize a domain for lookup: 131 136 - Convert to lowercase 132 137 - Convert IDN to Punycode 133 138 - Split into labels 134 - - Handle trailing dots 135 - *) 139 + - Handle trailing dots *) 136 140 let normalize_domain domain = 137 141 if domain = "" then Error Empty_domain 138 142 else if String.length domain > 0 && domain.[0] = '.' then Error Leading_dot ··· 142 146 String.length domain > 0 && domain.[String.length domain - 1] = '.' 143 147 in 144 148 let domain = 145 - if has_trailing_dot then 146 - String.sub domain 0 (String.length domain - 1) 149 + if has_trailing_dot then String.sub domain 0 (String.length domain - 1) 147 150 else domain 148 151 in 149 152 if domain = "" then Error Empty_domain ··· 171 174 (** Take the rightmost n elements from a list *) 172 175 let take_last n lst = 173 176 let len = List.length lst in 174 - if len <= n then lst 175 - else List.filteri (fun i _ -> i >= len - n) lst 177 + if len <= n then lst else List.filteri (fun i _ -> i >= len - n) lst 176 178 177 179 (** Calculate the number of public suffix labels from a prevailing rule *) 178 180 let suffix_label_count prevailing = 179 181 if prevailing.is_exception then 180 182 (* Exception rules: remove leftmost label from the rule *) 181 183 prevailing.matched_labels - 1 182 - else 183 - prevailing.matched_labels 184 + else prevailing.matched_labels 184 185 185 186 (** Find the prevailing rule for a domain *) 186 187 let find_prevailing_rule t labels = ··· 194 195 | Ok (labels, has_trailing_dot) -> 195 196 let prevailing = find_prevailing_rule t labels in 196 197 let count = suffix_label_count prevailing in 197 - if count > List.length labels then 198 - Error No_public_suffix 198 + if count > List.length labels then Error No_public_suffix 199 199 else 200 200 let suffix_labels = take_last count labels in 201 201 let suffix = labels_to_domain suffix_labels has_trailing_dot in ··· 212 212 let count = suffix_label_count prevailing in 213 213 (* Registrable domain = suffix + 1 label *) 214 214 let reg_label_count = count + 1 in 215 - if reg_label_count > List.length labels then 216 - Error Domain_is_public_suffix 215 + if reg_label_count > List.length labels then Error Domain_is_public_suffix 217 216 else 218 217 let reg_labels = take_last reg_label_count labels in 219 218 let reg_domain = labels_to_domain reg_labels has_trailing_dot in ··· 244 243 let rule_count _t = Publicsuffix_data.rule_count 245 244 let icann_rule_count _t = Publicsuffix_data.icann_rule_count 246 245 let private_rule_count _t = Publicsuffix_data.private_rule_count 247 - 248 246 let version _t = Publicsuffix_data.version 249 247 let commit _t = Publicsuffix_data.commit
+112 -117
lib/publicsuffix.mli
··· 5 5 6 6 (** Public Suffix List implementation for OCaml 7 7 8 - This library provides functions to query the Mozilla Public Suffix List (PSL) 9 - to determine public suffixes and registrable domains. It implements the 10 - algorithm specified at {{:https://publicsuffix.org/list/} publicsuffix.org}. 8 + This library provides functions to query the Mozilla Public Suffix List 9 + (PSL) to determine public suffixes and registrable domains. It implements 10 + the algorithm specified at 11 + {{:https://publicsuffix.org/list/} publicsuffix.org}. 11 12 12 13 {1 Overview} 13 14 14 15 The Public Suffix List is a cross-vendor initiative to provide an accurate 15 - list of domain name suffixes under which Internet users can directly register 16 - names. A "public suffix" is one under which Internet users can register names. 17 - Some examples of public suffixes are [.com], [.co.uk] and [.pvt.k12.ma.us]. 16 + list of domain name suffixes under which Internet users can directly 17 + register names. A "public suffix" is one under which Internet users can 18 + register names. Some examples of public suffixes are [.com], [.co.uk] and 19 + [.pvt.k12.ma.us]. 18 20 19 - The "registrable domain" is the public suffix plus one additional label. 20 - For example, for the domain [www.example.com], the public suffix is [.com] 21 - and the registrable domain is [example.com]. 21 + The "registrable domain" is the public suffix plus one additional label. For 22 + example, for the domain [www.example.com], the public suffix is [.com] and 23 + the registrable domain is [example.com]. 22 24 23 - Domain names follow the specifications in {{:https://datatracker.ietf.org/doc/html/rfc1034}RFC 1034} 24 - and {{:https://datatracker.ietf.org/doc/html/rfc1035}RFC 1035}, which define 25 - the Domain Name System concepts and implementation. 25 + Domain names follow the specifications in 26 + {{:https://datatracker.ietf.org/doc/html/rfc1034}RFC 1034} and 27 + {{:https://datatracker.ietf.org/doc/html/rfc1035}RFC 1035}, which define the 28 + Domain Name System concepts and implementation. 26 29 27 30 {1 Sections} 28 31 ··· 50 53 let psl = Publicsuffix.create () in 51 54 52 55 (* Get the public suffix of a domain *) 53 - Publicsuffix.public_suffix psl "www.example.com" 54 - (* Returns: Ok "com" *) 55 - 56 - Publicsuffix.public_suffix psl "www.example.co.uk" 57 - (* Returns: Ok "co.uk" *) 58 - 59 - (* Get the registrable domain *) 60 - Publicsuffix.registrable_domain psl "www.example.com" 61 - (* Returns: Ok "example.com" *) 62 - 63 - (* Check if a domain is a public suffix *) 64 - Publicsuffix.is_public_suffix psl "com" 65 - (* Returns: Ok true *) 66 - 67 - Publicsuffix.is_public_suffix psl "example.com" 56 + Publicsuffix.public_suffix psl "www.example.com" (* Returns: Ok "com" *) 57 + Publicsuffix.public_suffix psl 58 + "www.example.co.uk" (* Returns: Ok "co.uk" *) 59 + (* Get the registrable domain *) 60 + Publicsuffix.registrable_domain psl 61 + "www.example.com" (* Returns: Ok "example.com" *) 62 + (* Check if a domain is a public suffix *) 63 + Publicsuffix.is_public_suffix psl "com" (* Returns: Ok true *) 64 + Publicsuffix.is_public_suffix psl "example.com" 68 65 (* Returns: Ok false *) 69 66 ]} 70 67 71 68 {1 Internationalized Domain Names} 72 69 73 70 The library handles internationalized domain names (IDN) by converting them 74 - to Punycode (ASCII-compatible encoding) before lookup, following the IDNA2008 75 - protocol defined in {{:https://datatracker.ietf.org/doc/html/rfc5890}RFC 5890} 76 - (IDNA Definitions) and {{:https://datatracker.ietf.org/doc/html/rfc5891}RFC 5891} 71 + to Punycode (ASCII-compatible encoding) before lookup, following the 72 + IDNA2008 protocol defined in 73 + {{:https://datatracker.ietf.org/doc/html/rfc5890}RFC 5890} (IDNA 74 + Definitions) and {{:https://datatracker.ietf.org/doc/html/rfc5891}RFC 5891} 77 75 (IDNA Protocol). The conversion is performed using [Punycode_idna.to_ascii]. 78 76 79 - Punycode encoding, specified in {{:https://datatracker.ietf.org/doc/html/rfc3492}RFC 3492}, 80 - uniquely and reversibly transforms Unicode strings into ASCII-compatible 81 - strings using the "xn--" prefix (ACE prefix). See the [Punycode] library for 82 - the core encoding implementation. Both Unicode and Punycode input are accepted: 77 + Punycode encoding, specified in 78 + {{:https://datatracker.ietf.org/doc/html/rfc3492}RFC 3492}, uniquely and 79 + reversibly transforms Unicode strings into ASCII-compatible strings using 80 + the "xn--" prefix (ACE prefix). See the [Punycode] library for the core 81 + encoding implementation. Both Unicode and Punycode input are accepted: 83 82 84 83 {[ 85 - Publicsuffix.registrable_domain psl "www.食狮.com.cn" 86 - (* Returns: Ok "食狮.com.cn" *) 87 - 88 - Publicsuffix.registrable_domain psl "www.xn--85x722f.com.cn" 84 + Publicsuffix.registrable_domain psl 85 + "www.食狮.com.cn" (* Returns: Ok "食狮.com.cn" *) 86 + Publicsuffix.registrable_domain psl "www.xn--85x722f.com.cn" 89 87 (* Returns: Ok "xn--85x722f.com.cn" *) 90 88 ]} 91 89 ··· 95 93 names) are preserved in the output: 96 94 97 95 {[ 98 - Publicsuffix.public_suffix psl "example.com" 99 - (* Returns: Ok "com" *) 100 - 101 - Publicsuffix.public_suffix psl "example.com." 96 + Publicsuffix.public_suffix psl "example.com" (* Returns: Ok "com" *) 97 + Publicsuffix.public_suffix psl "example.com." 102 98 (* Returns: Ok "com." *) 103 99 ]} 104 100 ··· 106 102 107 103 This library implementation is based on the following specifications: 108 104 109 - {ul 110 - {- {{:https://publicsuffix.org/list/} Public Suffix List Specification} - The algorithm and list format} 111 - {- {{:https://datatracker.ietf.org/doc/html/rfc1034}RFC 1034} - Domain Names: Concepts and Facilities} 112 - {- {{:https://datatracker.ietf.org/doc/html/rfc1035}RFC 1035} - Domain Names: Implementation and Specification} 113 - {- {{:https://datatracker.ietf.org/doc/html/rfc3492}RFC 3492} - Punycode: A Bootstring encoding of Unicode for IDNA} 114 - {- {{:https://datatracker.ietf.org/doc/html/rfc5890}RFC 5890} - Internationalized Domain Names for Applications (IDNA): Definitions} 115 - {- {{:https://datatracker.ietf.org/doc/html/rfc5891}RFC 5891} - Internationalized Domain Names in Applications (IDNA): Protocol}} 105 + - {{:https://publicsuffix.org/list/} Public Suffix List Specification} - The 106 + algorithm and list format 107 + - {{:https://datatracker.ietf.org/doc/html/rfc1034}RFC 1034} - Domain Names: 108 + Concepts and Facilities 109 + - {{:https://datatracker.ietf.org/doc/html/rfc1035}RFC 1035} - Domain Names: 110 + Implementation and Specification 111 + - {{:https://datatracker.ietf.org/doc/html/rfc3492}RFC 3492} - Punycode: A 112 + Bootstring encoding of Unicode for IDNA 113 + - {{:https://datatracker.ietf.org/doc/html/rfc5890}RFC 5890} - 114 + Internationalized Domain Names for Applications (IDNA): Definitions 115 + - {{:https://datatracker.ietf.org/doc/html/rfc5891}RFC 5891} - 116 + Internationalized Domain Names in Applications (IDNA): Protocol 116 117 117 118 {1 Related Libraries} 118 119 119 - {ul 120 - {- [Punycode] - Core Punycode encoding/decoding implementation} 121 - {- [Punycode_idna] - IDNA ToASCII/ToUnicode operations used for IDN conversion}} 122 - *) 120 + - [Punycode] - Core Punycode encoding/decoding implementation 121 + - [Punycode_idna] - IDNA ToASCII/ToUnicode operations used for IDN 122 + conversion *) 123 123 124 124 (** {1 Types} *) 125 125 126 126 (** Section of the Public Suffix List where a rule originates *) 127 127 type section = 128 - | ICANN (** Domains delegated by ICANN or in the IANA root zone *) 128 + | ICANN (** Domains delegated by ICANN or in the IANA root zone *) 129 129 | Private (** Domains submitted by private parties *) 130 130 131 - (** A handle to the parsed Public Suffix List *) 132 131 type t 132 + (** A handle to the parsed Public Suffix List *) 133 133 134 134 (** {1 Errors} *) 135 135 136 136 (** Errors that can occur during PSL operations *) 137 137 type error = 138 - | Empty_domain 139 - (** The input domain was empty *) 138 + | Empty_domain (** The input domain was empty *) 140 139 | Invalid_domain of string 141 - (** The domain could not be parsed as a valid domain name. 142 - Domain names must conform to the syntax specified in 140 + (** The domain could not be parsed as a valid domain name. Domain names 141 + must conform to the syntax specified in 143 142 {{:https://datatracker.ietf.org/doc/html/rfc1035}RFC 1035}. *) 144 143 | Leading_dot 145 - (** The domain has a leading dot (e.g., [.example.com]). 146 - Per {{:https://datatracker.ietf.org/doc/html/rfc1035}RFC 1035}, 147 - domain names should not have leading dots. *) 144 + (** The domain has a leading dot (e.g., [.example.com]). Per 145 + {{:https://datatracker.ietf.org/doc/html/rfc1035}RFC 1035}, domain 146 + names should not have leading dots. *) 148 147 | Punycode_error of string 149 - (** Failed to convert internationalized domain to Punycode encoding. 150 - The string contains the error message from [Punycode_idna]. 151 - See {{:https://datatracker.ietf.org/doc/html/rfc3492}RFC 3492} 152 - for Punycode encoding requirements and 153 - {{:https://datatracker.ietf.org/doc/html/rfc5891}RFC 5891} 154 - for IDNA protocol requirements. *) 148 + (** Failed to convert internationalized domain to Punycode encoding. The 149 + string contains the error message from [Punycode_idna]. See 150 + {{:https://datatracker.ietf.org/doc/html/rfc3492}RFC 3492} for 151 + Punycode encoding requirements and 152 + {{:https://datatracker.ietf.org/doc/html/rfc5891}RFC 5891} for IDNA 153 + protocol requirements. *) 155 154 | No_public_suffix 156 - (** The domain has no public suffix (should not happen with valid domains) *) 155 + (** The domain has no public suffix (should not happen with valid domains) 156 + *) 157 157 | Domain_is_public_suffix 158 158 (** The domain is itself a public suffix and has no registrable domain *) 159 159 160 - (** Pretty-print an error *) 161 160 val pp_error : Format.formatter -> error -> unit 161 + (** Pretty-print an error *) 162 162 163 + val error_to_string : error -> string 163 164 (** Convert an error to a human-readable string *) 164 - val error_to_string : error -> string 165 165 166 166 (** {1 Creation} *) 167 167 168 - (** Create a PSL instance using the embedded Public Suffix List data. 169 - The data is compiled into the library at build time. *) 170 168 val create : unit -> t 169 + (** Create a PSL instance using the embedded Public Suffix List data. The data 170 + is compiled into the library at build time. *) 171 171 172 172 (** {1 Core Operations} *) 173 173 174 + val public_suffix : t -> string -> (string, error) result 174 175 (** [public_suffix t domain] returns the public suffix portion of [domain]. 175 176 176 177 The public suffix is determined by the PSL algorithm: ··· 180 181 181 182 Domain names are processed according to 182 183 {{:https://datatracker.ietf.org/doc/html/rfc1035}RFC 1035} syntax. 183 - Internationalized domain names (IDN) are automatically converted to 184 - Punycode per {{:https://datatracker.ietf.org/doc/html/rfc3492}RFC 3492} 185 - before matching. 184 + Internationalized domain names (IDN) are automatically converted to Punycode 185 + per {{:https://datatracker.ietf.org/doc/html/rfc3492}RFC 3492} before 186 + matching. 186 187 187 188 @param t The PSL instance 188 189 @param domain The domain name to query (Unicode or Punycode) ··· 192 193 - [public_suffix t "www.example.com"] returns [Ok "com"] 193 194 - [public_suffix t "www.example.co.uk"] returns [Ok "co.uk"] 194 195 - [public_suffix t "test.k12.ak.us"] returns [Ok "k12.ak.us"] 195 - - [public_suffix t "city.kobe.jp"] returns [Ok "jp"] (exception rule) 196 - *) 197 - val public_suffix : t -> string -> (string, error) result 196 + - [public_suffix t "city.kobe.jp"] returns [Ok "jp"] (exception rule) *) 198 197 198 + val public_suffix_with_section : t -> string -> (string * section, error) result 199 199 (** [public_suffix_with_section t domain] is like {!public_suffix} but also 200 200 returns the section (ICANN or Private) where the matching rule was found. 201 201 202 - If the implicit [*] rule was used (no explicit rule matched), the section 203 - is [ICANN]. 202 + If the implicit [*] rule was used (no explicit rule matched), the section is 203 + [ICANN]. 204 204 205 - @return [Ok (suffix, section)] or [Error e] on failure 206 - *) 207 - val public_suffix_with_section : t -> string -> (string * section, error) result 205 + @return [Ok (suffix, section)] or [Error e] on failure *) 208 206 207 + val registrable_domain : t -> string -> (string, error) result 209 208 (** [registrable_domain t domain] returns the registrable domain portion. 210 209 211 - The registrable domain is the public suffix plus one additional label. 212 - This is the highest-level domain that can be registered by a user. 210 + The registrable domain is the public suffix plus one additional label. This 211 + is the highest-level domain that can be registered by a user. 213 212 214 213 Domain labels follow the naming restrictions specified in 215 - {{:https://datatracker.ietf.org/doc/html/rfc1035}RFC 1035}. Internationalized 216 - domain names are handled per {{:https://datatracker.ietf.org/doc/html/rfc5891}RFC 5891}. 214 + {{:https://datatracker.ietf.org/doc/html/rfc1035}RFC 1035}. 215 + Internationalized domain names are handled per 216 + {{:https://datatracker.ietf.org/doc/html/rfc5891}RFC 5891}. 217 217 218 218 @param t The PSL instance 219 219 @param domain The domain name to query ··· 225 225 Examples: 226 226 - [registrable_domain t "www.example.com"] returns [Ok "example.com"] 227 227 - [registrable_domain t "example.com"] returns [Ok "example.com"] 228 - - [registrable_domain t "com"] returns [Error Domain_is_public_suffix] 229 - *) 230 - val registrable_domain : t -> string -> (string, error) result 228 + - [registrable_domain t "com"] returns [Error Domain_is_public_suffix] *) 231 229 232 - (** [registrable_domain_with_section t domain] is like {!registrable_domain} 233 - but also returns the section where the matching rule was found. 230 + val registrable_domain_with_section : 231 + t -> string -> (string * section, error) result 232 + (** [registrable_domain_with_section t domain] is like {!registrable_domain} but 233 + also returns the section where the matching rule was found. 234 234 235 - @return [Ok (domain, section)] or [Error e] on failure 236 - *) 237 - val registrable_domain_with_section : t -> string -> (string * section, error) result 235 + @return [Ok (domain, section)] or [Error e] on failure *) 238 236 239 237 (** {1 Predicates} *) 240 238 241 - (** [is_public_suffix t domain] returns [true] if [domain] is exactly a 242 - public suffix according to the PSL. 239 + val is_public_suffix : t -> string -> (bool, error) result 240 + (** [is_public_suffix t domain] returns [true] if [domain] is exactly a public 241 + suffix according to the PSL. 243 242 244 - Note: This returns [true] if the domain matches a rule exactly, not if 245 - it's under a wildcard. For example: 243 + Note: This returns [true] if the domain matches a rule exactly, not if it's 244 + under a wildcard. For example: 246 245 - [is_public_suffix t "com"] returns [Ok true] 247 246 - [is_public_suffix t "example.com"] returns [Ok false] 248 247 - [is_public_suffix t "foo.ck"] returns [Ok true] (due to [*.ck] rule) 249 - - [is_public_suffix t "www.ck"] returns [Ok false] (due to [!www.ck] exception) 250 - *) 251 - val is_public_suffix : t -> string -> (bool, error) result 248 + - [is_public_suffix t "www.ck"] returns [Ok false] (due to [!www.ck] 249 + exception) *) 252 250 253 - (** [is_registrable_domain t domain] returns [true] if [domain] is exactly 254 - a registrable domain (public suffix plus one label, no more). 251 + val is_registrable_domain : t -> string -> (bool, error) result 252 + (** [is_registrable_domain t domain] returns [true] if [domain] is exactly a 253 + registrable domain (public suffix plus one label, no more). 255 254 256 255 Examples: 257 256 - [is_registrable_domain t "example.com"] returns [Ok true] 258 257 - [is_registrable_domain t "www.example.com"] returns [Ok false] 259 - - [is_registrable_domain t "com"] returns [Ok false] 260 - *) 261 - val is_registrable_domain : t -> string -> (bool, error) result 258 + - [is_registrable_domain t "com"] returns [Ok false] *) 262 259 263 260 (** {1 Statistics} *) 264 261 265 - (** Total number of rules in the embedded PSL *) 266 262 val rule_count : t -> int 263 + (** Total number of rules in the embedded PSL *) 267 264 268 - (** Number of ICANN section rules *) 269 265 val icann_rule_count : t -> int 266 + (** Number of ICANN section rules *) 270 267 271 - (** Number of private section rules *) 272 268 val private_rule_count : t -> int 269 + (** Number of private section rules *) 273 270 274 271 (** {1 Version Information} *) 275 272 273 + val version : t -> string 276 274 (** Version string from the embedded PSL data. 277 275 278 276 Returns the version identifier from the Public Suffix List source file, 279 - typically in the format ["YYYY-MM-DD_HH-MM-SS_UTC"]. 280 - *) 281 - val version : t -> string 277 + typically in the format ["YYYY-MM-DD_HH-MM-SS_UTC"]. *) 282 278 279 + val commit : t -> string 283 280 (** Commit hash from the embedded PSL data. 284 281 285 282 Returns the git commit hash from the Public Suffix List repository 286 - corresponding to the version of the data embedded in this library. 287 - *) 288 - val commit : t -> string 283 + corresponding to the version of the data embedded in this library. *)
+52 -65
lib/publicsuffix_data.mli
··· 11 11 12 12 {1 Public Suffix List Specification} 13 13 14 - The Public Suffix List is maintained by Mozilla and follows the specification 15 - at {{:https://publicsuffix.org/list/} publicsuffix.org}. The list provides 16 - an accurate database of domain name suffixes under which Internet users can 17 - directly register names. 14 + The Public Suffix List is maintained by Mozilla and follows the 15 + specification at {{:https://publicsuffix.org/list/} publicsuffix.org}. The 16 + list provides an accurate database of domain name suffixes under which 17 + Internet users can directly register names. 18 18 19 19 {2 PSL Format and Rules} 20 20 21 21 The PSL defines three types of rules: 22 22 23 - - {b Normal rules}: Standard domain suffixes (e.g., [com], [co.uk]). 24 - These match exactly as written. 23 + - {b Normal rules}: Standard domain suffixes (e.g., [com], [co.uk]). These 24 + match exactly as written. 25 25 26 26 - {b Wildcard rules}: Prefixed with [*.] (e.g., [*.jp]). These match any 27 27 single label in that position. For example, [*.example.com] matches 28 28 [foo.example.com] but not [example.com] or [bar.foo.example.com]. 29 29 30 30 - {b Exception rules}: Prefixed with [!] (e.g., [!city.kobe.jp]). These 31 - override wildcard rules and specify that a particular domain {i is not} 32 - a public suffix despite a matching wildcard. 31 + override wildcard rules and specify that a particular domain {i is not} a 32 + public suffix despite a matching wildcard. 33 33 34 34 {2 Sections} 35 35 ··· 48 48 49 49 Per the PSL specification, the matching algorithm: 50 50 51 - 1. Matches the domain against all rules in the list 52 - 2. If no rules match, applies the implicit [*] wildcard rule 53 - 3. If multiple rules match, exception rules take priority 54 - 4. Otherwise, the rule with the most labels wins 55 - 5. For exception rules, the public suffix is derived by removing the 56 - exception's leftmost label 51 + 1. Matches the domain against all rules in the list 2. If no rules match, 52 + applies the implicit [*] wildcard rule 3. If multiple rules match, exception 53 + rules take priority 4. Otherwise, the rule with the most labels wins 5. For 54 + exception rules, the public suffix is derived by removing the exception's 55 + leftmost label 57 56 58 57 {1 Data Structure} 59 58 60 59 This module represents the PSL as a trie (prefix tree) data structure for 61 - efficient lookup. The trie is constructed with labels in reverse order 62 - (TLD first), allowing efficient traversal from the top-level domain down 63 - to more specific labels. 60 + efficient lookup. The trie is constructed with labels in reverse order (TLD 61 + first), allowing efficient traversal from the top-level domain down to more 62 + specific labels. 64 63 65 64 All domain labels in the trie are: 66 65 - Converted to lowercase ··· 71 70 72 71 This module is automatically generated during the build process: 73 72 74 - 1. The [gen_psl.ml] code generator reads [public_suffix_list.dat] 75 - 2. It parses each rule according to the PSL specification 76 - 3. It constructs an in-memory trie from all rules 77 - 4. It emits OCaml source code representing the trie 78 - 5. The generated code is compiled into the library 73 + 1. The [gen_psl.ml] code generator reads [public_suffix_list.dat] 2. It 74 + parses each rule according to the PSL specification 3. It constructs an 75 + in-memory trie from all rules 4. It emits OCaml source code representing the 76 + trie 5. The generated code is compiled into the library 79 77 80 - This approach embeds the entire PSL into the compiled library, requiring 81 - no runtime file I/O or parsing. 78 + This approach embeds the entire PSL into the compiled library, requiring no 79 + runtime file I/O or parsing. 82 80 83 81 {1 Interface} 84 82 85 83 This module is internal to the library. The main library API is exposed 86 - through the {!Publicsuffix} module, which provides high-level functions 87 - for querying the PSL data. 88 - *) 84 + through the {!Publicsuffix} module, which provides high-level functions for 85 + querying the PSL data. *) 89 86 90 87 (** {1 Types} *) 91 88 ··· 93 90 94 91 The PSL is divided into two sections with different governance: 95 92 - [ICANN]: Official domains delegated by ICANN or in the IANA root zone 96 - - [Private]: Domains submitted by private parties for their services 97 - *) 93 + - [Private]: Domains submitted by private parties for their services *) 98 94 type section = ICANN | Private 99 95 100 96 (** Rule types defined in the PSL specification. 101 97 102 98 - [Normal]: A standard domain suffix that matches exactly (e.g., [com]) 103 99 - [Wildcard]: A rule with [*.] prefix that matches any single label 104 - - [Exception]: A rule with [!] prefix that overrides wildcard matches 105 - *) 100 + - [Exception]: A rule with [!] prefix that overrides wildcard matches *) 106 101 type rule_type = Normal | Wildcard | Exception 107 102 103 + type trie_node = { 104 + rule : (rule_type * section) option; 105 + children : (string * trie_node) list; 106 + wildcard_child : trie_node option; 107 + } 108 108 (** A node in the suffix trie. 109 109 110 - The trie is constructed with domain labels in reverse order (TLD first). 111 - For example, the domain [example.co.uk] would be traversed as [uk] -> [co] -> [example]. 110 + The trie is constructed with domain labels in reverse order (TLD first). For 111 + example, the domain [example.co.uk] would be traversed as [uk] -> [co] -> 112 + [example]. 112 113 113 114 - [rule]: If [Some (rt, sec)], this node represents a PSL rule of type [rt] 114 115 from section [sec] 115 116 - [children]: List of (label, child_node) pairs for exact label matches 116 117 - [wildcard_child]: If [Some node], this represents a wildcard match ([*]) 117 - at this position in the domain hierarchy 118 - *) 119 - type trie_node = { 120 - rule : (rule_type * section) option; 121 - children : (string * trie_node) list; 122 - wildcard_child : trie_node option; 123 - } 118 + at this position in the domain hierarchy *) 124 119 125 120 (** {1 Data Access} *) 126 121 122 + val get_root : unit -> trie_node 127 123 (** Get the root of the suffix trie. 128 124 129 125 The root node represents the starting point for all PSL lookups. Domain 130 126 labels should be traversed in reverse order (TLD first) from this root. 131 127 132 - @return The root trie node containing all PSL rules 133 - *) 134 - val get_root : unit -> trie_node 128 + @return The root trie node containing all PSL rules *) 135 129 136 130 (** {1 Statistics} 137 131 138 132 These values reflect the PSL data at the time this module was generated. 139 - They include all rules from both the ICANN and Private sections. 140 - *) 133 + They include all rules from both the ICANN and Private sections. *) 141 134 135 + val rule_count : int 142 136 (** Total number of rules in the embedded PSL data. 143 137 144 - This includes all Normal, Wildcard, and Exception rules from both 145 - sections. 138 + This includes all Normal, Wildcard, and Exception rules from both sections. 146 139 *) 147 - val rule_count : int 148 140 141 + val icann_rule_count : int 149 142 (** Number of rules in the ICANN section. 150 143 151 - These are official TLD rules delegated by ICANN or present in the IANA 152 - root zone database. 153 - *) 154 - val icann_rule_count : int 144 + These are official TLD rules delegated by ICANN or present in the IANA root 145 + zone database. *) 155 146 147 + val private_rule_count : int 156 148 (** Number of rules in the Private section. 157 149 158 - These are rules submitted by private organizations for services that 159 - allow subdomain registration. 160 - *) 161 - val private_rule_count : int 150 + These are rules submitted by private organizations for services that allow 151 + subdomain registration. *) 162 152 163 153 (** {1 Version Information} 164 154 165 - These values reflect the version and commit information from the PSL 166 - data at the time this module was generated. 167 - *) 155 + These values reflect the version and commit information from the PSL data at 156 + the time this module was generated. *) 168 157 158 + val version : string 169 159 (** Version string from the PSL data file. 170 160 171 161 This is the version identifier from the Public Suffix List source file, 172 - typically in the format "YYYY-MM-DD_HH-MM-SS_UTC". 173 - *) 174 - val version : string 162 + typically in the format "YYYY-MM-DD_HH-MM-SS_UTC". *) 175 163 164 + val commit : string 176 165 (** Commit hash from the PSL data file. 177 166 178 167 This is the git commit hash from the Public Suffix List repository 179 - corresponding to the version of the data embedded in this library. 180 - *) 181 - val commit : string 168 + corresponding to the version of the data embedded in this library. *)
+13 -11
test/psl_test.ml
··· 15 15 *) 16 16 17 17 let psl = Publicsuffix.create () 18 - 19 18 let print_error e = Printf.printf "ERROR: %s\n" (Publicsuffix.error_to_string e) 20 - 21 - let print_result = function 22 - | Ok s -> print_endline s 23 - | Error e -> print_error e 19 + let print_result = function Ok s -> print_endline s | Error e -> print_error e 24 20 25 21 let print_bool_result = function 26 22 | Ok b -> print_endline (string_of_bool b) ··· 28 24 29 25 let print_result_with_section = function 30 26 | Ok (s, sec) -> 31 - let sec_str = match sec with 27 + let sec_str = 28 + match sec with 32 29 | Publicsuffix.ICANN -> "ICANN" 33 30 | Publicsuffix.Private -> "PRIVATE" 34 31 in ··· 41 38 print_endline "Commands:"; 42 39 print_endline " registrable <domain> - Get registrable domain"; 43 40 print_endline " suffix <domain> - Get public suffix"; 44 - print_endline " is_suffix <domain> - Check if domain is a public suffix"; 45 - print_endline " is_registrable <domain> - Check if domain is a registrable domain"; 46 - print_endline " registrable_section <domain> - Get registrable domain with section"; 41 + print_endline 42 + " is_suffix <domain> - Check if domain is a public suffix"; 43 + print_endline 44 + " is_registrable <domain> - Check if domain is a registrable domain"; 45 + print_endline 46 + " registrable_section <domain> - Get registrable domain with section"; 47 47 print_endline " suffix_section <domain> - Get public suffix with section"; 48 48 print_endline " stats - Print rule statistics"; 49 49 exit 1 ··· 58 58 | "is_registrable" when Array.length Sys.argv >= 3 -> 59 59 print_bool_result (Publicsuffix.is_registrable_domain psl Sys.argv.(2)) 60 60 | "registrable_section" when Array.length Sys.argv >= 3 -> 61 - print_result_with_section (Publicsuffix.registrable_domain_with_section psl Sys.argv.(2)) 61 + print_result_with_section 62 + (Publicsuffix.registrable_domain_with_section psl Sys.argv.(2)) 62 63 | "suffix_section" when Array.length Sys.argv >= 3 -> 63 - print_result_with_section (Publicsuffix.public_suffix_with_section psl Sys.argv.(2)) 64 + print_result_with_section 65 + (Publicsuffix.public_suffix_with_section psl Sys.argv.(2)) 64 66 | "stats" -> 65 67 Printf.printf "Total rules: %d\n" (Publicsuffix.rule_count psl); 66 68 Printf.printf "ICANN rules: %d\n" (Publicsuffix.icann_rule_count psl);