OCaml implementation of the Mozilla Public Suffix service

generator fixes

+266 -160
+31 -33
gen/gen_psl.ml
··· 44 44 45 45 (** Parse a single line from the PSL file *) 46 46 let parse_line section line = 47 - (* Strip comments *) 47 + (* Strip comments (looking for //) *) 48 48 let line = 49 49 match String.index_opt line '/' with 50 - | Some i when i > 0 && line.[i-1] = '/' -> 51 - String.sub line 0 (i-1) 50 + | Some i when i > 0 && line.[i-1] = '/' -> String.sub line 0 (i-1) 52 51 | Some 0 -> "" 53 52 | _ -> line 54 53 in 55 - (* Take only up to first whitespace *) 54 + (* Take only up to first whitespace and trim *) 56 55 let line = 57 - let line = String.trim line in 58 - match String.index_opt line ' ' with 59 - | Some i -> String.sub line 0 i 60 - | None -> 61 - match String.index_opt line '\t' with 62 - | Some i -> String.sub line 0 i 63 - | None -> line 56 + String.trim line 57 + |> fun s -> 58 + match String.index_from_opt s 0 ' ', String.index_from_opt s 0 '\t' with 59 + | Some i, Some j -> String.sub s 0 (min i j) 60 + | Some i, None | None, Some i -> String.sub s 0 i 61 + | None, None -> s 64 62 in 65 63 let line = String.trim line in 66 64 if line = "" then None ··· 74 72 else 75 73 (Normal, line) 76 74 in 77 - (* Split into labels and reverse *) 78 - let labels = String.split_on_char '.' domain in 79 - let labels = List.rev labels in 80 - (* Filter out empty labels *) 81 - let labels = List.filter (fun s -> s <> "") labels in 82 - (* Convert each label to lowercase Punycode for canonical comparison *) 83 - let labels = List.map (fun label -> 84 - match Punycode.encode_label label with 85 - | Ok encoded -> String.lowercase_ascii encoded 86 - | Error _ -> String.lowercase_ascii label (* Fallback to lowercase *) 87 - ) labels in 75 + (* Process labels: split, reverse, filter, and encode *) 76 + let labels = 77 + String.split_on_char '.' domain 78 + |> List.rev 79 + |> List.filter (fun s -> s <> "") 80 + |> List.map (fun label -> 81 + match Punycode.encode_label label with 82 + | Ok encoded -> String.lowercase_ascii encoded 83 + | Error _ -> String.lowercase_ascii label) 84 + in 88 85 if labels = [] then None 89 86 else Some { labels; rule_type; section } 90 87 ··· 129 126 let rule_count = ref 0 in 130 127 let icann_count = ref 0 in 131 128 let private_count = ref 0 in 129 + (* Helper to check if string contains substring *) 130 + let contains_substring s sub = 131 + try 132 + let _ = Str.search_forward (Str.regexp_string sub) s 0 in 133 + true 134 + with Not_found -> false 135 + in 132 136 try 133 137 while true do 134 138 let line = input_line ic in 135 - (* Check for section markers - look for distinctive substrings *) 136 - let contains s sub = 137 - try 138 - let _ = Str.search_forward (Str.regexp_string sub) s 0 in true 139 - with Not_found -> false 140 - in 141 - if contains line "===BEGIN ICANN DOMAINS===" then 139 + (* Check for section markers *) 140 + if contains_substring line "===BEGIN ICANN DOMAINS===" then 142 141 current_section := ICANN 143 - else if contains line "===BEGIN PRIVATE DOMAINS===" then 142 + else if contains_substring line "===BEGIN PRIVATE DOMAINS===" then 144 143 current_section := Private 145 144 else 146 - match parse_line !current_section line with 147 - | None -> () 148 - | Some rule -> 145 + Option.iter (fun rule -> 149 146 insert_rule trie rule; 150 147 incr rule_count; 151 148 if rule.section = ICANN then incr icann_count 152 149 else incr private_count 150 + ) (parse_line !current_section line) 153 151 done; 154 152 (trie, !rule_count, !icann_count, !private_count) 155 153 with End_of_file ->
+68 -119
lib/publicsuffix.ml
··· 59 59 (** Result of matching a domain against the trie *) 60 60 type match_result = { 61 61 matched_labels : int; (* Number of labels matched *) 62 - rule_type : rule_type; (* Type of the matching rule *) 63 62 section : section; (* Section of the rule *) 64 63 is_exception : bool; (* Whether this is an exception rule *) 65 64 } ··· 72 71 (* Track whether we matched the implicit * rule *) 73 72 let implicit_match = { 74 73 matched_labels = 1; 75 - rule_type = Wildcard; 76 74 section = ICANN; (* Implicit rule is considered ICANN *) 77 75 is_exception = false; 78 76 } in 79 77 80 78 let rec traverse (node : trie_node) depth remaining_labels = 81 79 (* Check if current node has a rule *) 82 - (match node.rule with 83 - | Some (rt, sec) -> 84 - let m = { 85 - matched_labels = depth; 86 - rule_type = rt; 87 - section = sec; 88 - is_exception = (rt = Exception); 89 - } in 90 - matches := m :: !matches 91 - | None -> ()); 80 + Option.iter (fun (rt, sec) -> 81 + let m = { 82 + matched_labels = depth; 83 + section = sec; 84 + is_exception = (rt = Exception); 85 + } in 86 + matches := m :: !matches 87 + ) node.rule; 92 88 93 89 (* Continue traversing if we have more labels *) 94 90 match remaining_labels with 95 91 | [] -> () 96 92 | label :: rest -> 97 93 (* Check for wildcard match *) 98 - (match node.wildcard_child with 99 - | Some wc -> 100 - (match wc.rule with 101 - | Some (rt, sec) -> 102 - let m = { 103 - matched_labels = depth + 1; 104 - rule_type = rt; 105 - section = sec; 106 - is_exception = (rt = Exception); 107 - } in 108 - matches := m :: !matches 109 - | None -> ()) 110 - | None -> ()); 94 + node.wildcard_child 95 + |> Option.iter (fun wc -> 96 + Option.iter (fun (rt, sec) -> 97 + let m = { 98 + matched_labels = depth + 1; 99 + section = sec; 100 + is_exception = (rt = Exception); 101 + } in 102 + matches := m :: !matches 103 + ) wc.rule); 111 104 112 105 (* Check for exact label match *) 113 - match find_child node label with 114 - | Some child -> traverse child (depth + 1) rest 115 - | None -> () 106 + find_child node label 107 + |> Option.iter (fun child -> traverse child (depth + 1) rest) 116 108 in 117 109 118 110 traverse root 0 labels; ··· 127 119 2. Otherwise, the rule with the most labels wins 128 120 *) 129 121 let select_prevailing_rule matches = 130 - (* First, check for exception rules *) 131 - let exceptions = List.filter (fun m -> m.is_exception) matches in 132 - match exceptions with 133 - | ex :: _ -> ex (* Exception rules take priority *) 134 - | [] -> 122 + match List.find_opt (fun m -> m.is_exception) matches with 123 + | Some ex -> ex (* Exception rules take priority *) 124 + | None -> 135 125 (* Find the rule with the most labels *) 136 126 List.fold_left (fun best m -> 137 127 if m.matched_labels > best.matched_labels then m else best ··· 146 136 let normalize_domain domain = 147 137 if domain = "" then Error Empty_domain 148 138 else if String.length domain > 0 && domain.[0] = '.' then Error Leading_dot 149 - else begin 139 + else 150 140 (* Check for and preserve trailing dot *) 151 141 let has_trailing_dot = 152 142 String.length domain > 0 && domain.[String.length domain - 1] = '.' ··· 164 154 let msg = Format.asprintf "%a" Punycode_idna.pp_error e in 165 155 Error (Punycode_error msg) 166 156 | Ok ascii_domain -> 167 - (* Convert to lowercase *) 157 + (* Convert to lowercase and split into labels *) 168 158 let ascii_lower = String.lowercase_ascii ascii_domain in 169 - (* Split into labels *) 170 - let labels = String.split_on_char '.' ascii_lower in 171 - (* Filter empty labels (shouldn't happen after normalization) *) 172 - let labels = List.filter (fun s -> s <> "") labels in 159 + let labels = 160 + String.split_on_char '.' ascii_lower 161 + |> List.filter (fun s -> s <> "") 162 + in 173 163 if labels = [] then Error Empty_domain 174 164 else Ok (labels, has_trailing_dot) 175 - end 176 165 177 166 (** Convert labels back to a domain string *) 178 167 let labels_to_domain labels has_trailing_dot = 179 168 let domain = String.concat "." labels in 180 169 if has_trailing_dot then domain ^ "." else domain 181 170 171 + (** Take the rightmost n elements from a list *) 172 + let take_last n lst = 173 + let len = List.length lst in 174 + if len <= n then lst 175 + else List.filteri (fun i _ -> i >= len - n) lst 176 + 177 + (** Calculate the number of public suffix labels from a prevailing rule *) 178 + let suffix_label_count prevailing = 179 + if prevailing.is_exception then 180 + (* Exception rules: remove leftmost label from the rule *) 181 + prevailing.matched_labels - 1 182 + else 183 + prevailing.matched_labels 184 + 185 + (** Find the prevailing rule for a domain *) 186 + let find_prevailing_rule t labels = 187 + let rev_labels = List.rev labels in 188 + let matches = find_matches t.root rev_labels in 189 + select_prevailing_rule matches 190 + 182 191 let public_suffix_with_section t domain = 183 192 match normalize_domain domain with 184 193 | Error e -> Error e 185 194 | Ok (labels, has_trailing_dot) -> 186 - (* Reverse labels for trie traversal (TLD first) *) 187 - let rev_labels = List.rev labels in 188 - (* Find all matching rules *) 189 - let matches = find_matches t.root rev_labels in 190 - (* Select the prevailing rule *) 191 - let prevailing = select_prevailing_rule matches in 192 - (* Determine the number of suffix labels *) 193 - let suffix_label_count = 194 - if prevailing.is_exception then 195 - (* Exception rules: remove leftmost label from the rule *) 196 - prevailing.matched_labels - 1 197 - else 198 - prevailing.matched_labels 199 - in 200 - (* Extract the suffix labels from the domain *) 201 - let n = List.length labels in 202 - if suffix_label_count > n then 195 + let prevailing = find_prevailing_rule t labels in 196 + let count = suffix_label_count prevailing in 197 + if count > List.length labels then 203 198 Error No_public_suffix 204 - else begin 205 - let suffix_labels = 206 - (* Take the rightmost suffix_label_count labels *) 207 - let rec take_last n lst = 208 - if List.length lst <= n then lst 209 - else take_last n (List.tl lst) 210 - in 211 - take_last suffix_label_count labels 212 - in 199 + else 200 + let suffix_labels = take_last count labels in 213 201 let suffix = labels_to_domain suffix_labels has_trailing_dot in 214 202 Ok (suffix, prevailing.section) 215 - end 216 203 217 204 let public_suffix t domain = 218 - match public_suffix_with_section t domain with 219 - | Ok (suffix, _) -> Ok suffix 220 - | Error e -> Error e 205 + Result.map fst (public_suffix_with_section t domain) 221 206 222 207 let registrable_domain_with_section t domain = 223 208 match normalize_domain domain with 224 209 | Error e -> Error e 225 210 | Ok (labels, has_trailing_dot) -> 226 - (* Reverse labels for trie traversal (TLD first) *) 227 - let rev_labels = List.rev labels in 228 - (* Find all matching rules *) 229 - let matches = find_matches t.root rev_labels in 230 - (* Select the prevailing rule *) 231 - let prevailing = select_prevailing_rule matches in 232 - (* Determine the number of suffix labels *) 233 - let suffix_label_count = 234 - if prevailing.is_exception then 235 - prevailing.matched_labels - 1 236 - else 237 - prevailing.matched_labels 238 - in 239 - let n = List.length labels in 211 + let prevailing = find_prevailing_rule t labels in 212 + let count = suffix_label_count prevailing in 240 213 (* Registrable domain = suffix + 1 label *) 241 - let reg_label_count = suffix_label_count + 1 in 242 - if reg_label_count > n then 243 - (* Domain is a public suffix or shorter *) 214 + let reg_label_count = count + 1 in 215 + if reg_label_count > List.length labels then 244 216 Error Domain_is_public_suffix 245 - else begin 246 - let reg_labels = 247 - let rec take_last n lst = 248 - if List.length lst <= n then lst 249 - else take_last n (List.tl lst) 250 - in 251 - take_last reg_label_count labels 252 - in 217 + else 218 + let reg_labels = take_last reg_label_count labels in 253 219 let reg_domain = labels_to_domain reg_labels has_trailing_dot in 254 220 Ok (reg_domain, prevailing.section) 255 - end 256 221 257 222 let registrable_domain t domain = 258 - match registrable_domain_with_section t domain with 259 - | Ok (domain, _) -> Ok domain 260 - | Error e -> Error e 223 + Result.map fst (registrable_domain_with_section t domain) 261 224 262 225 let is_public_suffix t domain = 263 226 match normalize_domain domain with 264 227 | Error e -> Error e 265 228 | Ok (labels, _) -> 266 - let rev_labels = List.rev labels in 267 - let matches = find_matches t.root rev_labels in 268 - let prevailing = select_prevailing_rule matches in 269 - let suffix_label_count = 270 - if prevailing.is_exception then 271 - prevailing.matched_labels - 1 272 - else 273 - prevailing.matched_labels 274 - in 229 + let prevailing = find_prevailing_rule t labels in 230 + let count = suffix_label_count prevailing in 275 231 (* Domain is a public suffix if it has exactly suffix_label_count labels *) 276 - Ok (List.length labels = suffix_label_count) 232 + Ok (List.length labels = count) 277 233 278 234 let is_registrable_domain t domain = 279 235 match normalize_domain domain with 280 236 | Error e -> Error e 281 237 | Ok (labels, _) -> 282 - let rev_labels = List.rev labels in 283 - let matches = find_matches t.root rev_labels in 284 - let prevailing = select_prevailing_rule matches in 285 - let suffix_label_count = 286 - if prevailing.is_exception then 287 - prevailing.matched_labels - 1 288 - else 289 - prevailing.matched_labels 290 - in 291 - let reg_label_count = suffix_label_count + 1 in 238 + let prevailing = find_prevailing_rule t labels in 239 + let count = suffix_label_count prevailing in 240 + let reg_label_count = count + 1 in 292 241 (* Domain is registrable if it has exactly reg_label_count labels *) 293 242 Ok (List.length labels = reg_label_count) 294 243
+161
lib/publicsuffix_data.mli
··· 1 + (*--------------------------------------------------------------------------- 2 + Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved. 3 + SPDX-License-Identifier: ISC 4 + ---------------------------------------------------------------------------*) 5 + 6 + (** Auto-generated Public Suffix List data 7 + 8 + This module contains the parsed and compiled Public Suffix List (PSL) data 9 + as OCaml data structures. The data is generated at build time from the 10 + official PSL file by the [gen_psl] code generator. 11 + 12 + {1 Public Suffix List Specification} 13 + 14 + The Public Suffix List is maintained by Mozilla and follows the specification 15 + at {{:https://publicsuffix.org/list/} publicsuffix.org}. The list provides 16 + an accurate database of domain name suffixes under which Internet users can 17 + directly register names. 18 + 19 + {2 PSL Format and Rules} 20 + 21 + The PSL defines three types of rules: 22 + 23 + - {b Normal rules}: Standard domain suffixes (e.g., [com], [co.uk]). 24 + These match exactly as written. 25 + 26 + - {b Wildcard rules}: Prefixed with [*.] (e.g., [*.jp]). These match any 27 + single label in that position. For example, [*.example.com] matches 28 + [foo.example.com] but not [example.com] or [bar.foo.example.com]. 29 + 30 + - {b Exception rules}: Prefixed with [!] (e.g., [!city.kobe.jp]). These 31 + override wildcard rules and specify that a particular domain {i is not} 32 + a public suffix despite a matching wildcard. 33 + 34 + {2 Sections} 35 + 36 + The PSL is divided into two sections: 37 + 38 + - {b ICANN section}: Contains domains delegated by ICANN or present in the 39 + IANA root zone database. These are official top-level domains and their 40 + subdivisions (e.g., [com], [co.uk], [k12.ak.us]). 41 + 42 + - {b Private section}: Contains domains submitted by private organizations 43 + for services that allow subdomain registration (e.g., [blogspot.com], 44 + [github.io], [herokuapp.com]). Applications may choose to treat these 45 + differently from ICANN domains. 46 + 47 + {2 Matching Algorithm} 48 + 49 + Per the PSL specification, the matching algorithm: 50 + 51 + 1. Matches the domain against all rules in the list 52 + 2. If no rules match, applies the implicit [*] wildcard rule 53 + 3. If multiple rules match, exception rules take priority 54 + 4. Otherwise, the rule with the most labels wins 55 + 5. For exception rules, the public suffix is derived by removing the 56 + exception's leftmost label 57 + 58 + {1 Data Structure} 59 + 60 + This module represents the PSL as a trie (prefix tree) data structure for 61 + efficient lookup. The trie is constructed with labels in reverse order 62 + (TLD first), allowing efficient traversal from the top-level domain down 63 + to more specific labels. 64 + 65 + All domain labels in the trie are: 66 + - Converted to lowercase 67 + - Encoded as Punycode for internationalized domain names 68 + - Stored as UTF-8 strings 69 + 70 + {1 Build-Time Generation} 71 + 72 + This module is automatically generated during the build process: 73 + 74 + 1. The [gen_psl.ml] code generator reads [public_suffix_list.dat] 75 + 2. It parses each rule according to the PSL specification 76 + 3. It constructs an in-memory trie from all rules 77 + 4. It emits OCaml source code representing the trie 78 + 5. The generated code is compiled into the library 79 + 80 + This approach embeds the entire PSL into the compiled library, requiring 81 + no runtime file I/O or parsing. 82 + 83 + {1 Interface} 84 + 85 + This module is internal to the library. The main library API is exposed 86 + through the {!Publicsuffix} module, which provides high-level functions 87 + for querying the PSL data. 88 + *) 89 + 90 + (** {1 Types} *) 91 + 92 + (** Section of the PSL where a rule originates. 93 + 94 + The PSL is divided into two sections with different governance: 95 + - [ICANN]: Official domains delegated by ICANN or in the IANA root zone 96 + - [Private]: Domains submitted by private parties for their services 97 + *) 98 + type section = ICANN | Private 99 + 100 + (** Rule types defined in the PSL specification. 101 + 102 + - [Normal]: A standard domain suffix that matches exactly (e.g., [com]) 103 + - [Wildcard]: A rule with [*.] prefix that matches any single label 104 + - [Exception]: A rule with [!] prefix that overrides wildcard matches 105 + *) 106 + type rule_type = Normal | Wildcard | Exception 107 + 108 + (** A node in the suffix trie. 109 + 110 + The trie is constructed with domain labels in reverse order (TLD first). 111 + For example, the domain [example.co.uk] would be traversed as [uk] -> [co] -> [example]. 112 + 113 + - [rule]: If [Some (rt, sec)], this node represents a PSL rule of type [rt] 114 + from section [sec] 115 + - [children]: List of (label, child_node) pairs for exact label matches 116 + - [wildcard_child]: If [Some node], this represents a wildcard match ([*]) 117 + at this position in the domain hierarchy 118 + *) 119 + type trie_node = { 120 + rule : (rule_type * section) option; 121 + children : (string * trie_node) list; 122 + wildcard_child : trie_node option; 123 + } 124 + 125 + (** {1 Data Access} *) 126 + 127 + (** Get the root of the suffix trie. 128 + 129 + The root node represents the starting point for all PSL lookups. Domain 130 + labels should be traversed in reverse order (TLD first) from this root. 131 + 132 + @return The root trie node containing all PSL rules 133 + *) 134 + val get_root : unit -> trie_node 135 + 136 + (** {1 Statistics} 137 + 138 + These values reflect the PSL data at the time this module was generated. 139 + They include all rules from both the ICANN and Private sections. 140 + *) 141 + 142 + (** Total number of rules in the embedded PSL data. 143 + 144 + This includes all Normal, Wildcard, and Exception rules from both 145 + sections. 146 + *) 147 + val rule_count : int 148 + 149 + (** Number of rules in the ICANN section. 150 + 151 + These are official TLD rules delegated by ICANN or present in the IANA 152 + root zone database. 153 + *) 154 + val icann_rule_count : int 155 + 156 + (** Number of rules in the Private section. 157 + 158 + These are rules submitted by private organizations for services that 159 + allow subdomain registration. 160 + *) 161 + val private_rule_count : int
+6 -8
test/psl_test.ml
··· 16 16 17 17 let psl = Publicsuffix.create () 18 18 19 + let print_error e = Printf.printf "ERROR: %s\n" (Publicsuffix.error_to_string e) 20 + 19 21 let print_result = function 20 22 | Ok s -> print_endline s 21 - | Error e -> 22 - Printf.printf "ERROR: %s\n" (Publicsuffix.error_to_string e) 23 + | Error e -> print_error e 23 24 24 25 let print_bool_result = function 25 - | Ok true -> print_endline "true" 26 - | Ok false -> print_endline "false" 27 - | Error e -> 28 - Printf.printf "ERROR: %s\n" (Publicsuffix.error_to_string e) 26 + | Ok b -> print_endline (string_of_bool b) 27 + | Error e -> print_error e 29 28 30 29 let print_result_with_section = function 31 30 | Ok (s, sec) -> ··· 34 33 | Publicsuffix.Private -> "PRIVATE" 35 34 in 36 35 Printf.printf "%s (%s)\n" s sec_str 37 - | Error e -> 38 - Printf.printf "ERROR: %s\n" (Publicsuffix.error_to_string e) 36 + | Error e -> print_error e 39 37 40 38 let () = 41 39 if Array.length Sys.argv < 2 then begin