···44444545(** Parse a single line from the PSL file *)
4646let parse_line section line =
4747- (* Strip comments *)
4747+ (* Strip comments (looking for //) *)
4848 let line =
4949 match String.index_opt line '/' with
5050- | Some i when i > 0 && line.[i-1] = '/' ->
5151- String.sub line 0 (i-1)
5050+ | Some i when i > 0 && line.[i-1] = '/' -> String.sub line 0 (i-1)
5251 | Some 0 -> ""
5352 | _ -> line
5453 in
5555- (* Take only up to first whitespace *)
5454+ (* Take only up to first whitespace and trim *)
5655 let line =
5757- let line = String.trim line in
5858- match String.index_opt line ' ' with
5959- | Some i -> String.sub line 0 i
6060- | None ->
6161- match String.index_opt line '\t' with
6262- | Some i -> String.sub line 0 i
6363- | None -> line
5656+ String.trim line
5757+ |> fun s ->
5858+ match String.index_from_opt s 0 ' ', String.index_from_opt s 0 '\t' with
5959+ | Some i, Some j -> String.sub s 0 (min i j)
6060+ | Some i, None | None, Some i -> String.sub s 0 i
6161+ | None, None -> s
6462 in
6563 let line = String.trim line in
6664 if line = "" then None
···7472 else
7573 (Normal, line)
7674 in
7777- (* Split into labels and reverse *)
7878- let labels = String.split_on_char '.' domain in
7979- let labels = List.rev labels in
8080- (* Filter out empty labels *)
8181- let labels = List.filter (fun s -> s <> "") labels in
8282- (* Convert each label to lowercase Punycode for canonical comparison *)
8383- let labels = List.map (fun label ->
8484- match Punycode.encode_label label with
8585- | Ok encoded -> String.lowercase_ascii encoded
8686- | Error _ -> String.lowercase_ascii label (* Fallback to lowercase *)
8787- ) labels in
7575+ (* Process labels: split, reverse, filter, and encode *)
7676+ let labels =
7777+ String.split_on_char '.' domain
7878+ |> List.rev
7979+ |> List.filter (fun s -> s <> "")
8080+ |> List.map (fun label ->
8181+ match Punycode.encode_label label with
8282+ | Ok encoded -> String.lowercase_ascii encoded
8383+ | Error _ -> String.lowercase_ascii label)
8484+ in
8885 if labels = [] then None
8986 else Some { labels; rule_type; section }
9087···129126 let rule_count = ref 0 in
130127 let icann_count = ref 0 in
131128 let private_count = ref 0 in
129129+ (* Helper to check if string contains substring *)
130130+ let contains_substring s sub =
131131+ try
132132+ let _ = Str.search_forward (Str.regexp_string sub) s 0 in
133133+ true
134134+ with Not_found -> false
135135+ in
132136 try
133137 while true do
134138 let line = input_line ic in
135135- (* Check for section markers - look for distinctive substrings *)
136136- let contains s sub =
137137- try
138138- let _ = Str.search_forward (Str.regexp_string sub) s 0 in true
139139- with Not_found -> false
140140- in
141141- if contains line "===BEGIN ICANN DOMAINS===" then
139139+ (* Check for section markers *)
140140+ if contains_substring line "===BEGIN ICANN DOMAINS===" then
142141 current_section := ICANN
143143- else if contains line "===BEGIN PRIVATE DOMAINS===" then
142142+ else if contains_substring line "===BEGIN PRIVATE DOMAINS===" then
144143 current_section := Private
145144 else
146146- match parse_line !current_section line with
147147- | None -> ()
148148- | Some rule ->
145145+ Option.iter (fun rule ->
149146 insert_rule trie rule;
150147 incr rule_count;
151148 if rule.section = ICANN then incr icann_count
152149 else incr private_count
150150+ ) (parse_line !current_section line)
153151 done;
154152 (trie, !rule_count, !icann_count, !private_count)
155153 with End_of_file ->
+68-119
lib/publicsuffix.ml
···5959(** Result of matching a domain against the trie *)
6060type match_result = {
6161 matched_labels : int; (* Number of labels matched *)
6262- rule_type : rule_type; (* Type of the matching rule *)
6362 section : section; (* Section of the rule *)
6463 is_exception : bool; (* Whether this is an exception rule *)
6564}
···7271 (* Track whether we matched the implicit * rule *)
7372 let implicit_match = {
7473 matched_labels = 1;
7575- rule_type = Wildcard;
7674 section = ICANN; (* Implicit rule is considered ICANN *)
7775 is_exception = false;
7876 } in
79778078 let rec traverse (node : trie_node) depth remaining_labels =
8179 (* Check if current node has a rule *)
8282- (match node.rule with
8383- | Some (rt, sec) ->
8484- let m = {
8585- matched_labels = depth;
8686- rule_type = rt;
8787- section = sec;
8888- is_exception = (rt = Exception);
8989- } in
9090- matches := m :: !matches
9191- | None -> ());
8080+ Option.iter (fun (rt, sec) ->
8181+ let m = {
8282+ matched_labels = depth;
8383+ section = sec;
8484+ is_exception = (rt = Exception);
8585+ } in
8686+ matches := m :: !matches
8787+ ) node.rule;
92889389 (* Continue traversing if we have more labels *)
9490 match remaining_labels with
9591 | [] -> ()
9692 | label :: rest ->
9793 (* Check for wildcard match *)
9898- (match node.wildcard_child with
9999- | Some wc ->
100100- (match wc.rule with
101101- | Some (rt, sec) ->
102102- let m = {
103103- matched_labels = depth + 1;
104104- rule_type = rt;
105105- section = sec;
106106- is_exception = (rt = Exception);
107107- } in
108108- matches := m :: !matches
109109- | None -> ())
110110- | None -> ());
9494+ node.wildcard_child
9595+ |> Option.iter (fun wc ->
9696+ Option.iter (fun (rt, sec) ->
9797+ let m = {
9898+ matched_labels = depth + 1;
9999+ section = sec;
100100+ is_exception = (rt = Exception);
101101+ } in
102102+ matches := m :: !matches
103103+ ) wc.rule);
111104112105 (* Check for exact label match *)
113113- match find_child node label with
114114- | Some child -> traverse child (depth + 1) rest
115115- | None -> ()
106106+ find_child node label
107107+ |> Option.iter (fun child -> traverse child (depth + 1) rest)
116108 in
117109118110 traverse root 0 labels;
···127119 2. Otherwise, the rule with the most labels wins
128120*)
129121let select_prevailing_rule matches =
130130- (* First, check for exception rules *)
131131- let exceptions = List.filter (fun m -> m.is_exception) matches in
132132- match exceptions with
133133- | ex :: _ -> ex (* Exception rules take priority *)
134134- | [] ->
122122+ match List.find_opt (fun m -> m.is_exception) matches with
123123+ | Some ex -> ex (* Exception rules take priority *)
124124+ | None ->
135125 (* Find the rule with the most labels *)
136126 List.fold_left (fun best m ->
137127 if m.matched_labels > best.matched_labels then m else best
···146136let normalize_domain domain =
147137 if domain = "" then Error Empty_domain
148138 else if String.length domain > 0 && domain.[0] = '.' then Error Leading_dot
149149- else begin
139139+ else
150140 (* Check for and preserve trailing dot *)
151141 let has_trailing_dot =
152142 String.length domain > 0 && domain.[String.length domain - 1] = '.'
···164154 let msg = Format.asprintf "%a" Punycode_idna.pp_error e in
165155 Error (Punycode_error msg)
166156 | Ok ascii_domain ->
167167- (* Convert to lowercase *)
157157+ (* Convert to lowercase and split into labels *)
168158 let ascii_lower = String.lowercase_ascii ascii_domain in
169169- (* Split into labels *)
170170- let labels = String.split_on_char '.' ascii_lower in
171171- (* Filter empty labels (shouldn't happen after normalization) *)
172172- let labels = List.filter (fun s -> s <> "") labels in
159159+ let labels =
160160+ String.split_on_char '.' ascii_lower
161161+ |> List.filter (fun s -> s <> "")
162162+ in
173163 if labels = [] then Error Empty_domain
174164 else Ok (labels, has_trailing_dot)
175175- end
176165177166(** Convert labels back to a domain string *)
178167let labels_to_domain labels has_trailing_dot =
179168 let domain = String.concat "." labels in
180169 if has_trailing_dot then domain ^ "." else domain
181170171171+(** Take the rightmost n elements from a list *)
172172+let take_last n lst =
173173+ let len = List.length lst in
174174+ if len <= n then lst
175175+ else List.filteri (fun i _ -> i >= len - n) lst
176176+177177+(** Calculate the number of public suffix labels from a prevailing rule *)
178178+let suffix_label_count prevailing =
179179+ if prevailing.is_exception then
180180+ (* Exception rules: remove leftmost label from the rule *)
181181+ prevailing.matched_labels - 1
182182+ else
183183+ prevailing.matched_labels
184184+185185+(** Find the prevailing rule for a domain *)
186186+let find_prevailing_rule t labels =
187187+ let rev_labels = List.rev labels in
188188+ let matches = find_matches t.root rev_labels in
189189+ select_prevailing_rule matches
190190+182191let public_suffix_with_section t domain =
183192 match normalize_domain domain with
184193 | Error e -> Error e
185194 | Ok (labels, has_trailing_dot) ->
186186- (* Reverse labels for trie traversal (TLD first) *)
187187- let rev_labels = List.rev labels in
188188- (* Find all matching rules *)
189189- let matches = find_matches t.root rev_labels in
190190- (* Select the prevailing rule *)
191191- let prevailing = select_prevailing_rule matches in
192192- (* Determine the number of suffix labels *)
193193- let suffix_label_count =
194194- if prevailing.is_exception then
195195- (* Exception rules: remove leftmost label from the rule *)
196196- prevailing.matched_labels - 1
197197- else
198198- prevailing.matched_labels
199199- in
200200- (* Extract the suffix labels from the domain *)
201201- let n = List.length labels in
202202- if suffix_label_count > n then
195195+ let prevailing = find_prevailing_rule t labels in
196196+ let count = suffix_label_count prevailing in
197197+ if count > List.length labels then
203198 Error No_public_suffix
204204- else begin
205205- let suffix_labels =
206206- (* Take the rightmost suffix_label_count labels *)
207207- let rec take_last n lst =
208208- if List.length lst <= n then lst
209209- else take_last n (List.tl lst)
210210- in
211211- take_last suffix_label_count labels
212212- in
199199+ else
200200+ let suffix_labels = take_last count labels in
213201 let suffix = labels_to_domain suffix_labels has_trailing_dot in
214202 Ok (suffix, prevailing.section)
215215- end
216203217204let public_suffix t domain =
218218- match public_suffix_with_section t domain with
219219- | Ok (suffix, _) -> Ok suffix
220220- | Error e -> Error e
205205+ Result.map fst (public_suffix_with_section t domain)
221206222207let registrable_domain_with_section t domain =
223208 match normalize_domain domain with
224209 | Error e -> Error e
225210 | Ok (labels, has_trailing_dot) ->
226226- (* Reverse labels for trie traversal (TLD first) *)
227227- let rev_labels = List.rev labels in
228228- (* Find all matching rules *)
229229- let matches = find_matches t.root rev_labels in
230230- (* Select the prevailing rule *)
231231- let prevailing = select_prevailing_rule matches in
232232- (* Determine the number of suffix labels *)
233233- let suffix_label_count =
234234- if prevailing.is_exception then
235235- prevailing.matched_labels - 1
236236- else
237237- prevailing.matched_labels
238238- in
239239- let n = List.length labels in
211211+ let prevailing = find_prevailing_rule t labels in
212212+ let count = suffix_label_count prevailing in
240213 (* Registrable domain = suffix + 1 label *)
241241- let reg_label_count = suffix_label_count + 1 in
242242- if reg_label_count > n then
243243- (* Domain is a public suffix or shorter *)
214214+ let reg_label_count = count + 1 in
215215+ if reg_label_count > List.length labels then
244216 Error Domain_is_public_suffix
245245- else begin
246246- let reg_labels =
247247- let rec take_last n lst =
248248- if List.length lst <= n then lst
249249- else take_last n (List.tl lst)
250250- in
251251- take_last reg_label_count labels
252252- in
217217+ else
218218+ let reg_labels = take_last reg_label_count labels in
253219 let reg_domain = labels_to_domain reg_labels has_trailing_dot in
254220 Ok (reg_domain, prevailing.section)
255255- end
256221257222let registrable_domain t domain =
258258- match registrable_domain_with_section t domain with
259259- | Ok (domain, _) -> Ok domain
260260- | Error e -> Error e
223223+ Result.map fst (registrable_domain_with_section t domain)
261224262225let is_public_suffix t domain =
263226 match normalize_domain domain with
264227 | Error e -> Error e
265228 | Ok (labels, _) ->
266266- let rev_labels = List.rev labels in
267267- let matches = find_matches t.root rev_labels in
268268- let prevailing = select_prevailing_rule matches in
269269- let suffix_label_count =
270270- if prevailing.is_exception then
271271- prevailing.matched_labels - 1
272272- else
273273- prevailing.matched_labels
274274- in
229229+ let prevailing = find_prevailing_rule t labels in
230230+ let count = suffix_label_count prevailing in
275231 (* Domain is a public suffix if it has exactly suffix_label_count labels *)
276276- Ok (List.length labels = suffix_label_count)
232232+ Ok (List.length labels = count)
277233278234let is_registrable_domain t domain =
279235 match normalize_domain domain with
280236 | Error e -> Error e
281237 | Ok (labels, _) ->
282282- let rev_labels = List.rev labels in
283283- let matches = find_matches t.root rev_labels in
284284- let prevailing = select_prevailing_rule matches in
285285- let suffix_label_count =
286286- if prevailing.is_exception then
287287- prevailing.matched_labels - 1
288288- else
289289- prevailing.matched_labels
290290- in
291291- let reg_label_count = suffix_label_count + 1 in
238238+ let prevailing = find_prevailing_rule t labels in
239239+ let count = suffix_label_count prevailing in
240240+ let reg_label_count = count + 1 in
292241 (* Domain is registrable if it has exactly reg_label_count labels *)
293242 Ok (List.length labels = reg_label_count)
294243
+161
lib/publicsuffix_data.mli
···11+(*---------------------------------------------------------------------------
22+ Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
33+ SPDX-License-Identifier: ISC
44+ ---------------------------------------------------------------------------*)
55+66+(** Auto-generated Public Suffix List data
77+88+ This module contains the parsed and compiled Public Suffix List (PSL) data
99+ as OCaml data structures. The data is generated at build time from the
1010+ official PSL file by the [gen_psl] code generator.
1111+1212+ {1 Public Suffix List Specification}
1313+1414+ The Public Suffix List is maintained by Mozilla and follows the specification
1515+ at {{:https://publicsuffix.org/list/} publicsuffix.org}. The list provides
1616+ an accurate database of domain name suffixes under which Internet users can
1717+ directly register names.
1818+1919+ {2 PSL Format and Rules}
2020+2121+ The PSL defines three types of rules:
2222+2323+ - {b Normal rules}: Standard domain suffixes (e.g., [com], [co.uk]).
2424+ These match exactly as written.
2525+2626+ - {b Wildcard rules}: Prefixed with [*.] (e.g., [*.jp]). These match any
2727+ single label in that position. For example, [*.example.com] matches
2828+ [foo.example.com] but not [example.com] or [bar.foo.example.com].
2929+3030+ - {b Exception rules}: Prefixed with [!] (e.g., [!city.kobe.jp]). These
3131+ override wildcard rules and specify that a particular domain {i is not}
3232+ a public suffix despite a matching wildcard.
3333+3434+ {2 Sections}
3535+3636+ The PSL is divided into two sections:
3737+3838+ - {b ICANN section}: Contains domains delegated by ICANN or present in the
3939+ IANA root zone database. These are official top-level domains and their
4040+ subdivisions (e.g., [com], [co.uk], [k12.ak.us]).
4141+4242+ - {b Private section}: Contains domains submitted by private organizations
4343+ for services that allow subdomain registration (e.g., [blogspot.com],
4444+ [github.io], [herokuapp.com]). Applications may choose to treat these
4545+ differently from ICANN domains.
4646+4747+ {2 Matching Algorithm}
4848+4949+ Per the PSL specification, the matching algorithm:
5050+5151+ 1. Matches the domain against all rules in the list
5252+ 2. If no rules match, applies the implicit [*] wildcard rule
5353+ 3. If multiple rules match, exception rules take priority
5454+ 4. Otherwise, the rule with the most labels wins
5555+ 5. For exception rules, the public suffix is derived by removing the
5656+ exception's leftmost label
5757+5858+ {1 Data Structure}
5959+6060+ This module represents the PSL as a trie (prefix tree) data structure for
6161+ efficient lookup. The trie is constructed with labels in reverse order
6262+ (TLD first), allowing efficient traversal from the top-level domain down
6363+ to more specific labels.
6464+6565+ All domain labels in the trie are:
6666+ - Converted to lowercase
6767+ - Encoded as Punycode for internationalized domain names
6868+ - Stored as UTF-8 strings
6969+7070+ {1 Build-Time Generation}
7171+7272+ This module is automatically generated during the build process:
7373+7474+ 1. The [gen_psl.ml] code generator reads [public_suffix_list.dat]
7575+ 2. It parses each rule according to the PSL specification
7676+ 3. It constructs an in-memory trie from all rules
7777+ 4. It emits OCaml source code representing the trie
7878+ 5. The generated code is compiled into the library
7979+8080+ This approach embeds the entire PSL into the compiled library, requiring
8181+ no runtime file I/O or parsing.
8282+8383+ {1 Interface}
8484+8585+ This module is internal to the library. The main library API is exposed
8686+ through the {!Publicsuffix} module, which provides high-level functions
8787+ for querying the PSL data.
8888+*)
8989+9090+(** {1 Types} *)
9191+9292+(** Section of the PSL where a rule originates.
9393+9494+ The PSL is divided into two sections with different governance:
9595+ - [ICANN]: Official domains delegated by ICANN or in the IANA root zone
9696+ - [Private]: Domains submitted by private parties for their services
9797+*)
9898+type section = ICANN | Private
9999+100100+(** Rule types defined in the PSL specification.
101101+102102+ - [Normal]: A standard domain suffix that matches exactly (e.g., [com])
103103+ - [Wildcard]: A rule with [*.] prefix that matches any single label
104104+ - [Exception]: A rule with [!] prefix that overrides wildcard matches
105105+*)
106106+type rule_type = Normal | Wildcard | Exception
107107+108108+(** A node in the suffix trie.
109109+110110+ The trie is constructed with domain labels in reverse order (TLD first).
111111+ For example, the domain [example.co.uk] would be traversed as [uk] -> [co] -> [example].
112112+113113+ - [rule]: If [Some (rt, sec)], this node represents a PSL rule of type [rt]
114114+ from section [sec]
115115+ - [children]: List of (label, child_node) pairs for exact label matches
116116+ - [wildcard_child]: If [Some node], this represents a wildcard match ([*])
117117+ at this position in the domain hierarchy
118118+*)
119119+type trie_node = {
120120+ rule : (rule_type * section) option;
121121+ children : (string * trie_node) list;
122122+ wildcard_child : trie_node option;
123123+}
124124+125125+(** {1 Data Access} *)
126126+127127+(** Get the root of the suffix trie.
128128+129129+ The root node represents the starting point for all PSL lookups. Domain
130130+ labels should be traversed in reverse order (TLD first) from this root.
131131+132132+ @return The root trie node containing all PSL rules
133133+*)
134134+val get_root : unit -> trie_node
135135+136136+(** {1 Statistics}
137137+138138+ These values reflect the PSL data at the time this module was generated.
139139+ They include all rules from both the ICANN and Private sections.
140140+*)
141141+142142+(** Total number of rules in the embedded PSL data.
143143+144144+ This includes all Normal, Wildcard, and Exception rules from both
145145+ sections.
146146+*)
147147+val rule_count : int
148148+149149+(** Number of rules in the ICANN section.
150150+151151+ These are official TLD rules delegated by ICANN or present in the IANA
152152+ root zone database.
153153+*)
154154+val icann_rule_count : int
155155+156156+(** Number of rules in the Private section.
157157+158158+ These are rules submitted by private organizations for services that
159159+ allow subdomain registration.
160160+*)
161161+val private_rule_count : int
+6-8
test/psl_test.ml
···16161717let psl = Publicsuffix.create ()
18181919+let print_error e = Printf.printf "ERROR: %s\n" (Publicsuffix.error_to_string e)
2020+1921let print_result = function
2022 | Ok s -> print_endline s
2121- | Error e ->
2222- Printf.printf "ERROR: %s\n" (Publicsuffix.error_to_string e)
2323+ | Error e -> print_error e
23242425let print_bool_result = function
2525- | Ok true -> print_endline "true"
2626- | Ok false -> print_endline "false"
2727- | Error e ->
2828- Printf.printf "ERROR: %s\n" (Publicsuffix.error_to_string e)
2626+ | Ok b -> print_endline (string_of_bool b)
2727+ | Error e -> print_error e
29283029let print_result_with_section = function
3130 | Ok (s, sec) ->
···3433 | Publicsuffix.Private -> "PRIVATE"
3534 in
3635 Printf.printf "%s (%s)\n" s sec_str
3737- | Error e ->
3838- Printf.printf "ERROR: %s\n" (Publicsuffix.error_to_string e)
3636+ | Error e -> print_error e
39374038let () =
4139 if Array.length Sys.argv < 2 then begin