···11-(* roguedoi.ml - Generate random DOI identifiers with Crockford base32 encoding *)
11+(*---------------------------------------------------------------------------
22+ Copyright (c) 2025 Anil Madhavapeddy. All rights reserved.
33+ SPDX-License-Identifier: MIT
44+ ---------------------------------------------------------------------------*)
2536let generate_doi prefix length split =
47 Random.self_init ();
58 let suffix = Crockford.generate ~length ~split_every:split ~checksum:true () in
66- Printf.printf "https://doi.org/%s/%s\n%!" prefix suffix
99+ Printf.printf "%s/%s\n%!" prefix suffix
710811let () =
912 let open Cmdliner in
···25282629 let generate_cmd =
2730 let doc = "Generate a random DOI with Crockford base32 encoding" in
2828- let info = Cmd.info "roguedoi" ~version:"0.1.0" ~doc in
3131+ let info = Cmd.info "roguedoi" ~version:"1.0.0" ~doc in
2932 Cmd.v info Term.(const generate_doi $ prefix $ length $ split)
3033 in
3134
+2-2
lib/crockford.ml
···166166167167 !number
168168169169-let generate ~length ?(split_every=0) ?(checksum=false) () =
169169+let generate ~length ?(split_every=0) ?(checksum=false) ?(rng=Random.float) () =
170170 if checksum && length < 3 then
171171 raise (Decode_error (Invalid_length {
172172 length;
···177177178178 (* Generate random number between 0 and 32^length *)
179179 let max_val = 32.0 ** float_of_int adjusted_length in
180180- let random_num = Int64.of_float (Random.float max_val) in
180180+ let random_num = Int64.of_float (rng max_val) in
181181182182 encode ~split_every ~min_length:adjusted_length ~checksum random_num
+221-25
lib/crockford.mli
···33 SPDX-License-Identifier: MIT
44 ---------------------------------------------------------------------------*)
5566-(** Crockford Base32 encoding for OCaml *)
66+(** Crockford Base32 encoding for OCaml
77+88+ {1 Overview}
99+1010+ Crockford Base32 is a base-32 encoding scheme designed by Douglas Crockford
1111+ for human-readable identifiers. It is particularly well-suited for use in URLs,
1212+ user-facing identifiers, and systems where humans need to transcribe or
1313+ communicate encoded values. It features:
1414+1515+ {ul
1616+ {- {b Human-optimized alphabet}: Uses 32 characters (0-9, A-Z) but excludes
1717+ I, L, and O, which are easily confused with 1 and 0, and U, which is left out
1818+ to avoid accidental obscenity. This prevents common transcription errors.}
1919+ {- {b Case-insensitive}: Both uppercase and lowercase letters are accepted
2020+ during decoding, making it forgiving of human input.}
2121+ {- {b Confusable character mapping}: When decoding, the letters I and L are
2222+ automatically mapped to 1, and O is mapped to 0, further reducing
2323+ transcription errors.}
2424+ {- {b Hyphenation support}: Hyphens can be included for readability and are
2525+ automatically ignored during decoding.}
2626+ {- {b Optional checksums}: Supports ISO 7064 (mod 97-10) checksums to detect
2727+ transcription errors. The checksum is encoded as two additional characters.}
2828+ {- {b URL-safe}: All characters in the encoding are safe for use in URLs
2929+ without escaping.}
3030+ }
3131+3232+ {2 The Encoding Alphabet}
3333+3434+ The 32-character alphabet is: [0123456789ABCDEFGHJKMNPQRSTVWXYZ]
3535+3636+ Note the absence of I, L, and O, which avoids confusion with 1, 1, and 0, and of
3737+ U, which Crockford's specification excludes to avoid accidental obscenity.
3838+3939+ {2 Comparison with Other Encodings}
4040+4141+ {ul
4242+ {- {b vs. Base64}: Crockford Base32 is more human-friendly due to its reduced
4343+ character set and case-insensitivity, though it produces slightly longer
4444+ strings (base32 uses 5 bits per character vs base64's 6 bits).}
4545+ {- {b vs. Hexadecimal}: Base32 produces shorter strings than hex (which uses
4646+ only 4 bits per character) and includes more letters for better distribution.}
4747+ {- {b vs. Standard Base32 (RFC 4648)}: Crockford's variant is specifically
4848+ optimized for human readability with its character mappings and exclusions.}
4949+ }
5050+5151+ {1 Examples}
5252+5353+ {[
5454+ (* Basic encoding *)
5555+ let id = encode 123456789L;;
5656+ (* Result: "3nqk8n" *)
5757+5858+ (* Encoding with hyphenation for readability *)
5959+ let id = encode ~split_every:4 123456789L;;
6060+ (* Result: "3nqk-8n" *)
6161+6262+ (* Encoding with checksum for error detection *)
6363+ let id = encode ~checksum:true 123456789L;;
6464+ (* Result: "3rv5k187" (last two digits are checksum) *)
6565+6666+ (* Generate a random 8-character identifier *)
6767+ Random.self_init ();;
6868+ let random_id = generate ~length:8 ();;
6969+ (* Result: something like "n4g9k2c7" *)
7070+7171+ (* Generate with checksum and hyphenation *)
7272+ let safe_id = generate ~length:10 ~split_every:5 ~checksum:true ();;
7373+ (* Result: something like "a3k2x-9m482" *)
7474+7575+ (* Decoding is case-insensitive and ignores hyphens *)
7676+ let n = decode "3NQK-8N";;
7777+ (* Result: 123456789L *)
7878+7979+ (* Decode with checksum validation *)
8080+ let n = decode ~checksum:true "3rv5k187";;
8181+ (* Result: 123456789L (or raises Decode_error if checksum invalid) *)
8282+ ]}
8383+8484+ {1 API Documentation} *)
8585+8686+(** {1 ID Generation}
8787+8888+ Generate random identifiers in Crockford base32 format. This is useful for
8989+ creating unique, human-readable IDs for databases, URLs, or user-facing
9090+ reference numbers. *)
9191+9292+val generate :
9393+ length:int ->
9494+ ?split_every:int ->
9595+ ?checksum:bool ->
9696+ ?rng:(float -> float) ->
9797+ unit -> string
9898+(** [generate ~length ?split_every ?checksum ?rng ()] generates a random Crockford base32 string.
9999+100100+ This function creates a random identifier by generating a random integer and
101101+ encoding it using the Crockford base32 alphabet. The generated IDs are suitable
102102+ for use as database keys, URL-safe identifiers, or user-visible reference numbers.
103103+104104+ When using the default [Random.float] generator, you must initialize the
105105+ random number generator with {!Random.self_init} before calling this function.
106106+107107+ @param length The target length of the generated string. When [checksum:false],
108108+ this is the exact output length. When [checksum:true], this is the
109109+ total length including the 2-character checksum, so the random
110110+ portion will be [length - 2] characters.
111111+ @param split_every Insert hyphens every N characters for improved readability.
112112+ For example, [~split_every:4] might produce [3a7k-m9n2].
113113+ Default: no splitting.
114114+ @param checksum Append a 2-character ISO 7064 checksum for error detection.
115115+ Useful when IDs will be manually transcribed. When [true],
116116+ the total output length (including checksum) will be [length].
117117+ Default: [false].
118118+ @param rng Custom random number generator function that takes a float bound and
119119+ returns a random float in the range [0,bound]. This allows for
120120+ deterministic testing or custom entropy sources. Defaults to using {!Random.float}.
121121+ @raise Decode_error with [Invalid_length] if [checksum] is [true] and [length < 3]
122122+ as at least 1 character is needed for the ID and 2 for the checksum. *)
123123+71248125(** {1 Error Types} *)
9126···47164(** {1 Constants} *)
4816549166val encoding_chars : string
5050-(** The Crockford base32 encoding alphabet (excludes i, l, o, u) *)
167167+(** The Crockford base32 encoding alphabet: ["0123456789abcdefghjkmnpqrstvwxyz"]
511685252-(** {1 Encoding and Decoding} *)
169169+ This 32-character alphabet excludes I, L, and O to prevent confusion with 1 and 0,
170170+ and U to avoid accidental obscenity. The alphabet is case-insensitive
171171+ for decoding but returned in lowercase by encoding functions. *)
172172+173173+(** {1 Encoding and Decoding}
174174+175175+ The core encoding and decoding functions convert between 64-bit integers and
176176+ their Crockford base32 string representations. *)
5317754178val encode :
55179 ?split_every:int ->
···57181 ?checksum:bool ->
58182 int64 -> string
59183(** [encode ?split_every ?min_length ?checksum n] encodes an int64 to a Crockford base32 string.
6060- @param split_every Split the output with '-' every n characters (default: no splitting)
6161- @param min_length Pad with zeros to this minimum length (default: no padding)
6262- @param checksum Append ISO 7064 checksum as 2 digits (default: false) *)
184184+185185+ The function converts a 64-bit integer into a base32 representation using the
186186+ Crockford alphabet. The encoding process divides the number by 32 repeatedly,
187187+ using the remainder as an index into the alphabet.
188188+189189+ @param split_every Insert hyphens every N characters for readability. For example,
190190+ [split_every:4] converts ["abcd1234"] to ["abcd-1234"]. Hyphens
191191+ are ignored during decoding. Default: no splitting.
192192+ @param min_length Pad the output with leading zeros to reach this minimum length.
193193+ When [checksum:true], the minimum length includes the 2-character
194194+ checksum. Default: no padding.
195195+ @param checksum Append a 2-digit ISO 7064 (mod 97-10) checksum to detect transcription
196196+ errors. The checksum is computed on the original number and encoded
197197+ as two additional base32 characters. Default: [false].
198198+199199+ {b Examples:}
200200+ {[
201201+ encode 0L;; (* "0" *)
202202+ encode 1234L;; (* "16j" *)
203203+ encode ~min_length:6 1234L;; (* "00016j" *)
204204+ encode ~split_every:3 123456L;; (* "3rj-0" *)
205205+ encode ~checksum:true 1234L;; (* "16j48" *)
206206+ encode ~min_length:8 ~checksum:true ~split_every:4 1234L;; (* "0016-j448" *)
207207+ ]} *)
6320864209val decode : ?checksum:bool -> string -> int64
65210(** [decode ?checksum str] decodes a Crockford base32 string to int64.
6666- @param checksum Expect and validate ISO 7064 checksum (default: false)
6767- @raise Decode_error if decoding fails (invalid characters, invalid checksum format, or checksum mismatch) *)
682116969-(** {1 ID Generation} *)
212212+ The function is designed to be forgiving of human input:
213213+ - Case-insensitive: accepts both uppercase and lowercase letters
214214+ - Strips hyphens automatically
215215+ - Maps confusable characters: I/i and L/l → 1, O/o → 0
216216+217217+ @param checksum Expect and validate the last 2 characters as an ISO 7064 checksum.
218218+ If [true], the function verifies that the checksum matches the
219219+ decoded value. Default: [false].
220220+221221+ @raise Decode_error with one of the following variants:
222222+ - [Invalid_character] if an unrecognized character is encountered
223223+ - [Invalid_checksum] if [checksum:true] but the string is too short or checksum has invalid format
224224+ - [Checksum_mismatch] if the checksum doesn't match the decoded value
702257171-val generate :
7272- length:int ->
7373- ?split_every:int ->
7474- ?checksum:bool ->
7575- unit -> string
7676-(** [generate ~length ?split_every ?checksum ()] generates a random Crockford base32 string.
7777- @param length The length of the generated string (excluding checksum)
7878- @param split_every Split the output with '-' every n characters (default: no splitting)
7979- @param checksum Append ISO 7064 checksum as 2 digits (default: false)
8080- @raise Decode_error if checksum is true and length < 3
226226+ {b Examples:}
227227+ {[
228228+ decode "16j";; (* 1234L *)
229229+ decode "16J";; (* 1234L - case insensitive *)
230230+ decode "1-6-j";; (* 1234L - hyphens ignored *)
231231+ decode "I6j";; (* 1234L - 'I' mapped to '1' *)
232232+ decode ~checksum:true "16j48";; (* 1234L - with checksum validation *)
233233+ ]} *)
812348282- Note: Caller must initialize Random module with {!Random.self_init} before use *)
235235+(** {1 Utility Functions}
832368484-(** {1 Utility Functions} *)
237237+ Low-level functions for working with Crockford base32 strings and checksums. *)
8523886239val normalize : string -> string
8787-(** [normalize str] normalizes a string for decoding by converting to lowercase,
8888- removing dashes, and mapping confusable characters (i→1, l→1, o→0) *)
240240+(** [normalize str] normalizes a string for decoding.
241241+242242+ This function prepares a potentially messy human input string for decoding by:
243243+ - Converting all characters to lowercase
244244+ - Removing all hyphens ([-])
245245+ - Mapping confusable characters: [I] and [L] → [1], [O] → [0]
246246+247247+ This is automatically called by {!decode}, but is exposed for cases where
248248+ you want to normalize strings before storage or comparison.
249249+250250+ {b Examples:}
251251+ {[
252252+ normalize "ABC-123";; (* "abc123" *)
253253+ normalize "IlO";; (* "110" - confusables mapped *)
254254+ normalize "A-B-C";; (* "abc" - hyphens removed *)
255255+ normalize "HELLO";; (* "he110" - 'L's and 'O' mapped *)
256256+ ]} *)
8925790258val validate : int64 -> checksum:int64 -> bool
9191-(** [validate n ~checksum] validates that a checksum matches the number *)
259259+(** [validate n ~checksum] validates that a checksum matches the expected value for a number.
260260+261261+ This function computes the ISO 7064 (mod 97-10) checksum for the given number
262262+ and compares it with the provided checksum value.
263263+264264+ @param n The integer value to validate
265265+ @param checksum The expected checksum value (0-96)
266266+ @return [true] if the checksum is valid, [false] otherwise
267267+268268+ {b Examples:}
269269+ {[
270270+ let cs = generate_checksum 1234L in
271271+ validate 1234L ~checksum:cs;; (* true *)
272272+ validate 1234L ~checksum:99L;; (* false *)
273273+ ]} *)
9227493275val generate_checksum : int64 -> int64
9494-(** [generate_checksum n] generates an ISO 7064 (mod 97-10) checksum for a number *)
276276+(** [generate_checksum n] computes an ISO 7064 (mod 97-10) checksum for a number.
277277+278278+ The ISO 7064 algorithm provides a checksum that can detect:
279279+ - All single-digit errors
280280+ - Most adjacent transposition errors
281281+ - Most twin errors (where two identical digits are replaced by two other identical digits)
282282+283283+ The checksum value is in the range 0-96 (or 00-96 when formatted with 2 digits).
284284+285285+ {b Examples:}
286286+ {[
287287+ generate_checksum 0L;; (* 1L *)
288288+ generate_checksum 1234L;; (* 48L *)
289289+ generate_checksum 123456L;; (* 87L *)
290290+ ]} *)