OCaml library for Crockford's Base32

+ocamlformat and docs

+230 -30
+1
.ocamlformat
··· 1 + version=0.27.0
+6 -3
bin/roguedoi.ml
··· 1 - (* roguedoi.ml - Generate random DOI identifiers with Crockford base32 encoding *) 1 + (*--------------------------------------------------------------------------- 2 + Copyright (c) 2025 Anil Madhavapeddy. All rights reserved. 3 + SPDX-License-Identifier: MIT 4 + ---------------------------------------------------------------------------*) 2 5 3 6 let generate_doi prefix length split = 4 7 Random.self_init (); 5 8 let suffix = Crockford.generate ~length ~split_every:split ~checksum:true () in 6 - Printf.printf "https://doi.org/%s/%s\n%!" prefix suffix 9 + Printf.printf "%s/%s\n%!" prefix suffix 7 10 8 11 let () = 9 12 let open Cmdliner in ··· 25 28 26 29 let generate_cmd = 27 30 let doc = "Generate a random DOI with Crockford base32 encoding" in 28 - let info = Cmd.info "roguedoi" ~version:"0.1.0" ~doc in 31 + let info = Cmd.info "roguedoi" ~version:"1.0.0" ~doc in 29 32 Cmd.v info Term.(const generate_doi $ prefix $ length $ split) 30 33 in 31 34
+2 -2
lib/crockford.ml
··· 166 166 167 167 !number 168 168 169 - let generate ~length ?(split_every=0) ?(checksum=false) () = 169 + let generate ~length ?(split_every=0) ?(checksum=false) ?(rng=Random.float) () = 170 170 if checksum && length < 3 then 171 171 raise (Decode_error (Invalid_length { 172 172 length; ··· 177 177 178 178 (* Generate random number between 0 and 32^length *) 179 179 let max_val = 32.0 ** float_of_int adjusted_length in 180 - let random_num = Int64.of_float (Random.float max_val) in 180 + let random_num = Int64.of_float (rng max_val) in 181 181 182 182 encode ~split_every ~min_length:adjusted_length ~checksum random_num
+221 -25
lib/crockford.mli
··· 3 3 SPDX-License-Identifier: MIT 4 4 ---------------------------------------------------------------------------*) 5 5 6 - (** Crockford Base32 encoding for OCaml *) 6 + (** Crockford Base32 encoding for OCaml 7 + 8 + {1 Overview} 9 + 10 + Crockford Base32 is a base-32 encoding scheme designed by Douglas Crockford 11 + for human-readable identifiers. It is particularly well-suited for use in URLs, 12 + user-facing identifiers, and systems where humans need to transcribe or 13 + communicate encoded values. It features: 14 + 15 + {ul 16 + {- {b Human-optimized alphabet}: Uses 32 characters (0-9, A-Z) but excludes 17 + letters that are easily confused: I, L, O, and U. This prevents common 18 + transcription errors.} 19 + {- {b Case-insensitive}: Both uppercase and lowercase letters are accepted 20 + during decoding, making it forgiving of human input.} 21 + {- {b Confusable character mapping}: When decoding, the letters I and L are 22 + automatically mapped to 1, and O is mapped to 0, further reducing 23 + transcription errors.} 24 + {- {b Hyphenation support}: Hyphens can be included for readability and are 25 + automatically ignored during decoding.} 26 + {- {b Optional checksums}: Supports ISO 7064 (mod 97-10) checksums to detect 27 + transcription errors. The checksum is encoded as two additional characters.} 28 + {- {b URL-safe}: All characters in the encoding are safe for use in URLs 29 + without escaping.} 30 + } 31 + 32 + {2 The Encoding Alphabet} 33 + 34 + The 32-character alphabet is: [0123456789ABCDEFGHJKMNPQRSTVWXYZ] 35 + 36 + Note the absence of I, L, O, and U to avoid confusion with 1, 1, 0, and V 37 + respectively. 38 + 39 + {2 Comparison with Other Encodings} 40 + 41 + {ul 42 + {- {b vs. Base64}: Crockford Base32 is more human-friendly due to its reduced 43 + character set and case-insensitivity, though it produces slightly longer 44 + strings (base32 uses 5 bits per character vs base64's 6 bits).} 45 + {- {b vs. 
Hexadecimal}: Base32 produces shorter strings than hex (which uses 46 + only 4 bits per character) and includes more letters for better distribution.} 47 + {- {b vs. Standard Base32 (RFC 4648)}: Crockford's variant is specifically 48 + optimized for human readability with its character mappings and exclusions.} 49 + } 50 + 51 + {1 Examples} 52 + 53 + {[ 54 + (* Basic encoding *) 55 + let id = encode 123456789L;; 56 + (* Result: "3nqk8n" *) 57 + 58 + (* Encoding with hyphenation for readability *) 59 + let id = encode ~split_every:4 123456789L;; 60 + (* Result: "3nqk-8n" *) 61 + 62 + (* Encoding with checksum for error detection *) 63 + let id = encode ~checksum:true 123456789L;; 64 + (* Result: "3nqk8n87" (last two digits are checksum) *) 65 + 66 + (* Generate a random 8-character identifier *) 67 + Random.self_init ();; 68 + let random_id = generate ~length:8 ();; 69 + (* Result: something like "n4g9k2c7" *) 70 + 71 + (* Generate with checksum and hyphenation *) 72 + let safe_id = generate ~length:10 ~split_every:5 ~checksum:true ();; 73 + (* Result: something like "a3k2x-9m482" *) 74 + 75 + (* Decoding is case-insensitive and ignores hyphens *) 76 + let n = decode "3NQK-8N";; 77 + (* Result: 123456789L *) 78 + 79 + (* Decode with checksum validation *) 80 + let n = decode ~checksum:true "3nqk8n87";; 81 + (* Result: 123456789L (or raises Decode_error if checksum invalid) *) 82 + ]} 83 + 84 + {1 API Documentation} *) 85 + 86 + (** {1 ID Generation} 87 + 88 + Generate random identifiers in Crockford base32 format. This is useful for 89 + creating unique, human-readable IDs for databases, URLs, or user-facing 90 + reference numbers. *) 91 + 92 + val generate : 93 + length:int -> 94 + ?split_every:int -> 95 + ?checksum:bool -> 96 + ?rng:(float -> float) -> 97 + unit -> string 98 + (** [generate ~length ?split_every ?checksum ?rng ()] generates a random Crockford base32 string. 
99 + 100 + This function creates a random identifier by generating a random integer and 101 + encoding it using the Crockford base32 alphabet. The generated IDs are suitable 102 + for use as database keys, URL-safe identifiers, or user-visible reference numbers. 103 + 104 + When using the default [Random.float] generator, you must initialize the 105 + random number generator with {!Random.self_init} before calling this function. 106 + 107 + @param length The target length of the generated string. When [checksum:false], 108 + this is the exact output length. When [checksum:true], this is the 109 + total length including the 2-character checksum, so the random 110 + portion will be [length - 2] characters. 111 + @param split_every Insert hyphens every N characters for improved readability. 112 + For example, [~split_every:4] might produce [3a7k-m9n2]. 113 + Default: no splitting. 114 + @param checksum Append a 2-character ISO 7064 checksum for error detection. 115 + Useful when IDs will be manually transcribed. When [true], 116 + the total output length (including checksum) will be [length]. 117 + Default: [false]. 118 + @param rng Custom random number generator function that takes a float [bound] and 119 + returns a random float between [0.] and [bound]. This allows for 120 + deterministic testing or custom entropy sources. Defaults to using {!Random.float}. 121 + @raise Decode_error with [Invalid_length] if [checksum] is [true] and [length < 3] 122 + as at least 1 character is needed for the ID and 2 for the checksum. 
*) 123 + 7 124 8 125 (** {1 Error Types} *) 9 126 ··· 47 164 (** {1 Constants} *) 48 165 49 166 val encoding_chars : string 50 - (** The Crockford base32 encoding alphabet (excludes i, l, o, u) *) 167 + (** The Crockford base32 encoding alphabet: ["0123456789abcdefghjkmnpqrstvwxyz"] 51 168 52 - (** {1 Encoding and Decoding} *) 169 + This 32-character alphabet excludes I, L, O, and U to prevent confusion with 170 + visually similar characters (1, 1, 0, and V). The alphabet is case-insensitive 171 + for decoding but returned in lowercase by encoding functions. *) 172 + 173 + (** {1 Encoding and Decoding} 174 + 175 + The core encoding and decoding functions convert between 64-bit integers and 176 + their Crockford base32 string representations. *) 53 177 54 178 val encode : 55 179 ?split_every:int -> ··· 57 181 ?checksum:bool -> 58 182 int64 -> string 59 183 (** [encode ?split_every ?min_length ?checksum n] encodes an int64 to a Crockford base32 string. 60 - @param split_every Split the output with '-' every n characters (default: no splitting) 61 - @param min_length Pad with zeros to this minimum length (default: no padding) 62 - @param checksum Append ISO 7064 checksum as 2 digits (default: false) *) 184 + 185 + The function converts a 64-bit integer into a base32 representation using the 186 + Crockford alphabet. The encoding process divides the number by 32 repeatedly, 187 + using the remainder as an index into the alphabet. 188 + 189 + @param split_every Insert hyphens every N characters for readability. For example, 190 + [split_every:4] converts ["abcd1234"] to ["abcd-1234"]. Hyphens 191 + are ignored during decoding. Default: no splitting. 192 + @param min_length Pad the output with leading zeros to reach this minimum length. 193 + When [checksum:true], the minimum length includes the 2-character 194 + checksum. Default: no padding. 195 + @param checksum Append a 2-digit ISO 7064 (mod 97-10) checksum to detect transcription 196 + errors. 
The checksum is computed on the original number and encoded 197 + as two additional base32 characters. Default: [false]. 198 + 199 + {b Examples:} 200 + {[ 201 + encode 0L;; (* "0" *) 202 + encode 1234L;; (* "16j" *) 203 + encode ~min_length:6 1234L;; (* "00016j" *) 204 + encode ~split_every:3 123456L;; (* "3rj-0" *) 205 + encode ~checksum:true 1234L;; (* "16j48" *) 206 + encode ~min_length:8 ~checksum:true ~split_every:4 1234L;; (* "0001-6j48" *) 207 + ]} *) 63 208 64 209 val decode : ?checksum:bool -> string -> int64 65 210 (** [decode ?checksum str] decodes a Crockford base32 string to int64. 66 - @param checksum Expect and validate ISO 7064 checksum (default: false) 67 - @raise Decode_error if decoding fails (invalid characters, invalid checksum format, or checksum mismatch) *) 68 211 69 - (** {1 ID Generation} *) 212 + The function is designed to be forgiving of human input: 213 + - Case-insensitive: accepts both uppercase and lowercase letters 214 + - Strips hyphens automatically 215 + - Maps confusable characters: I/i and L/l → 1, O/o → 0 216 + 217 + @param checksum Expect and validate the last 2 characters as an ISO 7064 checksum. 218 + If [true], the function verifies that the checksum matches the 219 + decoded value. Default: [false]. 220 + 221 + @raise Decode_error with one of the following variants: 222 + - [Invalid_character] if an unrecognized character is encountered 223 + - [Invalid_checksum] if [checksum:true] but the string is too short or checksum has invalid format 224 + - [Checksum_mismatch] if the checksum doesn't match the decoded value 70 225 71 - val generate : 72 - length:int -> 73 - ?split_every:int -> 74 - ?checksum:bool -> 75 - unit -> string 76 - (** [generate ~length ?split_every ?checksum ()] generates a random Crockford base32 string. 
77 - @param length The length of the generated string (excluding checksum) 78 - @param split_every Split the output with '-' every n characters (default: no splitting) 79 - @param checksum Append ISO 7064 checksum as 2 digits (default: false) 80 - @raise Decode_error if checksum is true and length < 3 226 + {b Examples:} 227 + {[ 228 + decode "16j";; (* 1234L *) 229 + decode "16J";; (* 1234L - case insensitive *) 230 + decode "1-6-j";; (* 1234L - hyphens ignored *) 231 + decode "I6j";; (* 1234L - 'I' mapped to '1' *) 232 + decode ~checksum:true "16j48";; (* 1234L - with checksum validation *) 233 + ]} *) 81 234 82 - Note: Caller must initialize Random module with {!Random.self_init} before use *) 235 + (** {1 Utility Functions} 83 236 84 - (** {1 Utility Functions} *) 237 + Low-level functions for working with Crockford base32 strings and checksums. *) 85 238 86 239 val normalize : string -> string 87 - (** [normalize str] normalizes a string for decoding by converting to lowercase, 88 - removing dashes, and mapping confusable characters (i→1, l→1, o→0) *) 240 + (** [normalize str] normalizes a string for decoding. 241 + 242 + This function prepares a potentially messy human input string for decoding by: 243 + - Converting all characters to lowercase 244 + - Removing all hyphens ([-]) 245 + - Mapping confusable characters: [I] and [L] → [1], [O] → [0] 246 + 247 + This is automatically called by {!decode}, but is exposed for cases where 248 + you want to normalize strings before storage or comparison. 
249 + 250 + {b Examples:} 251 + {[ 252 + normalize "ABC-123";; (* "abc123" *) 253 + normalize "IlO";; (* "110" - confusables mapped *) 254 + normalize "A-B-C";; (* "abc" - hyphens removed *) 255 + normalize "HELLO";; (* "he110" - 'L's and 'O' mapped *) 256 + ]} *) 89 257 90 258 val validate : int64 -> checksum:int64 -> bool 91 - (** [validate n ~checksum] validates that a checksum matches the number *) 259 + (** [validate n ~checksum] validates that a checksum matches the expected value for a number. 260 + 261 + This function computes the ISO 7064 (mod 97-10) checksum for the given number 262 + and compares it with the provided checksum value. 263 + 264 + @param n The integer value to validate 265 + @param checksum The expected checksum value (0-96) 266 + @return [true] if the checksum is valid, [false] otherwise 267 + 268 + {b Examples:} 269 + {[ 270 + let cs = generate_checksum 1234L in 271 + validate 1234L ~checksum:cs;; (* true *) 272 + validate 1234L ~checksum:99L;; (* false *) 273 + ]} *) 92 274 93 275 val generate_checksum : int64 -> int64 94 - (** [generate_checksum n] generates an ISO 7064 (mod 97-10) checksum for a number *) 276 + (** [generate_checksum n] computes an ISO 7064 (mod 97-10) checksum for a number. 277 + 278 + The ISO 7064 algorithm provides a checksum that can detect: 279 + - All single-digit errors 280 + - Most adjacent transposition errors 281 + - Most twin errors (where two identical digits are replaced by two other identical digits) 282 + 283 + The checksum value is in the range 0-96 (or 00-96 when formatted with 2 digits). 284 + 285 + {b Examples:} 286 + {[ 287 + generate_checksum 0L;; (* 1L *) 288 + generate_checksum 1234L;; (* 48L *) 289 + generate_checksum 123456L;; (* 87L *) 290 + ]} *)