perf: Improve TokenPair hashing for faster generation #6

+17 -12

3 changed files

Interdiff #0 → #1

expand all

crates

nailkov

src

distribution.rs

lib.rs

token.rs

+9 -5

crates/nailkov/src/lib.rs

··· 20 20 use token::{Token, TokenPair}; 21 21 use unicode_segmentation::UnicodeSegmentation; 22 22 23 + /// `nailkov` relies on `estr`'s precomputed hashes, so we avoid 24 + /// hashing ourselves and can just use the precomputed hashes instead. 25 + type TokenHasher = BuildHasherDefault<IdentityHasher>; 26 + 23 27 #[derive(Clone, Debug)] 24 28 pub struct NailKov { 25 - chain: CachePadded<IndexMap<TokenPair, TokenWeights, BuildHasherDefault<IdentityHasher>>>, 29 + chain: CachePadded<IndexMap<TokenPair, TokenWeights, TokenHasher>>, 26 30 } 27 31 28 32 pub struct NailKovIter<'a, R: RngCore> { ··· 61 65 62 66 impl NailKov { 63 67 pub fn from_input(input: &str) -> Result<NailKov, NailError> { 64 - NailBuilder::new(BuildHasherDefault::new()).with_input(input) 68 + NailBuilder::new(TokenHasher::new()).with_input(input) 65 69 } 66 70 } 67 71 68 72 struct NailBuilder { 69 - chain: IndexMap<TokenPair, TokenWeightsBuilder, BuildHasherDefault<IdentityHasher>>, 73 + chain: IndexMap<TokenPair, TokenWeightsBuilder, TokenHasher>, 70 74 } 71 75 72 76 impl NailBuilder { 73 - fn new(hasher: BuildHasherDefault<IdentityHasher>) -> Self { 77 + fn new(hasher: TokenHasher) -> Self { 74 78 Self { 75 79 chain: IndexMap::with_hasher(hasher), 76 80 } ··· 85 89 return Err(NailError::EmptyInput); 86 90 } 87 91 88 - let chain: IndexMap<TokenPair, TokenWeights, BuildHasherDefault<IdentityHasher>> = self 92 + let chain: IndexMap<TokenPair, TokenWeights, TokenHasher> = self 89 93 .chain 90 94 .into_iter() 91 95 .flat_map(|(pair, dist)| {

+6 -2

crates/nailkov/src/token.rs

··· 47 47 impl core::hash::Hash for TokenPair { 48 48 #[inline] 49 49 fn hash<H: core::hash::Hasher>(&self, state: &mut H) { 50 + // Use only with an IdentityHasher so that you don't rehash the hash 50 51 self.double_hash().hash(state); 51 52 } 52 53 } ··· 57 58 Self { left, right } 58 59 } 59 60 61 + /// Use the precomputed hashes to generate a secondary hash. 62 + /// Method from [fastbloom](https://github.com/tomtomwombat/fastbloom/blob/main/src/hasher.rs#L190), 63 + /// which was in turn adapted from <https://www.eecs.harvard.edu/~michaelm/postscripts/rsa2008.pdf>. 60 64 #[inline(always)] 61 65 fn double_hash(&self) -> u64 { 62 - self.right 66 + self.left 63 67 .0 64 68 .digest() 65 69 .hash() 66 - .wrapping_add(self.left.0.digest().hash()) 70 + .wrapping_add(self.right.0.digest().hash()) 67 71 .rotate_left(5) 68 72 } 69 73 }

+2 -5

crates/nailkov/src/distribution.rs

··· 1 1 //! [`TokenWeights`] are representations of how common [`Token`]s are, and are paired up with 2 2 //! a [`TokenPair`](crate::token::TokenPair) in a [`NailKov`](crate::NailKov). 3 3 4 - use core::hash::BuildHasherDefault; 5 - 6 - use estr::IdentityHasher; 7 4 use indexmap::IndexMap; 8 5 use rand::Rng; 9 6 use rand_distr::{Distribution, weighted::WeightedAliasIndex}; 10 7 11 - use crate::{error::NailError, token::Token}; 8 + use crate::{TokenHasher, error::NailError, token::Token}; 12 9 13 10 /// A distribution of choices and their likelihood. 14 11 #[derive(Clone, Debug)] ··· 32 29 #[derive(Clone, Debug)] 33 30 pub struct TokenWeightsBuilder { 34 31 /// Counts how many times a token is likely to appear. 35 - occurrences: IndexMap<Token, u32, BuildHasherDefault<IdentityHasher>>, 32 + occurrences: IndexMap<Token, u32, TokenHasher>, 36 33 } 37 34 38 35 impl TokenWeightsBuilder {