+9
-5
crates/nailkov/src/lib.rs
+9
-5
crates/nailkov/src/lib.rs
···
20
20
use token::{Token, TokenPair};
21
21
use unicode_segmentation::UnicodeSegmentation;
22
22
23
+
/// `nailkov` relies on `estr`'s precomputed hashes, so we avoid
24
+
/// hashing ourselves and can just use the precomputed hashes instead.
25
+
type TokenHasher = BuildHasherDefault<IdentityHasher>;
26
+
23
27
#[derive(Clone, Debug)]
24
28
pub struct NailKov {
25
-
chain: CachePadded<IndexMap<TokenPair, TokenWeights, BuildHasherDefault<IdentityHasher>>>,
29
+
chain: CachePadded<IndexMap<TokenPair, TokenWeights, TokenHasher>>,
26
30
}
27
31
28
32
pub struct NailKovIter<'a, R: RngCore> {
···
61
65
62
66
impl NailKov {
63
67
pub fn from_input(input: &str) -> Result<NailKov, NailError> {
64
-
NailBuilder::new(BuildHasherDefault::new()).with_input(input)
68
+
NailBuilder::new(TokenHasher::new()).with_input(input)
65
69
}
66
70
}
67
71
68
72
struct NailBuilder {
69
-
chain: IndexMap<TokenPair, TokenWeightsBuilder, BuildHasherDefault<IdentityHasher>>,
73
+
chain: IndexMap<TokenPair, TokenWeightsBuilder, TokenHasher>,
70
74
}
71
75
72
76
impl NailBuilder {
73
-
fn new(hasher: BuildHasherDefault<IdentityHasher>) -> Self {
77
+
fn new(hasher: TokenHasher) -> Self {
74
78
Self {
75
79
chain: IndexMap::with_hasher(hasher),
76
80
}
···
85
89
return Err(NailError::EmptyInput);
86
90
}
87
91
88
-
let chain: IndexMap<TokenPair, TokenWeights, BuildHasherDefault<IdentityHasher>> = self
92
+
let chain: IndexMap<TokenPair, TokenWeights, TokenHasher> = self
89
93
.chain
90
94
.into_iter()
91
95
.flat_map(|(pair, dist)| {
+6
-2
crates/nailkov/src/token.rs
+6
-2
crates/nailkov/src/token.rs
···
47
47
impl core::hash::Hash for TokenPair {
48
48
#[inline]
49
49
fn hash<H: core::hash::Hasher>(&self, state: &mut H) {
50
+
// Use only with an IdentityHasher so that you don't rehash the hash
50
51
self.double_hash().hash(state);
51
52
}
52
53
}
···
57
58
Self { left, right }
58
59
}
59
60
61
+
/// Use the precomputed hashes to generate a secondary hash.
62
+
/// Method from [fastbloom](https://github.com/tomtomwombat/fastbloom/blob/main/src/hasher.rs#L190),
63
+
/// which was in turn adapted from <https://www.eecs.harvard.edu/~michaelm/postscripts/rsa2008.pdf>.
60
64
#[inline(always)]
61
65
fn double_hash(&self) -> u64 {
62
-
self.right
66
+
self.left
63
67
.0
64
68
.digest()
65
69
.hash()
66
-
.wrapping_add(self.left.0.digest().hash())
70
+
.wrapping_add(self.right.0.digest().hash())
67
71
.rotate_left(5)
68
72
}
69
73
}
+2
-5
crates/nailkov/src/distribution.rs
+2
-5
crates/nailkov/src/distribution.rs
···
1
1
//! [`TokenWeights`] are representations of how common [`Token`]s are, and are paired up with
2
2
//! a [`TokenPair`](crate::token::TokenPair) in a [`NailKov`](crate::NailKov).
3
3
4
-
use core::hash::BuildHasherDefault;
5
-
6
-
use estr::IdentityHasher;
7
4
use indexmap::IndexMap;
8
5
use rand::Rng;
9
6
use rand_distr::{Distribution, weighted::WeightedAliasIndex};
10
7
11
-
use crate::{error::NailError, token::Token};
8
+
use crate::{TokenHasher, error::NailError, token::Token};
12
9
13
10
/// A distribution of choices and their likelihood.
14
11
#[derive(Clone, Debug)]
···
32
29
#[derive(Clone, Debug)]
33
30
pub struct TokenWeightsBuilder {
34
31
/// Counts how many times a token is likely to appear.
35
-
occurrences: IndexMap<Token, u32, BuildHasherDefault<IdentityHasher>>,
32
+
occurrences: IndexMap<Token, u32, TokenHasher>,
36
33
}
37
34
38
35
impl TokenWeightsBuilder {
History
3 rounds
0 comments
expand 0 comments
pull request successfully merged
1 commit
expand
collapse
perf: Improve TokenPair hashing for faster generation
2/2 success
expand
collapse
expand 0 comments
1 commit
expand
collapse
perf: Improve TokenPair hashing for faster generation