+32
-33
Diff
round #1
+2
-5
crates/nailkov/src/distribution.rs
+2
-5
crates/nailkov/src/distribution.rs
···
1
1
//! [`TokenWeights`] are representations of how common [`Token`]s are, and are paired up with
2
2
//! a [`TokenPair`](crate::token::TokenPair) in a [`NailKov`](crate::NailKov).
3
3
4
-
use core::hash::BuildHasherDefault;
5
-
6
-
use estr::IdentityHasher;
7
4
use indexmap::IndexMap;
8
5
use rand::Rng;
9
6
use rand_distr::{Distribution, weighted::WeightedAliasIndex};
10
7
11
-
use crate::{error::NailError, token::Token};
8
+
use crate::{TokenHasher, error::NailError, token::Token};
12
9
13
10
/// A distribution of choices and their likelihood.
14
11
#[derive(Clone, Debug)]
···
32
29
#[derive(Clone, Debug)]
33
30
pub struct TokenWeightsBuilder {
34
31
/// Counts how many times a token is likely to appear.
35
-
occurrences: IndexMap<Token, u32, BuildHasherDefault<IdentityHasher>>,
32
+
occurrences: IndexMap<Token, u32, TokenHasher>,
36
33
}
37
34
38
35
impl TokenWeightsBuilder {
+15
-7
crates/nailkov/src/lib.rs
+15
-7
crates/nailkov/src/lib.rs
···
6
6
mod error;
7
7
mod token;
8
8
9
+
use core::hash::BuildHasherDefault;
10
+
9
11
use crossbeam_utils::CachePadded;
10
12
use error::NailError;
13
+
use estr::IdentityHasher;
11
14
use indexmap::IndexMap;
12
15
use itertools::Itertools;
13
16
use rand::{RngCore, seq::IteratorRandom};
14
17
use rand_distr::Distribution;
15
18
16
19
use distribution::{TokenWeights, TokenWeightsBuilder};
17
-
use rapidhash::fast::RandomState;
18
20
use token::{Token, TokenPair};
19
21
use unicode_segmentation::UnicodeSegmentation;
20
22
23
+
/// `nailkov` relies on `estr`'s precomputed hashes, so we avoid
24
+
/// hashing ourselves and can just use the precomputed hashes instead.
25
+
type TokenHasher = BuildHasherDefault<IdentityHasher>;
26
+
21
27
#[derive(Clone, Debug)]
22
28
pub struct NailKov {
23
-
chain: CachePadded<IndexMap<TokenPair, TokenWeights, RandomState>>,
29
+
chain: CachePadded<IndexMap<TokenPair, TokenWeights, TokenHasher>>,
24
30
}
25
31
26
32
pub struct NailKovIter<'a, R: RngCore> {
···
59
65
60
66
impl NailKov {
61
67
pub fn from_input(input: &str) -> Result<NailKov, NailError> {
62
-
NailBuilder::new(RandomState::new()).with_input(input)
68
+
NailBuilder::new(TokenHasher::new()).with_input(input)
63
69
}
64
70
}
65
71
66
72
struct NailBuilder {
67
-
chain: IndexMap<TokenPair, TokenWeightsBuilder, RandomState>,
73
+
chain: IndexMap<TokenPair, TokenWeightsBuilder, TokenHasher>,
68
74
}
69
75
70
76
impl NailBuilder {
71
-
fn new(hasher: RandomState) -> Self {
77
+
fn new(hasher: TokenHasher) -> Self {
72
78
Self {
73
79
chain: IndexMap::with_hasher(hasher),
74
80
}
···
83
89
return Err(NailError::EmptyInput);
84
90
}
85
91
86
-
let chain: IndexMap<TokenPair, TokenWeights, RandomState> = self
92
+
let chain: IndexMap<TokenPair, TokenWeights, TokenHasher> = self
87
93
.chain
88
94
.into_iter()
89
95
.flat_map(|(pair, dist)| {
···
97
103
return Err(NailError::EmptyInput);
98
104
}
99
105
100
-
Ok(NailKov { chain: CachePadded::new(chain) })
106
+
Ok(NailKov {
107
+
chain: CachePadded::new(chain),
108
+
})
101
109
}
102
110
103
111
/// Add the occurrence of `next` following `prev`.
+15
-21
crates/nailkov/src/token.rs
+15
-21
crates/nailkov/src/token.rs
···
29
29
}
30
30
31
31
/// An owned pair of [`Token`]s.
32
-
#[derive(Copy, Clone, Debug)]
32
+
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
33
33
// Alignment repr necessary to allow LLVM to output better-optimized codegen.
34
-
// optimized codegen for `to_bits`, `PartialEq`
35
34
// Prior art taken from my contribution to Bevy:
36
35
// https://github.com/bevyengine/bevy/blob/main/crates/bevy_ecs/src/entity/mod.rs#L309
37
36
#[repr(C, align(16))]
38
37
pub struct TokenPair {
39
38
// Do not reorder the fields here. The ordering is explicitly used by repr(C)
40
-
// to make this struct equivalent to a u64.
39
+
// to make this struct equivalent to a u128.
41
40
#[cfg(target_endian = "little")]
42
41
pub left: Token,
43
42
pub right: Token,
···
45
44
pub left: Token,
46
45
}
47
46
48
-
// By not short-circuiting in comparisons, we get better codegen.
49
-
// See <https://github.com/rust-lang/rust/issues/117800>
50
-
impl PartialEq for TokenPair {
51
-
#[inline(always)]
52
-
fn eq(&self, other: &TokenPair) -> bool {
53
-
// By using `to_bits`, the codegen can be optimized out even
54
-
// further potentially. Relies on the correct alignment/field
55
-
// order of `TokenPair`.
56
-
self.to_bits() == other.to_bits()
57
-
}
58
-
}
59
-
60
-
impl Eq for TokenPair {}
61
-
62
47
impl core::hash::Hash for TokenPair {
63
-
#[inline(always)]
48
+
#[inline]
64
49
fn hash<H: core::hash::Hasher>(&self, state: &mut H) {
65
-
self.to_bits().hash(state);
50
+
// Only meaningful with an `IdentityHasher`: the value written here is already a hash, so any other hasher would just rehash it.
51
+
self.double_hash().hash(state);
66
52
}
67
53
}
68
54
···
72
58
Self { left, right }
73
59
}
74
60
61
+
/// Combine both tokens' precomputed hashes into one `u64` (note: the sum is symmetric, so swapping `left` and `right` gives the same hash).
62
+
/// Method from [fastbloom](https://github.com/tomtomwombat/fastbloom/blob/main/src/hasher.rs#L190),
63
+
/// which was in turn adapted from <https://www.eecs.harvard.edu/~michaelm/postscripts/rsa2008.pdf>.
75
64
#[inline(always)]
76
-
fn to_bits(self) -> u128 {
77
-
(self.left.0.digest().hash() as u128) | ((self.right.0.digest().hash() as u128) << 64)
65
+
fn double_hash(&self) -> u64 {
66
+
self.left
67
+
.0
68
+
.digest()
69
+
.hash()
70
+
.wrapping_add(self.right.0.digest().hash())
71
+
.rotate_left(5)
78
72
}
79
73
}
80
74
History
3 rounds
0 comments
expand 0 comments
pull request successfully merged
1 commit
expand
collapse
perf: Improve TokenPair hashing for faster generation
2/2 success
expand
collapse
expand 0 comments
1 commit
expand
collapse
perf: Improve TokenPair hashing for faster generation