+22
-28
Diff
round #0
+11
-7
crates/nailkov/src/lib.rs
+11
-7
crates/nailkov/src/lib.rs
···
6
6
mod error;
7
7
mod token;
8
8
9
+
use core::hash::BuildHasherDefault;
10
+
9
11
use crossbeam_utils::CachePadded;
10
12
use error::NailError;
13
+
use estr::IdentityHasher;
11
14
use indexmap::IndexMap;
12
15
use itertools::Itertools;
13
16
use rand::{RngCore, seq::IteratorRandom};
14
17
use rand_distr::Distribution;
15
18
16
19
use distribution::{TokenWeights, TokenWeightsBuilder};
17
-
use rapidhash::fast::RandomState;
18
20
use token::{Token, TokenPair};
19
21
use unicode_segmentation::UnicodeSegmentation;
20
22
21
23
#[derive(Clone, Debug)]
22
24
pub struct NailKov {
23
-
chain: CachePadded<IndexMap<TokenPair, TokenWeights, RandomState>>,
25
+
chain: CachePadded<IndexMap<TokenPair, TokenWeights, BuildHasherDefault<IdentityHasher>>>,
24
26
}
25
27
26
28
pub struct NailKovIter<'a, R: RngCore> {
···
59
61
60
62
impl NailKov {
61
63
pub fn from_input(input: &str) -> Result<NailKov, NailError> {
62
-
NailBuilder::new(RandomState::new()).with_input(input)
64
+
NailBuilder::new(BuildHasherDefault::new()).with_input(input)
63
65
}
64
66
}
65
67
66
68
struct NailBuilder {
67
-
chain: IndexMap<TokenPair, TokenWeightsBuilder, RandomState>,
69
+
chain: IndexMap<TokenPair, TokenWeightsBuilder, BuildHasherDefault<IdentityHasher>>,
68
70
}
69
71
70
72
impl NailBuilder {
71
-
fn new(hasher: RandomState) -> Self {
73
+
fn new(hasher: BuildHasherDefault<IdentityHasher>) -> Self {
72
74
Self {
73
75
chain: IndexMap::with_hasher(hasher),
74
76
}
···
83
85
return Err(NailError::EmptyInput);
84
86
}
85
87
86
-
let chain: IndexMap<TokenPair, TokenWeights, RandomState> = self
88
+
let chain: IndexMap<TokenPair, TokenWeights, BuildHasherDefault<IdentityHasher>> = self
87
89
.chain
88
90
.into_iter()
89
91
.flat_map(|(pair, dist)| {
···
97
99
return Err(NailError::EmptyInput);
98
100
}
99
101
100
-
Ok(NailKov { chain: CachePadded::new(chain) })
102
+
Ok(NailKov {
103
+
chain: CachePadded::new(chain),
104
+
})
101
105
}
102
106
103
107
/// Add the occurrence of `next` following `prev`.
+11
-21
crates/nailkov/src/token.rs
+11
-21
crates/nailkov/src/token.rs
···
29
29
}
30
30
31
31
/// An owned pair of [`Token`]s.
32
-
#[derive(Copy, Clone, Debug)]
32
+
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
33
33
// Alignment repr necessary to allow LLVM to output better-optimized codegen.
34
-
// optimized codegen for `to_bits`, `PartialEq`
35
34
// Prior art taken from my contribution to Bevy:
36
35
// https://github.com/bevyengine/bevy/blob/main/crates/bevy_ecs/src/entity/mod.rs#L309
37
36
#[repr(C, align(16))]
38
37
pub struct TokenPair {
39
38
// Do not reorder the fields here. The ordering is explicitly used by repr(C)
40
-
// to make this struct equivalent to a u64.
39
+
// to make this struct equivalent to a u128.
41
40
#[cfg(target_endian = "little")]
42
41
pub left: Token,
43
42
pub right: Token,
···
45
44
pub left: Token,
46
45
}
47
46
48
-
// By not short-circuiting in comparisons, we get better codegen.
49
-
// See <https://github.com/rust-lang/rust/issues/117800>
50
-
impl PartialEq for TokenPair {
51
-
#[inline(always)]
52
-
fn eq(&self, other: &TokenPair) -> bool {
53
-
// By using `to_bits`, the codegen can be optimized out even
54
-
// further potentially. Relies on the correct alignment/field
55
-
// order of `TokenPair`.
56
-
self.to_bits() == other.to_bits()
57
-
}
58
-
}
59
-
60
-
impl Eq for TokenPair {}
61
-
62
47
impl core::hash::Hash for TokenPair {
63
-
#[inline(always)]
48
+
#[inline]
64
49
fn hash<H: core::hash::Hasher>(&self, state: &mut H) {
65
-
self.to_bits().hash(state);
50
+
self.double_hash().hash(state);
66
51
}
67
52
}
68
53
···
73
58
}
74
59
75
60
#[inline(always)]
76
-
fn to_bits(self) -> u128 {
77
-
(self.left.0.digest().hash() as u128) | ((self.right.0.digest().hash() as u128) << 64)
61
+
fn double_hash(&self) -> u64 {
62
+
self.right
63
+
.0
64
+
.digest()
65
+
.hash()
66
+
.wrapping_add(self.left.0.digest().hash())
67
+
.rotate_left(5)
78
68
}
79
69
}
80
70
History
3 rounds
0 comments
expand 0 comments
pull request successfully merged
1 commit
expand
collapse
perf: Improve TokenPair hashing for faster generation
2/2 success
expand
collapse
expand 0 comments
sachy.dev
submitted
#0
1 commit
expand
collapse
perf: Improve TokenPair hashing for faster generation