From a518ab62bb22f8055a2b8aa89d10b5779839984d Mon Sep 17 00:00:00 2001 From: Stephen Chung Date: Tue, 27 Sep 2022 08:52:51 +0800 Subject: [PATCH] Simplify strings interner. --- src/parser.rs | 25 ++++++++------ src/types/interner.rs | 76 ++++++++++++++++++++++++------------------- 2 files changed, 57 insertions(+), 44 deletions(-) diff --git a/src/parser.rs b/src/parser.rs index 9941fc53..6cdf3d18 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -47,9 +47,6 @@ const NEVER_ENDS: &str = "`Token`"; /// Unroll `switch` ranges no larger than this. const SMALL_SWITCH_RANGE: INT = 16; -/// Number of string interners used: two additional for property getters/setters if not `no_object` -const NUM_INTERNERS: usize = if cfg!(feature = "no_object") { 1 } else { 3 }; - /// _(internals)_ A type that encapsulates the current state of the parser. /// Exported under the `internals` feature only. pub struct ParseState<'e> { @@ -58,7 +55,7 @@ pub struct ParseState<'e> { /// Controls whether parsing of an expression should stop given the next token. pub expr_filter: fn(&Token) -> bool, /// String interners. - interned_strings: [StringsInterner<'e>; NUM_INTERNERS], + interned_strings: StringsInterner<'e>, /// External [scope][Scope] with constants. pub scope: &'e Scope<'e>, /// Global runtime state. @@ -88,6 +85,8 @@ pub struct ParseState<'e> { } impl fmt::Debug for ParseState<'_> { + #[cold] + #[inline(never)] fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { let mut f = f.debug_struct("ParseState"); @@ -116,7 +115,7 @@ impl<'e> ParseState<'e> { pub fn new( engine: &Engine, scope: &'e Scope, - interned_strings: [StringsInterner<'e>; NUM_INTERNERS], + interned_strings: StringsInterner<'e>, tokenizer_control: TokenizerControl, ) -> Self { Self { @@ -254,7 +253,7 @@ impl<'e> ParseState<'e> { &mut self, text: impl AsRef + Into, ) -> ImmutableString { - self.interned_strings[0].get(text) + self.interned_strings.get(text) } /// Get an interned property getter, creating one if it is not yet interned. @@ -265,8 +264,11 @@ impl<'e> ParseState<'e> { &mut self, text: impl AsRef + Into, ) -> ImmutableString { - self.interned_strings[1] - .get_with_mapper(|s| crate::engine::make_getter(s.as_ref()).into(), text) + self.interned_strings.get_with_mapper( + crate::engine::FN_GET, + |s| crate::engine::make_getter(s.as_ref()).into(), + text, + ) } /// Get an interned property setter, creating one if it is not yet interned. @@ -277,8 +279,11 @@ impl<'e> ParseState<'e> { &mut self, text: impl AsRef + Into, ) -> ImmutableString { - self.interned_strings[2] - .get_with_mapper(|s| crate::engine::make_setter(s.as_ref()).into(), text) + self.interned_strings.get_with_mapper( + crate::engine::FN_SET, + |s| crate::engine::make_setter(s.as_ref()).into(), + text, + ) } } diff --git a/src/types/interner.rs b/src/types/interner.rs index 3dfa1048..7ae35a35 100644 --- a/src/types/interner.rs +++ b/src/types/interner.rs @@ -1,3 +1,4 @@ +use super::BloomFilterU64; use crate::func::{hashing::get_hasher, StraightHashMap}; use crate::ImmutableString; #[cfg(feature = "no_std")] @@ -14,7 +15,7 @@ use std::{ }; /// Maximum number of strings interned. -pub const MAX_INTERNED_STRINGS: usize = 256; +pub const MAX_INTERNED_STRINGS: usize = 1024; /// Maximum length of strings interned. pub const MAX_STRING_LEN: usize = 24; @@ -28,8 +29,10 @@ pub struct StringsInterner<'a> { pub capacity: usize, /// Maximum string length. pub max_string_len: usize, - /// Normal strings. - strings: StraightHashMap, + /// Cached strings. + cache: StraightHashMap, + /// Bloom filter to avoid caching "one-hit wonders". + filter: BloomFilterU64, /// Take care of the lifetime parameter. dummy: PhantomData<&'a ()>, } @@ -42,9 +45,10 @@ impl Default for StringsInterner<'_> { } impl fmt::Debug for StringsInterner<'_> { - #[inline] + #[cold] + #[inline(never)] fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.debug_list().entries(self.strings.values()).finish() + f.debug_list().entries(self.cache.values()).finish() } } @@ -56,7 +60,8 @@ impl StringsInterner<'_> { Self { capacity: MAX_INTERNED_STRINGS, max_string_len: MAX_STRING_LEN, - strings: StraightHashMap::default(), + cache: StraightHashMap::default(), + filter: BloomFilterU64::new(), dummy: PhantomData, } } @@ -65,7 +70,7 @@ impl StringsInterner<'_> { #[inline(always)] #[must_use] pub fn get + Into>(&mut self, text: S) -> ImmutableString { - self.get_with_mapper(Into::into, text) + self.get_with_mapper("", Into::into, text) } /// Get an identifier from a text string, adding it to the interner if necessary. @@ -73,20 +78,23 @@ impl StringsInterner<'_> { #[must_use] pub fn get_with_mapper>( &mut self, + id: &str, mapper: impl Fn(S) -> ImmutableString, text: S, ) -> ImmutableString { let key = text.as_ref(); - if key.len() > MAX_STRING_LEN { + let hasher = &mut get_hasher(); + id.hash(hasher); + key.hash(hasher); + let hash = hasher.finish(); + + // Cache long strings only on the second try to avoid caching "one-hit wonders". + if key.len() > MAX_STRING_LEN && self.filter.is_absent_and_set(hash) { return mapper(text); } - let hasher = &mut get_hasher(); - key.hash(hasher); - let key = hasher.finish(); - - let result = match self.strings.entry(key) { + let result = match self.cache.entry(hash) { Entry::Occupied(e) => return e.get().clone(), Entry::Vacant(e) => { let value = mapper(text); @@ -100,7 +108,7 @@ impl StringsInterner<'_> { }; // If the interner is over capacity, remove the longest entry that has the lowest count - if self.strings.len() > self.capacity { + if self.cache.len() > self.capacity { // Leave some buffer to grow when shrinking the cache. // We leave at least two entries, one for the empty string, and one for the string // that has just been inserted. @@ -110,21 +118,21 @@ impl StringsInterner<'_> { self.capacity - 3 }; - while self.strings.len() > max { - let (_, _, n) = - self.strings - .iter() - .fold((0, usize::MAX, 0), |(x, c, n), (&k, v)| { - if k != key - && (v.strong_count() < c || (v.strong_count() == c && v.len() > x)) - { - (v.len(), v.strong_count(), k) - } else { - (x, c, n) - } - }); + while self.cache.len() > max { + let (_, _, n) = self + .cache + .iter() + .fold((0, usize::MAX, 0), |(x, c, n), (&k, v)| { + if k != hash + && (v.strong_count() < c || (v.strong_count() == c && v.len() > x)) + { + (v.len(), v.strong_count(), k) + } else { + (x, c, n) + } + }); - self.strings.remove(&n); + self.cache.remove(&n); } } @@ -136,7 +144,7 @@ impl StringsInterner<'_> { #[must_use] #[allow(dead_code)] pub fn len(&self) -> usize { - self.strings.len() + self.cache.len() } /// Returns `true` if there are no interned strings. @@ -144,28 +152,28 @@ impl StringsInterner<'_> { #[must_use] #[allow(dead_code)] pub fn is_empty(&self) -> bool { - self.strings.is_empty() + self.cache.is_empty() } /// Clear all interned strings. #[inline(always)] #[allow(dead_code)] pub fn clear(&mut self) { - self.strings.clear(); + self.cache.clear(); } } impl AddAssign for StringsInterner<'_> { #[inline(always)] fn add_assign(&mut self, rhs: Self) { - self.strings.extend(rhs.strings.into_iter()); + self.cache.extend(rhs.cache.into_iter()); } } impl AddAssign<&Self> for StringsInterner<'_> { #[inline(always)] fn add_assign(&mut self, rhs: &Self) { - self.strings - .extend(rhs.strings.iter().map(|(&k, v)| (k, v.clone()))); + self.cache + .extend(rhs.cache.iter().map(|(&k, v)| (k, v.clone()))); } }