-
-
Save CAD97/036c700fad1b4b159421eca089783122 to your computer and use it in GitHub Desktop.
raw_entry string interner
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#![forbid(unsafe_code)] // 🎉 | |
use { | |
hashbrown::hash_map::{DefaultHashBuilder, HashMap, RawEntryMut}, | |
std::{ | |
convert::TryInto, | |
hash::{BuildHasher, Hash, Hasher}, | |
ops::Index, | |
}, | |
}; | |
trait BuildHasherExt { | |
fn hash<T: ?Sized + Hash>(&self, it: &T) -> u64; | |
} | |
impl<S: BuildHasher> BuildHasherExt for S { | |
fn hash<T: ?Sized + Hash>(&self, it: &T) -> u64 { | |
let mut hasher = self.build_hasher(); | |
it.hash(&mut hasher); | |
hasher.finish() | |
} | |
} | |
#[derive(Debug, Copy, Clone)] | |
struct Span(Idx, Idx); | |
impl Index<Span> for str { | |
type Output = str; | |
fn index(&self, span: Span) -> &Self::Output { | |
&self[span.0 as usize..span.1 as usize] | |
} | |
} | |
impl Index<Span> for String { | |
type Output = str; | |
fn index(&self, span: Span) -> &Self::Output { | |
&self[span.0 as usize..span.1 as usize] | |
} | |
} | |
/// An interned string. | |
/// | |
/// A `Symbol` is only as big as `u32` or `usize`, whichever is smaller. | |
/// For most platforms, that means `Symbol` is a `u32`. | |
/// | |
/// Additionally, `Option<Symbol>` and `Symbol` have the same size. | |
#[derive(Debug, Copy, Clone, Eq, PartialEq, Hash)] | |
#[repr(transparent)] | |
pub struct Symbol { | |
raw: NonZeroIdx, | |
} | |
impl Symbol { | |
/// Convert the symbol into a `usize` appropriate for indexing side tables. | |
/// | |
/// The indexes are continuous, starting at 0. | |
pub fn ix(self) -> usize { | |
self.raw.get() as usize - 1 | |
} | |
} | |
#[derive(Debug, Copy, Clone)] | |
struct Opaque<T>(T); | |
#[cfg(not(target_pointer_width = "16"))] | |
type Idx = u32; | |
#[cfg(not(target_pointer_width = "16"))] | |
type NonZeroIdx = std::num::NonZeroU32; | |
#[cfg(target_pointer_width = "16")] | |
type Idx = u16; | |
#[cfg(target_pointer_width = "16")] | |
type NonZeroIdx = std::num::NonZeroU16; | |
/// An [interner][interning] for strings. | |
/// | |
/// The interner allows you to cache strings and instead deal in a simple | |
/// symbol, which is trivial to compare and small to copy around. | |
/// | |
/// As opposed to other interners, this interner stores all of the interned | |
/// strings in a single concatenated string. This reduces allocation space | |
/// required for the interned strings, as well as fragmentation of the memory | |
/// held by the interner. | |
/// | |
/// [interning]: <https://en.wikipedia.org/wiki/String_interning> | |
#[derive(Debug, Clone, Default)] | |
pub struct Interner<S = DefaultHashBuilder> { | |
hasher: S, | |
string_to_symbol: HashMap<Opaque<Symbol>, (), ()>, // not HashSet for raw_entry API | |
symbol_to_span: Vec<Span>, | |
span_to_string: String, | |
} | |
/// Capacity specification for an [`Interner`]. | |
#[allow(missing_docs)] | |
#[derive(Debug, Default, Copy, Clone, Eq, PartialEq, Hash)] | |
pub struct Capacity { | |
pub symbols: usize, | |
pub bytes: usize, | |
} | |
impl<S: BuildHasher + Default> Interner<S> { | |
/// Creates a new empty interner. | |
/// | |
/// The default hashing algorithm is hashbrown's: [`AHash`], though this | |
/// is subject to change at any point in the future. The hash function is | |
/// very fast for all lengths of strings, but the algorithm is _not_ | |
/// guaranteed to protect against attacks such as HashDoS. | |
/// | |
/// [`AHash`]: <https://crates.io/crates/ahash> | |
pub fn new() -> Self { | |
Default::default() | |
} | |
/// Creates an empty interner with the specified capacity. | |
/// | |
/// The interner should be able to hold `capacity.symbols` distinct symbols | |
/// or `capacity.bytes` bytes worth of symbols without reallocating, | |
/// whichever is hit first. In practice, however, the internal hashmap of | |
/// symbols may potentially reallocate earlier, due to load factors. | |
pub fn with_capacity(capacity: Capacity) -> Self { | |
Self::with_hasher_and_capacity(Default::default(), capacity) | |
} | |
} | |
impl<S> Interner<S> { | |
/// Creates an empty interner which will use the given hash builder. | |
pub fn with_hasher(hash_builder: S) -> Self { | |
Interner::with_hasher_and_capacity(hash_builder, Default::default()) | |
} | |
/// Creates an empty interner with the specified capacity and hash builder. | |
/// | |
/// The interner should be able to hold `capacity.symbols` distinct symbols | |
/// or `capacity.bytes` bytes worth of symbols without reallocating, | |
/// whichever is hit first. In practice, however, the internal hashmap of | |
/// symbols may potentially reallocate earlier, due to load factors. | |
pub fn with_hasher_and_capacity(hash_builder: S, capacity: Capacity) -> Self { | |
Interner { | |
hasher: hash_builder, | |
string_to_symbol: HashMap::with_capacity_and_hasher(capacity.symbols, ()), | |
symbol_to_span: Vec::with_capacity(capacity.symbols), | |
span_to_string: String::with_capacity(capacity.bytes), | |
} | |
} | |
/// [`with_hasher_and_capacity`][Interner::with_hasher_and_capacity], | |
/// but with the standard argument order. | |
/// | |
/// Capacity second is better when using a struct literal to specify it. | |
pub fn with_capacity_and_hasher(capacity: Capacity, hash_builder: S) -> Self { | |
Interner::with_hasher_and_capacity(hash_builder, capacity) | |
} | |
/// The number of uniquely interned strings. | |
pub fn len(&self) -> usize { | |
self.symbol_to_span.len() | |
} | |
/// Returns true if the interner has no elements. | |
pub fn is_empty(&self) -> bool { | |
self.symbol_to_span.is_empty() | |
} | |
/// The number of interned bytes. | |
pub fn size(&self) -> usize { | |
self.span_to_string.len() | |
} | |
/// Returns the string associated with the given symbol. | |
pub fn resolve(&self, s: Symbol) -> Option<&str> { | |
let span = *self.symbol_to_span.get(s.ix())?; | |
Some(&self.span_to_string[span]) | |
} | |
} | |
impl<S> Index<Symbol> for Interner<S> { | |
type Output = str; | |
fn index(&self, s: Symbol) -> &str { | |
let span = self.symbol_to_span[s.ix()]; | |
&self.span_to_string[span] | |
} | |
} | |
fn insert_substring(string: &mut String, s: &str) -> Span { | |
// if let Some(index) = string.find(s) { | |
// Span(index as u32, (index + s.len()) as u32) | |
// } else { | |
let start = string.len() as u32; | |
let end = (string.len() + s.len()) | |
.try_into() | |
.unwrap_or_else(|_| panic!("Interner overflow")); | |
string.push_str(s); | |
Span(start, end) | |
// } | |
} | |
impl<S: BuildHasher> Interner<S> { | |
/// Interns the given value. | |
/// Returns a symbol for the string in this interner. | |
/// | |
/// If the string is already in the interner, this is copy-free. | |
/// If the string is not yet in the interner, it is copied in. | |
pub fn get_or_insert(&mut self, s: &str) -> Symbol { | |
let Interner { | |
hasher, | |
string_to_symbol, | |
symbol_to_span, | |
span_to_string, | |
} = self; | |
let hash = hasher.hash(s); | |
let entry = string_to_symbol | |
.raw_entry_mut() | |
.from_hash(hash, |&Opaque(symbol)| { | |
&span_to_string[symbol_to_span[symbol.ix()]] == s | |
}); | |
match entry { | |
RawEntryMut::Occupied(place) => { | |
let &Opaque(symbol) = place.key(); | |
symbol | |
} | |
RawEntryMut::Vacant(place) => { | |
let span = insert_substring(span_to_string, s); | |
symbol_to_span.push(span); | |
let symbol = Symbol { | |
raw: NonZeroIdx::new( | |
symbol_to_span | |
.len() | |
.try_into() | |
.unwrap_or_else(|_| panic!("Interner overflow")), | |
) | |
.unwrap(), | |
}; | |
let (&mut Opaque(symbol), &mut ()) = | |
place.insert_with_hasher(hash, Opaque(symbol), (), |&Opaque(symbol)| { | |
hasher.hash(&span_to_string[symbol_to_span[symbol.ix()]]) | |
}); | |
symbol | |
} | |
} | |
} | |
/// Get the interned symbol for this string, | |
/// but do not insert it if it is missing. | |
pub fn get(&self, s: &str) -> Option<Symbol> { | |
let Interner { | |
hasher, | |
string_to_symbol, | |
symbol_to_span, | |
span_to_string, | |
} = self; | |
let hash = hasher.hash(s); | |
let entry = string_to_symbol | |
.raw_entry() | |
.from_hash(hash, |&Opaque(symbol)| { | |
&span_to_string[symbol_to_span[symbol.ix()]] == s | |
}); | |
entry.map(|(&Opaque(symbol), &())| symbol) | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#![forbid(unsafe_code)] // 🎉 | |
use { | |
hashbrown::hash_map::{DefaultHashBuilder, HashMap, RawEntryMut}, | |
std::{ | |
convert::TryInto, | |
hash::{BuildHasher, Hash, Hasher}, | |
ops::Index, | |
}, | |
}; | |
trait BuildHasherExt { | |
fn hash<T: ?Sized + Hash>(&self, it: &T) -> u64; | |
} | |
impl<S: BuildHasher> BuildHasherExt for S { | |
fn hash<T: ?Sized + Hash>(&self, it: &T) -> u64 { | |
let mut hasher = self.build_hasher(); | |
it.hash(&mut hasher); | |
hasher.finish() | |
} | |
} | |
#[derive(Copy, Clone)] | |
struct Span(Idx, Idx); | |
impl Index<Span> for str { | |
type Output = str; | |
fn index(&self, span: Span) -> &Self::Output { | |
&self[span.0 as usize..span.1 as usize] | |
} | |
} | |
impl Index<Span> for String { | |
type Output = str; | |
fn index(&self, span: Span) -> &Self::Output { | |
&self[span.0 as usize..span.1 as usize] | |
} | |
} | |
#[derive(Copy, Clone)] | |
pub struct Symbol { | |
raw: Idx, | |
} | |
#[cfg(not(target_pointer_width = "16"))] | |
type Idx = u32; | |
#[cfg(target_pointer_width = "16")] | |
type Idx = u16; | |
pub struct Interner<S = DefaultHashBuilder> { | |
string_to_symbol: HashMap<Span, Symbol, S>, | |
symbol_to_span: Vec<Span>, | |
span_to_string: String, | |
} | |
impl<S: BuildHasher + Clone> Interner<S> { | |
pub fn get_or_insert(&mut self, s: &str) -> Symbol { | |
let Interner { | |
string_to_symbol, | |
symbol_to_span, | |
span_to_string, | |
} = self; | |
// We have to get an owned copy of the hasher, because otherwise the | |
// shared reference conflicts with the raw_entry_mut unique reference. | |
let hasher = string_to_symbol.hasher().clone(); | |
let hash = hasher.hash(s); | |
let entry = string_to_symbol | |
.raw_entry_mut() | |
.from_hash(hash, |&span| &span_to_string[span] == s); | |
match entry { | |
RawEntryMut::Occupied(place) => *place.get(), | |
RawEntryMut::Vacant(place) => { | |
let start = span_to_string.len() as Idx; | |
let end = (span_to_string.len() + s.len()) | |
.try_into() | |
.unwrap_or_else(|_| panic!("Interner overflow")); | |
let span = Span(start, end); | |
span_to_string.push_str(s); | |
let symbol = Symbol { | |
// This cast is lossless because `string_buffer.len()` will overflow Idx first | |
// Proof: there is 1 empty string and fewer than 255 1-byte strings. | |
// Thus with more than 256 strings, the buffer must be longer, QED. | |
raw: symbol_to_span.len() as Idx, | |
}; | |
symbol_to_span.push(span); | |
let (&mut _span, &mut symbol) = | |
place.insert_with_hasher(hash, span, symbol, |&span| { | |
hasher.hash(&span_to_string[span]) | |
}); | |
symbol | |
} | |
} | |
} | |
} | |
impl<S: BuildHasher> Interner<S> { | |
pub fn get(&self, s: &str) -> Option<Symbol> { | |
let Interner { | |
string_to_symbol, | |
symbol_to_span: _, | |
span_to_string, | |
} = self; | |
let hash = string_to_symbol.hasher().hash(s); | |
let entry = string_to_symbol.raw_entry().from_hash(hash, |span| { | |
&span_to_string[span.0 as usize..span.1 as usize] == s | |
}); | |
entry.map(|(&_span, &symbol)| symbol) | |
} | |
pub fn resolve(&self, s: Symbol) -> &str { | |
let span = self.symbol_to_span[s.raw as usize]; | |
&self.span_to_string[span] | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#![forbid(unsafe_code)] // 🎉 | |
use { | |
hashbrown::hash_map::{DefaultHashBuilder, HashMap, RawEntryMut}, | |
std::{ | |
convert::TryInto, | |
hash::{BuildHasher, Hash, Hasher}, | |
ops::Index, | |
}, | |
}; | |
trait BuildHasherExt { | |
fn hash<T: ?Sized + Hash>(&self, it: &T) -> u64; | |
} | |
impl<S: BuildHasher> BuildHasherExt for S { | |
fn hash<T: ?Sized + Hash>(&self, it: &T) -> u64 { | |
let mut hasher = self.build_hasher(); | |
it.hash(&mut hasher); | |
hasher.finish() | |
} | |
} | |
#[derive(Copy, Clone)] | |
struct Span(Idx, Idx); | |
impl Index<Span> for str { | |
type Output = str; | |
fn index(&self, span: Span) -> &Self::Output { | |
&self[span.0 as usize..span.1 as usize] | |
} | |
} | |
impl Index<Span> for String { | |
type Output = str; | |
fn index(&self, span: Span) -> &Self::Output { | |
&self[span.0 as usize..span.1 as usize] | |
} | |
} | |
#[derive(Copy, Clone)] | |
pub struct Symbol { | |
raw: Idx, | |
} | |
#[cfg(not(target_pointer_width = "16"))] | |
type Idx = u32; | |
#[cfg(target_pointer_width = "16")] | |
type Idx = u16; | |
pub struct Interner<S = DefaultHashBuilder> { | |
string_to_symbol: HashMap<Symbol, (), S>, | |
symbol_to_span: Vec<Span>, | |
span_to_string: String, | |
} | |
impl<S: BuildHasher + Clone> Interner<S> { | |
pub fn get_or_insert(&mut self, s: &str) -> Symbol { | |
let Interner { | |
string_to_symbol, | |
symbol_to_span, | |
span_to_string, | |
} = self; | |
// We have to get an owned copy of the hasher, because otherwise the | |
// shared reference conflicts with the raw_entry_mut unique reference. | |
let hasher = string_to_symbol.hasher().clone(); | |
let hash = hasher.hash(s); | |
let entry = string_to_symbol.raw_entry_mut().from_hash(hash, |&symbol| { | |
&span_to_string[symbol_to_span[symbol.raw as usize]] == s | |
}); | |
match entry { | |
RawEntryMut::Occupied(place) => *place.key(), | |
RawEntryMut::Vacant(place) => { | |
let start = span_to_string.len() as Idx; | |
let end = (span_to_string.len() + s.len()) | |
.try_into() | |
.unwrap_or_else(|_| panic!("Interner overflow")); | |
let span = Span(start, end); | |
span_to_string.push_str(s); | |
let symbol = Symbol { | |
// This cast is lossless because `string_buffer.len()` will overflow Idx first | |
// Proof: there is 1 empty string and fewer than 255 1-byte strings. | |
// Thus with more than 256 strings, the buffer must be longer, QED. | |
raw: symbol_to_span.len() as Idx, | |
}; | |
symbol_to_span.push(span); | |
let (&mut symbol, &mut ()) = | |
place.insert_with_hasher(hash, symbol, (), |&symbol| { | |
hasher.hash(&span_to_string[symbol_to_span[symbol.raw as usize]]) | |
}); | |
symbol | |
} | |
} | |
} | |
} | |
impl<S: BuildHasher> Interner<S> { | |
pub fn get(&self, s: &str) -> Option<Symbol> { | |
let Interner { | |
string_to_symbol, | |
symbol_to_span, | |
span_to_string, | |
} = self; | |
let hash = string_to_symbol.hasher().hash(s); | |
let entry = string_to_symbol.raw_entry().from_hash(hash, |&symbol| { | |
&span_to_string[symbol_to_span[symbol.raw as usize]] == s | |
}); | |
entry.map(|(&symbol, &())| symbol) | |
} | |
pub fn resolve(&self, s: Symbol) -> &str { | |
let span = self.symbol_to_span[s.raw as usize]; | |
&self.span_to_string[span] | |
} | |
} |
Whoops, the typo is fixed now. That's just to be nicer to 16 bit targets, anyway 😛
@CAD97 did you ever end up publishing this anywhere else? I'd love to use this.
@bbqsrc Not yet, but I've now polished it up and packaged it in a crate that I'm happy to publish if anyone helps write some tests/examples.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Typo: type Ix = u16;