Navigation Menu

Skip to content

Instantly share code, notes, and snippets.

@CAD97
Last active September 11, 2020 19:51
Show Gist options
  • Star 4 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save CAD97/036c700fad1b4b159421eca089783122 to your computer and use it in GitHub Desktop.
Save CAD97/036c700fad1b4b159421eca089783122 to your computer and use it in GitHub Desktop.
raw_entry string interner
#![forbid(unsafe_code)] // 🎉
use {
hashbrown::hash_map::{DefaultHashBuilder, HashMap, RawEntryMut},
std::{
convert::TryInto,
hash::{BuildHasher, Hash, Hasher},
ops::Index,
},
};
trait BuildHasherExt {
fn hash<T: ?Sized + Hash>(&self, it: &T) -> u64;
}
impl<S: BuildHasher> BuildHasherExt for S {
fn hash<T: ?Sized + Hash>(&self, it: &T) -> u64 {
let mut hasher = self.build_hasher();
it.hash(&mut hasher);
hasher.finish()
}
}
#[derive(Debug, Copy, Clone)]
struct Span(Idx, Idx);
impl Index<Span> for str {
type Output = str;
fn index(&self, span: Span) -> &Self::Output {
&self[span.0 as usize..span.1 as usize]
}
}
impl Index<Span> for String {
type Output = str;
fn index(&self, span: Span) -> &Self::Output {
&self[span.0 as usize..span.1 as usize]
}
}
/// An interned string.
///
/// A `Symbol` is only as big as `u32` or `usize`, whichever is smaller.
/// For most platforms, that means `Symbol` is a `u32`.
///
/// Additionally, `Option<Symbol>` and `Symbol` have the same size.
#[derive(Debug, Copy, Clone, Eq, PartialEq, Hash)]
#[repr(transparent)]
pub struct Symbol {
raw: NonZeroIdx,
}
impl Symbol {
/// Convert the symbol into a `usize` appropriate for indexing side tables.
///
/// The indexes are continuous, starting at 0.
pub fn ix(self) -> usize {
self.raw.get() as usize - 1
}
}
#[derive(Debug, Copy, Clone)]
struct Opaque<T>(T);
#[cfg(not(target_pointer_width = "16"))]
type Idx = u32;
#[cfg(not(target_pointer_width = "16"))]
type NonZeroIdx = std::num::NonZeroU32;
#[cfg(target_pointer_width = "16")]
type Idx = u16;
#[cfg(target_pointer_width = "16")]
type NonZeroIdx = std::num::NonZeroU16;
/// An [interner][interning] for strings.
///
/// The interner allows you to cache strings and instead deal in a simple
/// symbol, which is trivial to compare and small to copy around.
///
/// As opposed to other interners, this interner stores all of the interned
/// strings in a single concatenated string. This reduces allocation space
/// required for the interned strings, as well as fragmentation of the memory
/// held by the interner.
///
/// [interning]: <https://en.wikipedia.org/wiki/String_interning>
#[derive(Debug, Clone, Default)]
pub struct Interner<S = DefaultHashBuilder> {
hasher: S,
string_to_symbol: HashMap<Opaque<Symbol>, (), ()>, // not HashSet for raw_entry API
symbol_to_span: Vec<Span>,
span_to_string: String,
}
/// Capacity specification for an [`Interner`].
#[allow(missing_docs)]
#[derive(Debug, Default, Copy, Clone, Eq, PartialEq, Hash)]
pub struct Capacity {
pub symbols: usize,
pub bytes: usize,
}
impl<S: BuildHasher + Default> Interner<S> {
/// Creates a new empty interner.
///
/// The default hashing algorithm is hashbrown's: [`AHash`], though this
/// is subject to change at any point in the future. The hash function is
/// very fast for all lengths of strings, but the algorithm is _not_
/// guaranteed to protect against attacks such as HashDoS.
///
/// [`AHash`]: <https://crates.io/crates/ahash>
pub fn new() -> Self {
Default::default()
}
/// Creates an empty interner with the specified capacity.
///
/// The interner should be able to hold `capacity.symbols` distinct symbols
/// or `capacity.bytes` bytes worth of symbols without reallocating,
/// whichever is hit first. In practice, however, the internal hashmap of
/// symbols may potentially reallocate earlier, due to load factors.
pub fn with_capacity(capacity: Capacity) -> Self {
Self::with_hasher_and_capacity(Default::default(), capacity)
}
}
impl<S> Interner<S> {
/// Creates an empty interner which will use the given hash builder.
pub fn with_hasher(hash_builder: S) -> Self {
Interner::with_hasher_and_capacity(hash_builder, Default::default())
}
/// Creates an empty interner with the specified capacity and hash builder.
///
/// The interner should be able to hold `capacity.symbols` distinct symbols
/// or `capacity.bytes` bytes worth of symbols without reallocating,
/// whichever is hit first. In practice, however, the internal hashmap of
/// symbols may potentially reallocate earlier, due to load factors.
pub fn with_hasher_and_capacity(hash_builder: S, capacity: Capacity) -> Self {
Interner {
hasher: hash_builder,
string_to_symbol: HashMap::with_capacity_and_hasher(capacity.symbols, ()),
symbol_to_span: Vec::with_capacity(capacity.symbols),
span_to_string: String::with_capacity(capacity.bytes),
}
}
/// [`with_hasher_and_capacity`][Interner::with_hasher_and_capacity],
/// but with the standard argument order.
///
/// Capacity second is better when using a struct literal to specify it.
pub fn with_capacity_and_hasher(capacity: Capacity, hash_builder: S) -> Self {
Interner::with_hasher_and_capacity(hash_builder, capacity)
}
/// The number of uniquely interned strings.
pub fn len(&self) -> usize {
self.symbol_to_span.len()
}
/// Returns true if the interner has no elements.
pub fn is_empty(&self) -> bool {
self.symbol_to_span.is_empty()
}
/// The number of interned bytes.
pub fn size(&self) -> usize {
self.span_to_string.len()
}
/// Returns the string associated with the given symbol.
pub fn resolve(&self, s: Symbol) -> Option<&str> {
let span = *self.symbol_to_span.get(s.ix())?;
Some(&self.span_to_string[span])
}
}
impl<S> Index<Symbol> for Interner<S> {
type Output = str;
fn index(&self, s: Symbol) -> &str {
let span = self.symbol_to_span[s.ix()];
&self.span_to_string[span]
}
}
fn insert_substring(string: &mut String, s: &str) -> Span {
// if let Some(index) = string.find(s) {
// Span(index as u32, (index + s.len()) as u32)
// } else {
let start = string.len() as u32;
let end = (string.len() + s.len())
.try_into()
.unwrap_or_else(|_| panic!("Interner overflow"));
string.push_str(s);
Span(start, end)
// }
}
impl<S: BuildHasher> Interner<S> {
/// Interns the given value.
/// Returns a symbol for the string in this interner.
///
/// If the string is already in the interner, this is copy-free.
/// If the string is not yet in the interner, it is copied in.
pub fn get_or_insert(&mut self, s: &str) -> Symbol {
let Interner {
hasher,
string_to_symbol,
symbol_to_span,
span_to_string,
} = self;
let hash = hasher.hash(s);
let entry = string_to_symbol
.raw_entry_mut()
.from_hash(hash, |&Opaque(symbol)| {
&span_to_string[symbol_to_span[symbol.ix()]] == s
});
match entry {
RawEntryMut::Occupied(place) => {
let &Opaque(symbol) = place.key();
symbol
}
RawEntryMut::Vacant(place) => {
let span = insert_substring(span_to_string, s);
symbol_to_span.push(span);
let symbol = Symbol {
raw: NonZeroIdx::new(
symbol_to_span
.len()
.try_into()
.unwrap_or_else(|_| panic!("Interner overflow")),
)
.unwrap(),
};
let (&mut Opaque(symbol), &mut ()) =
place.insert_with_hasher(hash, Opaque(symbol), (), |&Opaque(symbol)| {
hasher.hash(&span_to_string[symbol_to_span[symbol.ix()]])
});
symbol
}
}
}
/// Get the interned symbol for this string,
/// but do not insert it if it is missing.
pub fn get(&self, s: &str) -> Option<Symbol> {
let Interner {
hasher,
string_to_symbol,
symbol_to_span,
span_to_string,
} = self;
let hash = hasher.hash(s);
let entry = string_to_symbol
.raw_entry()
.from_hash(hash, |&Opaque(symbol)| {
&span_to_string[symbol_to_span[symbol.ix()]] == s
});
entry.map(|(&Opaque(symbol), &())| symbol)
}
}
#![forbid(unsafe_code)] // 🎉
use {
hashbrown::hash_map::{DefaultHashBuilder, HashMap, RawEntryMut},
std::{
convert::TryInto,
hash::{BuildHasher, Hash, Hasher},
ops::Index,
},
};
trait BuildHasherExt {
fn hash<T: ?Sized + Hash>(&self, it: &T) -> u64;
}
impl<S: BuildHasher> BuildHasherExt for S {
fn hash<T: ?Sized + Hash>(&self, it: &T) -> u64 {
let mut hasher = self.build_hasher();
it.hash(&mut hasher);
hasher.finish()
}
}
#[derive(Copy, Clone)]
struct Span(Idx, Idx);
impl Index<Span> for str {
type Output = str;
fn index(&self, span: Span) -> &Self::Output {
&self[span.0 as usize..span.1 as usize]
}
}
impl Index<Span> for String {
type Output = str;
fn index(&self, span: Span) -> &Self::Output {
&self[span.0 as usize..span.1 as usize]
}
}
#[derive(Copy, Clone)]
pub struct Symbol {
raw: Idx,
}
#[cfg(not(target_pointer_width = "16"))]
type Idx = u32;
#[cfg(target_pointer_width = "16")]
type Idx = u16;
pub struct Interner<S = DefaultHashBuilder> {
string_to_symbol: HashMap<Span, Symbol, S>,
symbol_to_span: Vec<Span>,
span_to_string: String,
}
impl<S: BuildHasher + Clone> Interner<S> {
pub fn get_or_insert(&mut self, s: &str) -> Symbol {
let Interner {
string_to_symbol,
symbol_to_span,
span_to_string,
} = self;
// We have to get an owned copy of the hasher, because otherwise the
// shared reference conflicts with the raw_entry_mut unique reference.
let hasher = string_to_symbol.hasher().clone();
let hash = hasher.hash(s);
let entry = string_to_symbol
.raw_entry_mut()
.from_hash(hash, |&span| &span_to_string[span] == s);
match entry {
RawEntryMut::Occupied(place) => *place.get(),
RawEntryMut::Vacant(place) => {
let start = span_to_string.len() as Idx;
let end = (span_to_string.len() + s.len())
.try_into()
.unwrap_or_else(|_| panic!("Interner overflow"));
let span = Span(start, end);
span_to_string.push_str(s);
let symbol = Symbol {
// This cast is lossless because `string_buffer.len()` will overflow Idx first
// Proof: there is 1 empty string and fewer than 255 1-byte strings.
// Thus with more than 256 strings, the buffer must be longer, QED.
raw: symbol_to_span.len() as Idx,
};
symbol_to_span.push(span);
let (&mut _span, &mut symbol) =
place.insert_with_hasher(hash, span, symbol, |&span| {
hasher.hash(&span_to_string[span])
});
symbol
}
}
}
}
impl<S: BuildHasher> Interner<S> {
pub fn get(&self, s: &str) -> Option<Symbol> {
let Interner {
string_to_symbol,
symbol_to_span: _,
span_to_string,
} = self;
let hash = string_to_symbol.hasher().hash(s);
let entry = string_to_symbol.raw_entry().from_hash(hash, |span| {
&span_to_string[span.0 as usize..span.1 as usize] == s
});
entry.map(|(&_span, &symbol)| symbol)
}
pub fn resolve(&self, s: Symbol) -> &str {
let span = self.symbol_to_span[s.raw as usize];
&self.span_to_string[span]
}
}
#![forbid(unsafe_code)] // 🎉
use {
hashbrown::hash_map::{DefaultHashBuilder, HashMap, RawEntryMut},
std::{
convert::TryInto,
hash::{BuildHasher, Hash, Hasher},
ops::Index,
},
};
trait BuildHasherExt {
fn hash<T: ?Sized + Hash>(&self, it: &T) -> u64;
}
impl<S: BuildHasher> BuildHasherExt for S {
fn hash<T: ?Sized + Hash>(&self, it: &T) -> u64 {
let mut hasher = self.build_hasher();
it.hash(&mut hasher);
hasher.finish()
}
}
#[derive(Copy, Clone)]
struct Span(Idx, Idx);
impl Index<Span> for str {
type Output = str;
fn index(&self, span: Span) -> &Self::Output {
&self[span.0 as usize..span.1 as usize]
}
}
impl Index<Span> for String {
type Output = str;
fn index(&self, span: Span) -> &Self::Output {
&self[span.0 as usize..span.1 as usize]
}
}
#[derive(Copy, Clone)]
pub struct Symbol {
raw: Idx,
}
#[cfg(not(target_pointer_width = "16"))]
type Idx = u32;
#[cfg(target_pointer_width = "16")]
type Idx = u16;
pub struct Interner<S = DefaultHashBuilder> {
string_to_symbol: HashMap<Symbol, (), S>,
symbol_to_span: Vec<Span>,
span_to_string: String,
}
impl<S: BuildHasher + Clone> Interner<S> {
pub fn get_or_insert(&mut self, s: &str) -> Symbol {
let Interner {
string_to_symbol,
symbol_to_span,
span_to_string,
} = self;
// We have to get an owned copy of the hasher, because otherwise the
// shared reference conflicts with the raw_entry_mut unique reference.
let hasher = string_to_symbol.hasher().clone();
let hash = hasher.hash(s);
let entry = string_to_symbol.raw_entry_mut().from_hash(hash, |&symbol| {
&span_to_string[symbol_to_span[symbol.raw as usize]] == s
});
match entry {
RawEntryMut::Occupied(place) => *place.key(),
RawEntryMut::Vacant(place) => {
let start = span_to_string.len() as Idx;
let end = (span_to_string.len() + s.len())
.try_into()
.unwrap_or_else(|_| panic!("Interner overflow"));
let span = Span(start, end);
span_to_string.push_str(s);
let symbol = Symbol {
// This cast is lossless because `string_buffer.len()` will overflow Idx first
// Proof: there is 1 empty string and fewer than 255 1-byte strings.
// Thus with more than 256 strings, the buffer must be longer, QED.
raw: symbol_to_span.len() as Idx,
};
symbol_to_span.push(span);
let (&mut symbol, &mut ()) =
place.insert_with_hasher(hash, symbol, (), |&symbol| {
hasher.hash(&span_to_string[symbol_to_span[symbol.raw as usize]])
});
symbol
}
}
}
}
impl<S: BuildHasher> Interner<S> {
pub fn get(&self, s: &str) -> Option<Symbol> {
let Interner {
string_to_symbol,
symbol_to_span,
span_to_string,
} = self;
let hash = string_to_symbol.hasher().hash(s);
let entry = string_to_symbol.raw_entry().from_hash(hash, |&symbol| {
&span_to_string[symbol_to_span[symbol.raw as usize]] == s
});
entry.map(|(&symbol, &())| symbol)
}
pub fn resolve(&self, s: Symbol) -> &str {
let span = self.symbol_to_span[s.raw as usize];
&self.span_to_string[span]
}
}
@matklad
Copy link

matklad commented Mar 22, 2020

Typo: type Ix = u16;

@CAD97
Copy link
Author

CAD97 commented Mar 22, 2020

Whoops, the typo is fixed now. That's just to be nicer to 16 bit targets, anyway 😛

@bbqsrc
Copy link

bbqsrc commented Jun 2, 2020

@CAD97 did you ever end up publishing this anywhere else? I'd love to use this.

@CAD97
Copy link
Author

CAD97 commented Jul 2, 2020

@bbqsrc Not yet, but I've now polished it up and packaged it in a crate that I'm happy to publish if anyone helps write some tests/examples.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment