Skip to content

Instantly share code, notes, and snippets.

@CAD97 CAD97/complete_lib.rs Secret
Last active May 9, 2020

Embed
What would you like to do?
raw_entry string interner
#![forbid(unsafe_code)] // 🎉
use {
hashbrown::hash_map::{DefaultHashBuilder, HashMap, RawEntryMut},
std::{
convert::TryInto,
hash::{BuildHasher, Hash, Hasher},
ops::Index,
},
};
trait BuildHasherExt {
fn hash<T: ?Sized + Hash>(&self, it: &T) -> u64;
}
impl<S: BuildHasher> BuildHasherExt for S {
fn hash<T: ?Sized + Hash>(&self, it: &T) -> u64 {
let mut hasher = self.build_hasher();
it.hash(&mut hasher);
hasher.finish()
}
}
#[derive(Debug, Copy, Clone)]
struct Span(Idx, Idx);
impl Index<Span> for str {
type Output = str;
fn index(&self, span: Span) -> &Self::Output {
&self[span.0 as usize..span.1 as usize]
}
}
impl Index<Span> for String {
type Output = str;
fn index(&self, span: Span) -> &Self::Output {
&self[span.0 as usize..span.1 as usize]
}
}
/// An interned string.
///
/// A `Symbol` is only as big as `u32` or `usize`, whichever is smaller.
/// For most platforms, that means `Symbol` is a `u32`.
///
/// Additionally, `Option<Symbol>` and `Symbol` have the same size.
#[derive(Debug, Copy, Clone, Eq, PartialEq, Hash)]
#[repr(transparent)]
pub struct Symbol {
raw: NonZeroIdx,
}
impl Symbol {
/// Convert the symbol into a `usize` appropriate for indexing side tables.
///
/// The indexes are continuous, starting at 0.
pub fn ix(self) -> usize {
self.raw.get() as usize - 1
}
}
#[derive(Debug, Copy, Clone)]
struct Opaque<T>(T);
#[cfg(not(target_pointer_width = "16"))]
type Idx = u32;
#[cfg(not(target_pointer_width = "16"))]
type NonZeroIdx = std::num::NonZeroU32;
#[cfg(target_pointer_width = "16")]
type Idx = u16;
#[cfg(target_pointer_width = "16")]
type NonZeroIdx = std::num::NonZeroU16;
/// An [interner][interning] for strings.
///
/// The interner allows you to cache strings and instead deal in a simple
/// symbol, which is trivial to compare and small to copy around.
///
/// As opposed to other interners, this interner stores all of the interned
/// strings in a single concatenated string. This reduces allocation space
/// required for the interned strings, as well as fragmentation of the memory
/// held by the interner.
///
/// [interning]: <https://en.wikipedia.org/wiki/String_interning>
#[derive(Debug, Clone, Default)]
pub struct Interner<S = DefaultHashBuilder> {
hasher: S,
string_to_symbol: HashMap<Opaque<Symbol>, (), ()>, // not HashSet for raw_entry API
symbol_to_span: Vec<Span>,
span_to_string: String,
}
/// Capacity specification for an [`Interner`].
#[allow(missing_docs)]
#[derive(Debug, Default, Copy, Clone, Eq, PartialEq, Hash)]
pub struct Capacity {
pub symbols: usize,
pub bytes: usize,
}
impl<S: BuildHasher + Default> Interner<S> {
/// Creates a new empty interner.
///
/// The default hashing algorithm is hashbrown's: [`AHash`], though this
/// is subject to change at any point in the future. The hash function is
/// very fast for all lengths of strings, but the algorithm is _not_
/// guaranteed to protect against attacks such as HashDoS.
///
/// [`AHash`]: <https://crates.io/crates/ahash>
pub fn new() -> Self {
Default::default()
}
/// Creates an empty interner with the specified capacity.
///
/// The interner should be able to hold `capacity.symbols` distinct symbols
/// or `capacity.bytes` bytes worth of symbols without reallocating,
/// whichever is hit first. In practice, however, the internal hashmap of
/// symbols may potentially reallocate earlier, due to load factors.
pub fn with_capacity(capacity: Capacity) -> Self {
Self::with_hasher_and_capacity(Default::default(), capacity)
}
}
impl<S> Interner<S> {
/// Creates an empty interner which will use the given hash builder.
pub fn with_hasher(hash_builder: S) -> Self {
Interner::with_hasher_and_capacity(hash_builder, Default::default())
}
/// Creates an empty interner with the specified capacity and hash builder.
///
/// The interner should be able to hold `capacity.symbols` distinct symbols
/// or `capacity.bytes` bytes worth of symbols without reallocating,
/// whichever is hit first. In practice, however, the internal hashmap of
/// symbols may potentially reallocate earlier, due to load factors.
pub fn with_hasher_and_capacity(hash_builder: S, capacity: Capacity) -> Self {
Interner {
hasher: hash_builder,
string_to_symbol: HashMap::with_capacity_and_hasher(capacity.symbols, ()),
symbol_to_span: Vec::with_capacity(capacity.symbols),
span_to_string: String::with_capacity(capacity.bytes),
}
}
/// [`with_hasher_and_capacity`][Interner::with_hasher_and_capacity],
/// but with the standard argument order.
///
/// Capacity second is better when using a struct literal to specify it.
pub fn with_capacity_and_hasher(capacity: Capacity, hash_builder: S) -> Self {
Interner::with_hasher_and_capacity(hash_builder, capacity)
}
/// The number of uniquely interned strings.
pub fn len(&self) -> usize {
self.symbol_to_span.len()
}
/// Returns true if the interner has no elements.
pub fn is_empty(&self) -> bool {
self.symbol_to_span.is_empty()
}
/// The number of interned bytes.
pub fn size(&self) -> usize {
self.span_to_string.len()
}
/// Returns the string associated with the given symbol.
pub fn resolve(&self, s: Symbol) -> Option<&str> {
let span = *self.symbol_to_span.get(s.ix())?;
Some(&self.span_to_string[span])
}
}
impl<S> Index<Symbol> for Interner<S> {
type Output = str;
fn index(&self, s: Symbol) -> &str {
let span = self.symbol_to_span[s.ix()];
&self.span_to_string[span]
}
}
fn insert_substring(string: &mut String, s: &str) -> Span {
// if let Some(index) = string.find(s) {
// Span(index as u32, (index + s.len()) as u32)
// } else {
let start = string.len() as u32;
let end = (string.len() + s.len())
.try_into()
.unwrap_or_else(|_| panic!("Interner overflow"));
string.push_str(s);
Span(start, end)
// }
}
impl<S: BuildHasher> Interner<S> {
/// Interns the given value.
/// Returns a symbol for the string in this interner.
///
/// If the string is already in the interner, this is copy-free.
/// If the string is not yet in the interner, it is copied in.
pub fn get_or_insert(&mut self, s: &str) -> Symbol {
let Interner {
hasher,
string_to_symbol,
symbol_to_span,
span_to_string,
} = self;
let hash = hasher.hash(s);
let entry = string_to_symbol
.raw_entry_mut()
.from_hash(hash, |&Opaque(symbol)| {
&span_to_string[symbol_to_span[symbol.ix()]] == s
});
match entry {
RawEntryMut::Occupied(place) => {
let &Opaque(symbol) = place.key();
symbol
}
RawEntryMut::Vacant(place) => {
let span = insert_substring(span_to_string, s);
symbol_to_span.push(span);
let symbol = Symbol {
raw: NonZeroIdx::new(
symbol_to_span
.len()
.try_into()
.unwrap_or_else(|_| panic!("Interner overflow")),
)
.unwrap(),
};
let (&mut Opaque(symbol), &mut ()) =
place.insert_with_hasher(hash, Opaque(symbol), (), |&Opaque(symbol)| {
hasher.hash(&span_to_string[symbol_to_span[symbol.ix()]])
});
symbol
}
}
}
/// Get the interned symbol for this string,
/// but do not insert it if it is missing.
pub fn get(&self, s: &str) -> Option<Symbol> {
let Interner {
hasher,
string_to_symbol,
symbol_to_span,
span_to_string,
} = self;
let hash = hasher.hash(s);
let entry = string_to_symbol
.raw_entry()
.from_hash(hash, |&Opaque(symbol)| {
&span_to_string[symbol_to_span[symbol.ix()]] == s
});
entry.map(|(&Opaque(symbol), &())| symbol)
}
}
#![forbid(unsafe_code)] // 🎉
use {
hashbrown::hash_map::{DefaultHashBuilder, HashMap, RawEntryMut},
std::{
convert::TryInto,
hash::{BuildHasher, Hash, Hasher},
ops::Index,
},
};
trait BuildHasherExt {
fn hash<T: ?Sized + Hash>(&self, it: &T) -> u64;
}
impl<S: BuildHasher> BuildHasherExt for S {
fn hash<T: ?Sized + Hash>(&self, it: &T) -> u64 {
let mut hasher = self.build_hasher();
it.hash(&mut hasher);
hasher.finish()
}
}
#[derive(Copy, Clone)]
struct Span(Idx, Idx);
impl Index<Span> for str {
type Output = str;
fn index(&self, span: Span) -> &Self::Output {
&self[span.0 as usize..span.1 as usize]
}
}
impl Index<Span> for String {
type Output = str;
fn index(&self, span: Span) -> &Self::Output {
&self[span.0 as usize..span.1 as usize]
}
}
#[derive(Copy, Clone)]
pub struct Symbol {
raw: Idx,
}
#[cfg(not(target_pointer_width = "16"))]
type Idx = u32;
#[cfg(target_pointer_width = "16")]
type Idx = u16;
pub struct Interner<S = DefaultHashBuilder> {
string_to_symbol: HashMap<Span, Symbol, S>,
symbol_to_span: Vec<Span>,
span_to_string: String,
}
impl<S: BuildHasher + Clone> Interner<S> {
pub fn get_or_insert(&mut self, s: &str) -> Symbol {
let Interner {
string_to_symbol,
symbol_to_span,
span_to_string,
} = self;
// We have to get an owned copy of the hasher, because otherwise the
// shared reference conflicts with the raw_entry_mut unique reference.
let hasher = string_to_symbol.hasher().clone();
let hash = hasher.hash(s);
let entry = string_to_symbol
.raw_entry_mut()
.from_hash(hash, |&span| &span_to_string[span] == s);
match entry {
RawEntryMut::Occupied(place) => *place.get(),
RawEntryMut::Vacant(place) => {
let start = span_to_string.len() as Idx;
let end = (span_to_string.len() + s.len())
.try_into()
.unwrap_or_else(|_| panic!("Interner overflow"));
let span = Span(start, end);
span_to_string.push_str(s);
let symbol = Symbol {
// This cast is lossless because `string_buffer.len()` will overflow Idx first
// Proof: there is 1 empty string and fewer than 255 1-byte strings.
// Thus with more than 256 strings, the buffer must be longer, QED.
raw: symbol_to_span.len() as Idx,
};
symbol_to_span.push(span);
let (&mut _span, &mut symbol) =
place.insert_with_hasher(hash, span, symbol, |&span| {
hasher.hash(&span_to_string[span])
});
symbol
}
}
}
}
impl<S: BuildHasher> Interner<S> {
pub fn get(&self, s: &str) -> Option<Symbol> {
let Interner {
string_to_symbol,
symbol_to_span: _,
span_to_string,
} = self;
let hash = string_to_symbol.hasher().hash(s);
let entry = string_to_symbol.raw_entry().from_hash(hash, |span| {
&span_to_string[span.0 as usize..span.1 as usize] == s
});
entry.map(|(&_span, &symbol)| symbol)
}
pub fn resolve(&self, s: Symbol) -> &str {
let span = self.symbol_to_span[s.raw as usize];
&self.span_to_string[span]
}
}
#![forbid(unsafe_code)] // 🎉
use {
hashbrown::hash_map::{DefaultHashBuilder, HashMap, RawEntryMut},
std::{
convert::TryInto,
hash::{BuildHasher, Hash, Hasher},
ops::Index,
},
};
trait BuildHasherExt {
fn hash<T: ?Sized + Hash>(&self, it: &T) -> u64;
}
impl<S: BuildHasher> BuildHasherExt for S {
fn hash<T: ?Sized + Hash>(&self, it: &T) -> u64 {
let mut hasher = self.build_hasher();
it.hash(&mut hasher);
hasher.finish()
}
}
#[derive(Copy, Clone)]
struct Span(Idx, Idx);
impl Index<Span> for str {
type Output = str;
fn index(&self, span: Span) -> &Self::Output {
&self[span.0 as usize..span.1 as usize]
}
}
impl Index<Span> for String {
type Output = str;
fn index(&self, span: Span) -> &Self::Output {
&self[span.0 as usize..span.1 as usize]
}
}
#[derive(Copy, Clone)]
pub struct Symbol {
raw: Idx,
}
#[cfg(not(target_pointer_width = "16"))]
type Idx = u32;
#[cfg(target_pointer_width = "16")]
type Idx = u16;
pub struct Interner<S = DefaultHashBuilder> {
string_to_symbol: HashMap<Symbol, (), S>,
symbol_to_span: Vec<Span>,
span_to_string: String,
}
impl<S: BuildHasher + Clone> Interner<S> {
pub fn get_or_insert(&mut self, s: &str) -> Symbol {
let Interner {
string_to_symbol,
symbol_to_span,
span_to_string,
} = self;
// We have to get an owned copy of the hasher, because otherwise the
// shared reference conflicts with the raw_entry_mut unique reference.
let hasher = string_to_symbol.hasher().clone();
let hash = hasher.hash(s);
let entry = string_to_symbol.raw_entry_mut().from_hash(hash, |&symbol| {
&span_to_string[symbol_to_span[symbol.raw as usize]] == s
});
match entry {
RawEntryMut::Occupied(place) => *place.key(),
RawEntryMut::Vacant(place) => {
let start = span_to_string.len() as Idx;
let end = (span_to_string.len() + s.len())
.try_into()
.unwrap_or_else(|_| panic!("Interner overflow"));
let span = Span(start, end);
span_to_string.push_str(s);
let symbol = Symbol {
// This cast is lossless because `string_buffer.len()` will overflow Idx first
// Proof: there is 1 empty string and fewer than 255 1-byte strings.
// Thus with more than 256 strings, the buffer must be longer, QED.
raw: symbol_to_span.len() as Idx,
};
symbol_to_span.push(span);
let (&mut symbol, &mut ()) =
place.insert_with_hasher(hash, symbol, (), |&symbol| {
hasher.hash(&span_to_string[symbol_to_span[symbol.raw as usize]])
});
symbol
}
}
}
}
impl<S: BuildHasher> Interner<S> {
pub fn get(&self, s: &str) -> Option<Symbol> {
let Interner {
string_to_symbol,
symbol_to_span,
span_to_string,
} = self;
let hash = string_to_symbol.hasher().hash(s);
let entry = string_to_symbol.raw_entry().from_hash(hash, |&symbol| {
&span_to_string[symbol_to_span[symbol.raw as usize]] == s
});
entry.map(|(&symbol, &())| symbol)
}
pub fn resolve(&self, s: Symbol) -> &str {
let span = self.symbol_to_span[s.raw as usize];
&self.span_to_string[span]
}
}
@matklad

This comment has been minimized.

Copy link

matklad commented Mar 22, 2020

Typo: type Ix = u16;

@CAD97

This comment has been minimized.

Copy link
Owner Author

CAD97 commented Mar 22, 2020

Whoops, the typo is fixed now. That's just to be nicer to 16 bit targets, anyway 😛

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.