Skip to content

Instantly share code, notes, and snippets.

@monoid
Last active August 20, 2020 10:41
Show Gist options
  • Save monoid/df4e7fd658613ec5699a4b7c4b6e5abf to your computer and use it in GitHub Desktop.
Save monoid/df4e7fd658613ec5699a4b7c4b6e5abf to your computer and use it in GitHub Desktop.
Concept of interned string pool in Rust
[package]
name = "tests"
version = "0.1.0"
authors = ["Ivan Boldyrev <lispnik@gmail.com>"]
edition = "2018"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
derivative = "1
/** Translation of Rust pool into C++ */
#include <vector>
#include <memory>
template<class T>
class DumbSet {
private:
std::vector<std::weak_ptr<T>> bins;
public:
DumbSet() : bins{} {
}
std::shared_ptr<T> intern(const T& val) {
for (const auto& weak: bins) {
std::shared_ptr<T> strong = weak.lock();
// TODO: also check hash before the `==`
if (strong && *strong == val) {
return strong;
}
}
// Not found: insert new element.
std::shared_ptr<T> res = std::make_shared<T>(val);
// TODO: insert at free place, if any
bins.push_back(std::weak_ptr<T>{res});
return res;
}
void implant(const std::shared_ptr<T>& val) {
for (const auto& weak: bins) {
std::shared_ptr<T> strong = weak.lock();
// TODO: also check hash before the `==`
if (strong && *strong == *val) {
return;
}
}
// TODO: insert at free place, if any
bins.push_back(std::weak_ptr<T>{val});
}
};
#include <string>
#include <iostream>
int main() {
DumbSet<std::string> set;
auto v1 = set.intern("test");
auto v2 = set.intern("test");
auto v3 = set.intern("test2");
std::cout << (v1.get() == v2.get()) << std::endl
<< (v1.get() == v3.get()) << std::endl;
return 0;
}
/// Conceptual implementation of interned weak string pool.
///
/// The pool interns &str, and returns Rc<PooledStr>>, and PooledStr derefs to &str.
///
/// This simple implementation uses very dumb "hashset" that just uses
/// vector.
// depends on derivative = "1"
use derivative::Derivative;
use std::rc::{Rc, Weak};
use std::boxed::Box;
use std::marker::PhantomData;
use std::ops::Deref;
use std::hash::{BuildHasher, Hash, Hasher};
use std::collections::hash_map::RandomState;
// I wish it was in std
type HashValue = u64;
#[derive(Clone)]
#[derive(Debug)]
#[derive(Derivative)]
#[derivative(PartialEq)]
pub struct PooledStr<BH: BuildHasher> {
hash: HashValue,
value: String, // just use ready container
#[derivative(PartialEq="ignore")]
_build_hasher: PhantomData<BH>,
}
impl<BH: BuildHasher> PartialEq<str> for PooledStr<BH> {
fn eq(&self, other: &str) -> bool {
self.value == other
}
}
impl<BH: BuildHasher> Deref for PooledStr<BH> {
type Target = str;
fn deref(&self) -> &Self::Target {
self.value.deref()
}
}
pub trait FromHashableSource<V: ?Sized, BH: BuildHasher> {
fn from_ref(v: &V, hasher: &mut BH::Hasher) -> Self;
fn from_hashed(hash: HashValue, v: &V) -> Self;
fn get_hash(&self) -> HashValue;
}
impl<BH: BuildHasher> FromHashableSource<str, BH> for PooledStr<BH> {
fn from_ref(val: &str, hasher: &mut BH::Hasher) -> PooledStr<BH> {
val.hash(hasher);
let hash = hasher.finish();
PooledStr {
hash: hash,
value: String::from(val),
_build_hasher: PhantomData,
}
}
fn from_hashed(hash: HashValue, val: &str) -> PooledStr<BH> {
PooledStr {
hash: hash,
value: String::from(val),
_build_hasher: PhantomData,
}
}
fn get_hash(&self) -> HashValue {
self.hash
}
}
// Dumb HashSet that has only linear probing
pub struct DumbSet<T, K: ?Sized, BH: BuildHasher> {
bins: Vec<Weak<T>>,
builder_hash: BH,
_phantom: PhantomData<K>,
}
impl<T, K: ?Sized, BH> DumbSet<T, K, BH>
where T: PartialEq<K> + PartialEq + Clone + FromHashableSource<K, BH>,
K: Hash,
BH: BuildHasher{
/// Create new hash; use same builder_hash if you want to
/// reuse interned values from one pool in another.
pub fn from_builder(builder_hash: BH) -> Self {
Self {
bins: Vec::new(),
builder_hash: builder_hash,
_phantom: PhantomData,
}
}
/// Intern key value (e.g. &str), returning new or old interned object
pub fn intern(&mut self, key: &K) -> Rc<T> {
let mut hasher = self.builder_hash.build_hasher();
key.hash(&mut hasher);
let hash = hasher.finish();
// Yep, it is very dumb.
for wc in self.bins.iter() {
if let Some(rc) = wc.upgrade() {
if rc.get_hash() == hash && rc.deref() == key {
return rc;
}
}
}
// Not found
let newval = Rc::new(T::from_hashed(hash, key));
self.bins.push(Rc::downgrade(&newval));
newval
}
/// Add another interned value, e.g. from another pool. Or even
/// reintern string from same pool.
pub fn implant(&mut self, val: &Rc<T>) {
for wc in self.bins.iter() {
if let Some(rc) = wc.upgrade() {
if rc.deref() == val.deref() {
return;
}
}
}
// Not found. Very dumb implementation: we do not even look
// for free week cell.
self.bins.push(Rc::downgrade(val));
}
// So far I have no idea how to do better without Box<dyn ...>
pub fn iter<'a>(&'a self) -> Box<dyn Iterator<Item=Rc<T>> + 'a> {
Box::new(self.bins.iter().filter_map(|wc| wc.upgrade()))
}
}
pub type Pool<BH> = DumbSet<PooledStr<BH>, str, BH>;
fn main() {
let random_builder = RandomState::new();
let mut pool = Pool::from_builder(random_builder);
let mut interns = Vec::new();
for w in String::from("Mary had a little lamb, a little lamb, a little lamb.").split_whitespace() {
interns.push(pool.intern(w));
}
for interned in pool.iter() {
println!("{:?}", interned.deref());
}
}
/*
Output:
Mary
had
a
little
lamb,
lamb.
*/
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment