Last active
August 20, 2020 10:41
-
-
Save monoid/df4e7fd658613ec5699a4b7c4b6e5abf to your computer and use it in GitHub Desktop.
Concept of interned string pool in Rust
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
[package] | |
name = "tests" | |
version = "0.1.0" | |
authors = ["Ivan Boldyrev <lispnik@gmail.com>"] | |
edition = "2018" | |
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html | |
[dependencies] | |
derivative = "1 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/** Translation of Rust pool into C++ */ | |
#include <vector> | |
#include <memory> | |
template<class T> | |
class DumbSet { | |
private: | |
std::vector<std::weak_ptr<T>> bins; | |
public: | |
DumbSet() : bins{} { | |
} | |
std::shared_ptr<T> intern(const T& val) { | |
for (const auto& weak: bins) { | |
std::shared_ptr<T> strong = weak.lock(); | |
// TODO: also check hash before the `==` | |
if (strong && *strong == val) { | |
return strong; | |
} | |
} | |
// Not found: insert new element. | |
std::shared_ptr<T> res = std::make_shared<T>(val); | |
// TODO: insert at free place, if any | |
bins.push_back(std::weak_ptr<T>{res}); | |
return res; | |
} | |
void implant(const std::shared_ptr<T>& val) { | |
for (const auto& weak: bins) { | |
std::shared_ptr<T> strong = weak.lock(); | |
// TODO: also check hash before the `==` | |
if (strong && *strong == *val) { | |
return; | |
} | |
} | |
// TODO: insert at free place, if any | |
bins.push_back(std::weak_ptr<T>{val}); | |
} | |
}; | |
#include <string> | |
#include <iostream> | |
int main() { | |
DumbSet<std::string> set; | |
auto v1 = set.intern("test"); | |
auto v2 = set.intern("test"); | |
auto v3 = set.intern("test2"); | |
std::cout << (v1.get() == v2.get()) << std::endl | |
<< (v1.get() == v3.get()) << std::endl; | |
return 0; | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/// Conceptual implementation of interned weak string pool. | |
/// | |
/// The pool interns &str, and returns Rc<PooledStr>>, and PooledStr derefs to &str. | |
/// | |
/// This simple implementation uses very dumb "hashset" that just uses | |
/// vector. | |
// depends on derivative = "1" | |
use derivative::Derivative; | |
use std::rc::{Rc, Weak}; | |
use std::boxed::Box; | |
use std::marker::PhantomData; | |
use std::ops::Deref; | |
use std::hash::{BuildHasher, Hash, Hasher}; | |
use std::collections::hash_map::RandomState; | |
// I wish it was in std | |
type HashValue = u64; | |
#[derive(Clone)] | |
#[derive(Debug)] | |
#[derive(Derivative)] | |
#[derivative(PartialEq)] | |
pub struct PooledStr<BH: BuildHasher> { | |
hash: HashValue, | |
value: String, // just use ready container | |
#[derivative(PartialEq="ignore")] | |
_build_hasher: PhantomData<BH>, | |
} | |
impl<BH: BuildHasher> PartialEq<str> for PooledStr<BH> { | |
fn eq(&self, other: &str) -> bool { | |
self.value == other | |
} | |
} | |
impl<BH: BuildHasher> Deref for PooledStr<BH> { | |
type Target = str; | |
fn deref(&self) -> &Self::Target { | |
self.value.deref() | |
} | |
} | |
pub trait FromHashableSource<V: ?Sized, BH: BuildHasher> { | |
fn from_ref(v: &V, hasher: &mut BH::Hasher) -> Self; | |
fn from_hashed(hash: HashValue, v: &V) -> Self; | |
fn get_hash(&self) -> HashValue; | |
} | |
impl<BH: BuildHasher> FromHashableSource<str, BH> for PooledStr<BH> { | |
fn from_ref(val: &str, hasher: &mut BH::Hasher) -> PooledStr<BH> { | |
val.hash(hasher); | |
let hash = hasher.finish(); | |
PooledStr { | |
hash: hash, | |
value: String::from(val), | |
_build_hasher: PhantomData, | |
} | |
} | |
fn from_hashed(hash: HashValue, val: &str) -> PooledStr<BH> { | |
PooledStr { | |
hash: hash, | |
value: String::from(val), | |
_build_hasher: PhantomData, | |
} | |
} | |
fn get_hash(&self) -> HashValue { | |
self.hash | |
} | |
} | |
// Dumb HashSet that has only linear probing | |
pub struct DumbSet<T, K: ?Sized, BH: BuildHasher> { | |
bins: Vec<Weak<T>>, | |
builder_hash: BH, | |
_phantom: PhantomData<K>, | |
} | |
impl<T, K: ?Sized, BH> DumbSet<T, K, BH> | |
where T: PartialEq<K> + PartialEq + Clone + FromHashableSource<K, BH>, | |
K: Hash, | |
BH: BuildHasher{ | |
/// Create new hash; use same builder_hash if you want to | |
/// reuse interned values from one pool in another. | |
pub fn from_builder(builder_hash: BH) -> Self { | |
Self { | |
bins: Vec::new(), | |
builder_hash: builder_hash, | |
_phantom: PhantomData, | |
} | |
} | |
/// Intern key value (e.g. &str), returning new or old interned object | |
pub fn intern(&mut self, key: &K) -> Rc<T> { | |
let mut hasher = self.builder_hash.build_hasher(); | |
key.hash(&mut hasher); | |
let hash = hasher.finish(); | |
// Yep, it is very dumb. | |
for wc in self.bins.iter() { | |
if let Some(rc) = wc.upgrade() { | |
if rc.get_hash() == hash && rc.deref() == key { | |
return rc; | |
} | |
} | |
} | |
// Not found | |
let newval = Rc::new(T::from_hashed(hash, key)); | |
self.bins.push(Rc::downgrade(&newval)); | |
newval | |
} | |
/// Add another interned value, e.g. from another pool. Or even | |
/// reintern string from same pool. | |
pub fn implant(&mut self, val: &Rc<T>) { | |
for wc in self.bins.iter() { | |
if let Some(rc) = wc.upgrade() { | |
if rc.deref() == val.deref() { | |
return; | |
} | |
} | |
} | |
// Not found. Very dumb implementation: we do not even look | |
// for free week cell. | |
self.bins.push(Rc::downgrade(val)); | |
} | |
// So far I have no idea how to do better without Box<dyn ...> | |
pub fn iter<'a>(&'a self) -> Box<dyn Iterator<Item=Rc<T>> + 'a> { | |
Box::new(self.bins.iter().filter_map(|wc| wc.upgrade())) | |
} | |
} | |
pub type Pool<BH> = DumbSet<PooledStr<BH>, str, BH>; | |
fn main() { | |
let random_builder = RandomState::new(); | |
let mut pool = Pool::from_builder(random_builder); | |
let mut interns = Vec::new(); | |
for w in String::from("Mary had a little lamb, a little lamb, a little lamb.").split_whitespace() { | |
interns.push(pool.intern(w)); | |
} | |
for interned in pool.iter() { | |
println!("{:?}", interned.deref()); | |
} | |
} | |
/* | |
Output: | |
Mary | |
had | |
a | |
little | |
lamb, | |
lamb. | |
*/ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment