-
-
Save anonymous/3b0b489137af9006d5c498f10d42514a to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
use std::collections::HashMap; | |
use std::hash::{Hasher, SipHasher}; | |
use std::io::{self, stdout, Write}; | |
use std::{mem, ptr, fs}; | |
use std::time::Instant; | |
const DICT: &'static [u8] = include_bytes!("<dictionary here>"); | |
/// A DJB2 hasher. | |
pub struct Djb2 { | |
state: u64, | |
} | |
impl Default for Djb2 { | |
fn default() -> Djb2 { | |
Djb2 { | |
state: 5381, | |
} | |
} | |
} | |
impl Hasher for Djb2 { | |
fn finish(&self) -> u64 { | |
self.state | |
} | |
fn write(&mut self, bytes: &[u8]) { | |
for &b in bytes { | |
// s * 35 + b = s' (mod 2^64) | |
self.state = (self.state << 5).wrapping_add(self.state).wrapping_add(b as u64); | |
} | |
} | |
} | |
/// A modified (for better performance) DJB2 hasher. | |
/// | |
/// It has slightly more collisions than DJB2, but has a much better performance, due to reading | |
/// from the buffer in 8-byte (u64) chunks. | |
pub struct LongHasher { | |
state: u64, | |
k0: u64, | |
k1: u64, | |
} | |
impl Default for LongHasher { | |
fn default() -> LongHasher { | |
LongHasher { | |
state: 0x1ED00D7E1E707EB2, | |
k0: 0x953779B185EBCA84, | |
k1: 0xABF72BEEA3A1BDE3, | |
} | |
} | |
} | |
impl Hasher for LongHasher { | |
fn finish(&self) -> u64 { | |
self.state ^ self.k1 | |
} | |
#[inline] | |
fn write_u64(&mut self, n: u64) { | |
self.state = self.state.wrapping_mul(0x1FFFFFFFFFFFFFFF).wrapping_add(n).wrapping_mul(0x9E3779B185EBCA87).rotate_right(33) ^ self.k0; | |
} | |
fn write(&mut self, bytes: &[u8]) { | |
for b in 0..bytes.len() / 8 { | |
self.write_u64(*unsafe { mem::transmute::<_, &u64>(&bytes[b * 8]) }); | |
} | |
let mut extra = 0xABF72BEEA3B1BDEF; | |
unsafe { | |
let copy = bytes.len() % 8; | |
let last = bytes.len().saturating_sub(copy); | |
ptr::copy_nonoverlapping( | |
bytes[last..].as_ptr(), | |
&mut extra as *mut u64 as *mut u8, | |
copy, | |
); | |
} | |
self.write_u64(extra); | |
} | |
} | |
#[derive(Default)] | |
struct SimpleSum { | |
state: u64 | |
} | |
impl Hasher for SimpleSum { | |
fn finish(&self) -> u64 { | |
self.state | |
} | |
fn write(&mut self, bytes: &[u8]) { | |
for b in 0..bytes.len() { | |
self.state = self.state.wrapping_add(bytes[b] as u64); | |
} | |
} | |
} | |
#[derive(Default)] | |
struct SimpleSumXor { | |
state: u64 | |
} | |
impl Hasher for SimpleSumXor { | |
fn finish(&self) -> u64 { | |
self.state | |
} | |
fn write(&mut self, bytes: &[u8]) { | |
for b in 0..bytes.len() { | |
self.state = 0xDEADFAAA ^ self.state.wrapping_add(bytes[b] as u64); | |
} | |
} | |
} | |
#[derive(Default)] | |
struct SlidingSum { | |
state: u64 | |
} | |
impl Hasher for SlidingSum { | |
fn finish(&self) -> u64 { | |
self.state | |
} | |
fn write(&mut self, bytes: &[u8]) { | |
for b in bytes { | |
self.state = (self.state << 6).wrapping_add(self.state << 16).wrapping_sub(self.state).wrapping_add(*b as u64); | |
} | |
} | |
} | |
#[derive(Default)] | |
struct Sdbm { | |
state: u32 | |
} | |
impl Hasher for Sdbm { | |
fn finish(&self) -> u64 { | |
self.state as u64 | |
} | |
fn write(&mut self, bytes: &[u8]) { | |
for b in bytes { | |
self.state = (*b as u32).wrapping_add(self.state << 6).wrapping_add(self.state << 16).wrapping_sub(self.state); | |
} | |
} | |
} | |
// TODO remove | |
/* | |
fn mb(s: u64, c: u64) -> u64 { | |
let words = DICT.split(|&x| x == b'\n'); | |
let mut buckets = HashMap::new(); | |
for i in words.take(2048 * 4) { | |
let mut hasher: LongHasher = LongHasher { | |
state: s, | |
c: c, | |
}; | |
hasher.write(i); | |
let hash = hasher.finish() % 2048; | |
if let Some(x) = buckets.get_mut(&hash) { | |
*x += 1; | |
continue; | |
} | |
buckets.insert(hash, 1); | |
} | |
let mut collisions = 0; | |
let mut max_bucket = 0; | |
for (_, &i) in buckets.iter() { | |
if i > 0 { | |
collisions += 1; | |
if max_bucket < i { | |
max_bucket = i; | |
} | |
} | |
} | |
max_bucket | |
} | |
*/ | |
fn number_of_collisions<'a, T: Hasher + Default, I: Iterator<Item = &'a [u8]>>(iter: I) -> (u64, u64) { | |
let mut buckets = HashMap::new(); | |
for i in iter { | |
let mut hasher: T = Default::default(); | |
hasher.write(i); | |
let hash = hasher.finish() % 2048; | |
if let Some(x) = buckets.get_mut(&(hash ^ 0xABABABAB)) { | |
*x += 1; | |
continue; | |
} | |
buckets.insert(hash ^ 0xABABABAB, 1); | |
} | |
let mut collisions = 0; | |
let mut max_bucket = 0; | |
for (_, &i) in buckets.iter() { | |
if i > 0 { | |
collisions += 1; | |
if max_bucket < i { | |
max_bucket = i; | |
} | |
} | |
} | |
(collisions, max_bucket) | |
} | |
fn bench_hasher<T: Hasher + Default>(name: &str) { | |
let time = Instant::now(); | |
let mut stdout = stdout(); | |
let words = DICT.split(|&x| x == b'\n'); | |
let (collisions_w, max_bucket_w) = number_of_collisions::<T, _>(words); | |
let collisions = collisions_w; //+ collisions_u; | |
let max_bucket = max_bucket_w; //+ max_bucket_u; | |
let ela = time.elapsed(); | |
stdout.write(b"~ ").unwrap(); | |
stdout.write(name.as_bytes()).unwrap(); | |
stdout.write(b"\n").unwrap(); | |
stdout.write(b" Filled buckets: ").unwrap(); | |
stdout.write(collisions.to_string().as_bytes()).unwrap(); | |
stdout.write(b"\n").unwrap(); | |
stdout.write(b" Max bucket: ").unwrap(); | |
stdout.write(max_bucket.to_string().as_bytes()).unwrap(); | |
stdout.write(b"\n").unwrap(); | |
stdout.write(b" Time: ").unwrap(); | |
stdout.write(ela.as_secs().to_string().as_bytes()).unwrap(); | |
stdout.write(b" s. ").unwrap(); | |
stdout.write((ela.subsec_nanos() / 1000).to_string().as_bytes()).unwrap(); | |
stdout.write(" ms.\n".as_bytes()).unwrap(); | |
let mut hasher = T::default(); | |
let time = Instant::now(); | |
for i in 0..1000000000 / 10 { | |
hasher.write_u64(i); | |
} | |
let mut null = fs::File::open("/dev/null").unwrap(); | |
null.write(&[hasher.finish() as u8]); | |
let ela = time.elapsed(); | |
let count = 10.0 / 8.0 / (ela.subsec_nanos() as f64 / 1_000_000_000.0 + ela.as_secs() as f64); | |
stdout.write(b" GB/s: ").unwrap(); | |
stdout.write(count.to_string().as_bytes()).unwrap(); | |
stdout.write(b"\n").unwrap(); | |
} | |
fn main() { | |
bench_hasher::<Djb2>("DJB2"); | |
bench_hasher::<SipHasher>("SipHasher"); | |
bench_hasher::<LongHasher>("LongHasher"); | |
bench_hasher::<SimpleSum>("SimpleSum"); | |
bench_hasher::<SimpleSumXor>("SimpleSumXor"); | |
bench_hasher::<SlidingSum>("SlidingSum+"); | |
bench_hasher::<Sdbm>("Sdbm"); | |
/* | |
let mut stdout = stdout(); | |
let mut record = mb(37, 35); | |
for i in 500000..5000000 { | |
for j in 10..50 { | |
let mbi = mb(i, j); | |
if mbi < record { | |
record = mbi; | |
stdout.write(b"i: "); | |
stdout.write(i.to_string().as_bytes()); | |
stdout.write(b" j: "); | |
stdout.write(j.to_string().as_bytes()); | |
stdout.write(b" res: "); | |
stdout.write(mbi.to_string().as_bytes()); | |
stdout.write(b"\n"); | |
} | |
} | |
} | |
*/ | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment