Skip to content

Instantly share code, notes, and snippets.

@matklad
Created December 9, 2016 16:12
Show Gist options
  • Save matklad/3fa22b8a2550c5e2c338f05ecc7bdea4 to your computer and use it in GitHub Desktop.
Save matklad/3fa22b8a2550c5e2c338f05ecc7bdea4 to your computer and use it in GitHub Desktop.
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <ctime>
#include <forward_list>
#include <iostream>
#include <list>
#include <unordered_set>
#include <vector>
// Better: power of two and a proper hash function.
const size_t capacity = 1'000'007; // Yay C++14!!
const double load_factor = 0.9;
struct std_us {
std_us() : data_() { data_.reserve(capacity); }
void add(uint32_t x) { data_.insert(x); }
bool contains(uint32_t x) { return data_.find(x) != end(data_); }
private:
std::unordered_set<uint32_t> data_;
};
struct chaining {
chaining() : data_(capacity, std::forward_list<uint32_t>()) {}
void add(uint32_t x) {
size_t idx = hash(x);
// duplicates :(
data_[idx].push_front(x);
}
bool contains(uint32_t x) {
auto const &bucket = data_[hash(x)];
return std::find(begin(bucket), end(bucket), x) != end(bucket);
}
private:
// forward_list -> vector = 3x as much space.
// Can steal one element though...
std::vector<std::forward_list<uint32_t>> data_;
size_t hash(uint32_t x) { return x % capacity; }
};
struct chaining2h {
chaining2h() : data_(capacity, std::list<uint32_t>()) {}
void add(uint32_t x) {
auto &bucket1 = data_[hash1(x)];
auto &bucket2 = data_[hash1(x)];
(bucket1.size() < bucket2.size() ? bucket1 : bucket2).push_front(x);
}
bool contains(uint32_t x) {
return bucket_contains(data_[hash1(x)], x) ||
bucket_contains(data_[hash2(x)], x);
}
private:
bool bucket_contains(std::list<uint32_t> const &bucket, uint32_t x) {
return std::find(begin(bucket), end(bucket), x) != end(bucket);
}
std::vector<std::list<uint32_t>> data_;
size_t hash1(uint32_t x) { return x % capacity; }
size_t hash2(uint32_t x) { return (x * 15485863) % capacity; }
};
struct linear_probing {
linear_probing() : data_(capacity, EMPTY) {}
void add(uint32_t x) { data_[insertion_index(x)] = x; }
bool contains(uint32_t x) { return data_[insertion_index(x)] == x; }
private:
const uint32_t EMPTY = 0;
const uint32_t TOMBSTONE = 1;
std::vector<uint32_t> data_;
size_t insertion_index(uint32_t x) {
assert(x != EMPTY && x != TOMBSTONE); // :(
auto index = hash(x);
while (data_[index] != 0 && data_[index] != x) {
index = (index + 1) % capacity;
}
return index;
}
size_t hash(uint32_t x) { return x % capacity; }
};
struct robin_hood {
robin_hood() : data_(capacity, EMPTY) {}
void add(uint32_t x) {
assert(x != EMPTY && x != TOMBSTONE); // :(
auto index = hash(x);
size_t curr_probe = 0;
while (data_[index] != 0 && data_[index] != x) {
auto existing_probe = probe_dist(index);
if (existing_probe < curr_probe) {
std::swap(x, data_[index]);
curr_probe = existing_probe;
}
index = (index + 1) % capacity;
++curr_probe;
}
data_[index] = x;
}
bool contains(uint32_t x) {
assert(x != EMPTY && x != TOMBSTONE); // :(
auto index = hash(x);
size_t curr_probe = 0;
while (data_[index] != 0 && data_[index] != x) {
auto existing_probe = probe_dist(index);
if (existing_probe < curr_probe) {
return false; // better than plain linear probing
}
index = (index + 1) % capacity;
++curr_probe;
}
return data_[index] == x;
}
private:
const uint32_t EMPTY = 0;
const uint32_t TOMBSTONE = 1;
std::vector<uint32_t> data_;
size_t hash(uint32_t x) { return x % capacity; }
size_t probe_dist(size_t index) {
auto ideal_index = hash(data_[index]);
return (index + capacity - ideal_index) % capacity;
}
};
struct cuckoo {
const uint32_t EMPTY = 0;
void add(uint32_t x) {
for (;;) {
assert(x != EMPTY);
auto index = hash1(x);
std::swap(x, data1_[index]);
if (x == EMPTY)
return;
index = hash2(x);
std::swap(x, data2_[index]);
if (x == EMPTY)
return;
}
}
bool contains(uint32_t x) {
assert(x != EMPTY);
return data1_[hash1(x)] == x || data2_[hash2(x)] == x;
}
cuckoo() : data1_(capacity, EMPTY), data2_(capacity, EMPTY) {}
private:
std::vector<uint32_t> data1_;
std::vector<uint32_t> data2_;
size_t hash1(uint32_t x) { return x % capacity; }
size_t hash2(uint32_t x) { return (x * 15485863) % capacity; }
};
// use std::chrono
clock_t tick_begin;
void tick() { tick_begin = clock(); }
double tack() { return (clock() - tick_begin); }
std::vector<uint32_t> random_numbers() {
// use proper C++ random numbers!
std::vector<uint32_t> result;
for (size_t i = 0; i < capacity * load_factor; i++) {
result.push_back(rand());
}
return result;
}
template <typename T>
void bench(std::vector<uint32_t> const &to_add,
std::vector<uint32_t> const &to_lookup) {
srand(time(nullptr));
T map;
tick();
for (auto x : to_add) {
map.add(x);
}
std::cout << "insertion " << tack() << std::endl;
tick();
for (auto x : to_add) {
if (!map.contains(x)) {
std::cout << "oups :(" << std::endl;
}
}
std::cout << "successful lookup " << tack() << std::endl;
tick();
uint32_t num_failures = 0;
for (auto x : to_lookup) {
if (!map.contains(x)) {
num_failures += 1;
}
}
std::cout << "failed lookup " << tack() << std::endl;
std::cout << "failures " << num_failures << std::endl;
}
int main() {
auto to_add = random_numbers();
auto to_lookup = random_numbers();
std::cout << "std::unordered_set" << std::endl;
bench<std_us>(to_add, to_lookup);
std::cout << "\nchaining" << std::endl;
bench<chaining>(to_add, to_lookup);
std::cout << "\nchaining two hashes" << std::endl;
bench<chaining2h>(to_add, to_lookup);
std::cout << "\nlinear probing" << std::endl;
bench<linear_probing>(to_add, to_lookup);
std::cout << "\nrobin hood" << std::endl;
bench<robin_hood>(to_add, to_lookup);
// std::cout << "\ncuckoo" << std::endl;
// bench<cuckoo>(to_add, to_lookup);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment