Skip to content

Instantly share code, notes, and snippets.

@cgiosy
Last active January 17, 2024 20:06
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save cgiosy/dda34f4701482f1c6d07fb521d43325f to your computer and use it in GitHub Desktop.
Save cgiosy/dda34f4701482f1c6d07fb521d43325f to your computer and use it in GitHub Desktop.
#define LOAD_FACTOR 0.5
#include "robin-hood.hpp"
#define ERASE_TECHNIQUE USE_SHIFTING
#include "robin-hood.hpp"
#include "random.hpp"
#include "lib/bytell_hash_map.hpp"
#define LOOKUP_ZIPF true
#include <cmath>
#include <cstdint>
#include <cstddef>
#include <cassert>
#include <iomanip>
#include <iostream>
#include <fstream>
#include <iterator>
#include <type_traits>
#include <chrono>
#include <random>
#include <bit>
#include <array>
#include <vector>
#include <variant>
#include <algorithm>
#include <map>
uint64_t chainCnt = 0;
double zeta(double const z, uint64_t const N) {
double n = 0;
for (uint64_t i = 1; i <= N; i++) n += std::pow(i, -z);
return n;
}
#define now() std::chrono::steady_clock::now()
#define ms(a, b) std::chrono::duration<double>(b - a).count() * 1e12
template<class T>
void shuffle(std::vector<T>& v) {
cgiosy::shuffle(cgiosy::wyrand, v.begin(), v.end());
}
template<class Map>
void _bench(std::string TestName, size_t TC, size_t N, size_t M) {
constexpr double Z = 0.75;
// Initialize test data
std::vector<uint64_t> keys, lookups;
keys.reserve(N);
lookups.reserve(M);
auto const first = M / zeta(Z, N);
for (uint64_t i = 0; i < N; i++) {
#if LOOKUP_ZIPF
uint64_t cnt = first * pow(i + 1, -Z) + 0.5;
#else
uint64_t cnt = M / N;
#endif
auto key = cgiosy::wyrand();
keys.push_back(key);
while (cnt--) lookups.push_back(key);
}
auto erasedKeys = keys;
shuffle(keys);
shuffle(lookups);
shuffle(erasedKeys);
std::map<std::string, std::vector<double>> timeTable;
auto measureTime = [&](std::string name, double n, auto f) {
auto const timeStart = now();
f();
auto const elapsed = ms(timeStart, now());
timeTable[name].push_back(elapsed / n);
};
for (uint64_t testCount = 0; testCount < TC; testCount++) {
Map set(N);
#if 0
for (uint64_t i = 0; i < 30; i++) {
measureTime("insert", keys.size(), [&] {
for (auto const& key : keys)
set.emplace(key);
});
assert(set.size() == keys.size());
shuffle(erasedKeys);
measureTime("erase", erasedKeys.size(), [&] {
for (auto const& key : erasedKeys)
set.erase(key);
});
assert(set.size() == 0);
}
measureTime("insert", keys.size(), [&] {
for (auto const& key : keys)
set.emplace(key);
});
#else
measureTime("insert (initial)", keys.size(), [&] {
for (auto const& key : keys)
set.emplace(key);
});
for (uint64_t i = 0; i < 30; i++) {
assert(set.size() == keys.size());
measureTime("erase + insert", erasedKeys.size(), [&] {
for (auto const& key : erasedKeys) {
set.erase(key);
set.emplace(key);
}
});
shuffle(erasedKeys);
}
#endif
assert(set.size() == keys.size());
measureTime("contains (successful)", lookups.size(), [&] {
uint64_t res = 0;
for (auto const& key : lookups)
res += set.count(key);
assert(res == lookups.size());
});
measureTime("contains (unsuccessful)", lookups.size(), [&] {
uint64_t res = 0;
for (auto const& key : lookups)
res += set.count(key ^ 1);
assert(res == 0);
});
measureTime("erase", erasedKeys.size(), [&] {
for (auto const& key : erasedKeys)
set.erase(key);
});
measureTime("insert", erasedKeys.size(), [&] {
for (auto const& key : keys)
set.emplace(key);
});
}
std::cout << "## " << TestName << std::endl;
for (auto [name, timeList] : timeTable) {
std::sort(timeList.begin(), timeList.end());
double sum = 0;
for (auto const& time : timeList)
sum += time;
std::cout << "### " << name << '\n';
std::cout << "- avg: " << sum / timeList.size() << '\n';
std::cout << "- p50: " << timeList[int(timeList.size() * 0.5)] << '\n';
std::cout << "- p99: " << timeList[int(timeList.size() * 0.99)] << '\n';
std::cout << std::endl;
}
std::cout << std::endl;
}
#define bench(T, ...) _bench<T>(#T, __VA_ARGS__)
int main() {
std::ios::sync_with_stdio(0);
std::cin.tie(0);
std::cout << std::fixed << std::setprecision(3);
size_t L = 10, R = 10;
for (size_t n = L; n <= R; n += 5) {
uint64_t TEST_COUNT = 128 << (R - n);
size_t N = (1 << n) * (LOAD_FACTOR * 2 / 3) - 10;
size_t M = N * 5;
double P = 0.;
std::cout << "# " << n << std::endl;
for (size_t i = 0; i < 3; i++) {
bench(ska::bytell_hash_set<uint64_t>, TEST_COUNT, N, M);
bench(HashTable<uint64_t>, TEST_COUNT, N, M);
bench(HashTableShift<uint64_t>, TEST_COUNT, N, M);
}
}
}
#include <cstdint>
#include <memory>
#include <iterator>
#include <algorithm>
#include <random>
#include <array>
namespace cgiosy {
template<int RW = 1, int LOCAL = 2, class T>
inline void prefetch(T& value) noexcept {
__builtin_prefetch(std::addressof(value), RW, LOCAL);
}
template<class Rng>
inline uint64_t random(Rng& rng, uint64_t const n) {
auto m = __uint128_t(rng()) * n;
if (uint64_t(m) < n)
while (uint64_t(m) < -n % n)
m = __uint128_t(rng()) * n;
return m >> 64;
}
template<class Rng, class Iter>
void shuffle(Rng& rng, Iter const a, Iter const b) {
size_t constexpr SZ = 512;
auto const n = std::distance(a, b);
if (n < std::max(SZ * 4, (1 << 20) / sizeof(decltype(*a)))) {
for (auto i = n - 1; i > 0; i -= 1)
std::swap(a[i], a[random(rng, i + 1)]);
return;
}
std::array<uint64_t, SZ> indexes;
for (auto p = SZ - 1; ~p; p -= 1)
prefetch(indexes[p] = random(rng, n + p - (SZ - 1)));
auto p = SZ - 1;
for (auto i = n - 1; i > SZ - 1; i -= 1) {
std::swap(a[i], a[indexes[p]]);
prefetch(indexes[p] = random(rng, i + 1 - SZ));
p = int64_t(p - 1) < 0 ? SZ - 1 : p - 1;
}
for (auto i = SZ - 1; i > 0; i -= 1) {
std::swap(a[i], a[indexes[p]]);
p = int64_t(p - 1) < 0 ? SZ - 1 : p - 1;
}
}
static inline uint64_t wyseed = std::random_device{}();
inline uint64_t wyrand() {
wyseed += 0xA0761D6478BD642F;
auto v = wyseed * __uint128_t(wyseed ^ 0xE7037ED1A0B428DB);
return v ^ v >> 64;
}
};
#include <cmath>
#include <cstdint>
#include <iterator>
#include <variant>
#include <utility>
#include <algorithm>
#ifndef USE_SHIFTING
#define USE_SHIFTING 1
#endif
#ifndef USE_GRAVEYARD
#define USE_GRAVEYARD 2
#endif
#ifndef ERASE_TECHNIQUE
#define ERASE_TECHNIQUE USE_GRAVEYARD
#endif
#ifndef LOAD_FACTOR
#define LOAD_FACTOR 0.5
#endif
#if ERASE_TECHNIQUE == USE_GRAVEYARD
#define HASH_TABLE HashTable
#else
#define HASH_TABLE HashTableShift
#endif
template<class Key, class Value = std::monostate>
struct HASH_TABLE {
#define USE_NONE 0
#define USE_GOLDEN 1
#define USE_WYRAND 2
#define USE_SPLITMIX 3
#define HASH_TYPE USE_GOLDEN
static inline uint64_t hash(uint64_t x) {
#if HASH_TYPE == USE_GOLDEN
// https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/tree/include/linux/hash.h
x *= 0x61C8864680B583EB;
#elif HASH_TYPE == USE_WYRAND
// https://github.com/wangyi-fudan/wyhash/blob/e77036ac1943369dc03e611cde52a8570f8ceefe/wyhash.h#L149
auto y = x * __uint128_t(x ^ 0xE7037ED1A0B428DB);
x = y ^ y >> 64;
#elif HASH_TYPE == USE_SPLITMIX
// https://xoshiro.di.unimi.it/splitmix64.c
x += 0x9E3779B97F4A7C15;
x = (x ^ (x >> 30)) * 0xBF58476D1CE4E5B9;
x = (x ^ (x >> 27)) * 0x94D049BB133111EB;
return x ^ (x >> 31);
#endif
return x;
}
union KV {
struct {
Key key;
[[no_unique_address]] Value value;
};
struct {
Key first;
[[no_unique_address]] Value second;
};
};
struct iterator {
using iterator_category = std::bidirectional_iterator_tag;
using difference_type = std::ptrdiff_t;
using value_type = KV;
using pointer = value_type*;
using reference = value_type&;
uint8_t* const data;
size_t const mask;
size_t pos;
inline reference operator*() const { return ((KV*)data)[pos]; }
inline pointer operator->() const { return &((KV*)data)[pos]; }
inline uint8_t& meta() const {
return data[~pos];
}
inline uint8_t filled() const {
return uint8_t(meta() + 1) >= uint8_t(1 + 1);
}
inline uint8_t isHead() const {
return meta() <= 1;
}
inline iterator& operator++() {
pos = pos + 1 & mask;
return *this;
}
inline iterator& operator--() {
pos = pos - 1 & mask;
return *this;
}
inline bool operator!=(iterator const& b) const {
return pos != b.pos;
};
};
uint8_t* data;
size_t keyCount;
#if ERASE_TECHNIQUE == USE_GRAVEYARD
size_t graveyardCount;
#endif
uint8_t lg;
HASH_TABLE(size_t const reserved = 0) {
lg = reserved ? 64 - __builtin_clzll(size_t(reserved / LOAD_FACTOR)) : 0;
data = new uint8_t[size_t(sizeof(KV) + 1) << lg];
std::fill(data, data + capacity(), 0);
data += capacity();
keyCount = 0;
#if ERASE_TECHNIQUE == USE_GRAVEYARD
graveyardCount = capacity() >> 2;
#endif
}
~HASH_TABLE() {
deallocate();
}
void deallocate() {
data -= capacity();
operator delete[](data, size_t(sizeof(KV) + 1) << lg);
}
#if ERASE_TECHNIQUE == USE_GRAVEYARD
inline void graveyard() {
if (graveyardCount > 0) {
graveyardCount -= 1;
return;
}
for (size_t i = 0; i < capacity(); i += 1)
data[~i] = data[~i] == -1 ? 0 : data[~i];
auto const remain = capacity() - keyCount;
auto const tombPeriod = (capacity() * 2) / remain;
graveyardCount = remain >> 2;
for (size_t i = 0; i < capacity(); i += tombPeriod)
data[~i] = data[~i] == 0 ? -1 : data[~i];
}
#endif
inline size_t capacity() const {
return size_t(1) << lg;
}
inline size_t size() const {
return keyCount;
}
inline bool empty() const {
return size() == 0;
}
template<class Hash>
inline iterator iter(Hash const h) {
auto const pos = h >> (sizeof(Hash) * 8 - lg);
return {data, capacity() - 1, pos};
}
inline iterator find(Key const& key) {
auto it = iter(hash(key));
uint8_t dist = 1;
while (!(it->key == key)) [[unlikely]] {
if (it.meta() < dist) break;
++it;
++dist;
}
return it;
}
inline bool contains(Key const& key) {
return find(key)->key == key;
}
inline size_t count(Key const& key) {
return contains(key);
}
inline std::pair<iterator, bool> emplace(Key key, Value&& value = std::monostate{}) {
auto it = iter(hash(key));
uint8_t dist = 1;
while (true) {
if (!it.filled()) [[likely]] {
it->key = std::move(key);
it->value = std::move(value);
it.meta() = dist;
break;
}
if (it->key == key) return {it, false};
if (it.meta() < dist) {
std::swap(key, it->key);
std::swap(value, it->value);
std::swap(dist, it.meta());
}
++it;
++dist;
}
keyCount += 1;
#if ERASE_TECHNIQUE == USE_GRAVEYARD
graveyard();
#endif
return {it, true};
}
inline std::pair<iterator, bool> insert(KV&& kv) {
emplace(std::move(kv.key), std::move(kv.value));
}
inline void erase(iterator it) {
#if ERASE_TECHNIQUE == USE_GRAVEYARD
it->key = {};
it->value = {};
it.meta() = -1;
keyCount -= 1;
#if ERASE_TECHNIQUE == USE_GRAVEYARD
graveyard();
#endif
#else
auto nxt = it;
++nxt;
while (!nxt.isHead()) {
it->key = std::move(nxt->key);
it->value = std::move(nxt->value);
it.meta() = nxt.meta() - 1;
++it;
++nxt;
}
it->key = {};
it->value = {};
it.meta() = 0;
keyCount -= 1;
#endif
}
inline size_t erase(Key const& key) {
auto it = find(key);
if (!(it->key == key)) return 0;
erase(it);
return 1;
}
};
#undef ERASE_TECHNIQUE
#undef HASH_TABLE
@cgiosy
Copy link
Author

cgiosy commented Aug 1, 2022

## ska::bytell_hash_set<uint64_t>
### contains (successful)
- avg: 12822.412
- p50: 12817.821
- p99: 14496.162

### contains (unsuccessful)
- avg: 13526.027
- p50: 13687.104
- p99: 14144.927

### erase
- avg: 17202.227
- p50: 16119.875
- p99: 28493.487

### erase + insert
- avg: 47483.171
- p50: 47672.297
- p99: 69323.325

### insert
- avg: 31091.913
- p50: 31531.868
- p99: 39056.347

### insert (initial)
- avg: 31371.501
- p50: 31921.397
- p99: 36573.975


## HashTable<uint64_t>
### contains (successful)
- avg: 13332.788
- p50: 13402.067
- p99: 13848.005

### contains (unsuccessful)
- avg: 17588.752
- p50: 17452.184
- p99: 18730.525

### erase
- avg: 16910.118
- p50: 16743.762
- p99: 18496.986

### erase + insert
- avg: 23931.208
- p50: 23718.679
- p99: 32033.790

### insert
- avg: 13877.944
- p50: 13208.506
- p99: 17908.957

### insert (initial)
- avg: 15360.311
- p50: 15377.515
- p99: 16877.253


## HashTableShift<uint64_t>
### contains (successful)
- avg: 14718.095
- p50: 14215.752
- p99: 17123.794

### contains (unsuccessful)
- avg: 15986.431
- p50: 15618.595
- p99: 19044.100

### erase
- avg: 16522.757
- p50: 16280.025
- p99: 17246.916

### erase + insert
- avg: 28228.942
- p50: 27940.718
- p99: 32789.480

### insert
- avg: 12122.362
- p50: 12217.450
- p99: 12359.796

### insert (initial)
- avg: 12391.221
- p50: 12263.743
- p99: 13462.361

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment