Created
January 7, 2018 23:03
-
-
Save apmckinlay/fab919d5c2b49825533f5117bd17939a to your computer and use it in GitHub Desktop.
hash table implementation
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#pragma once | |
/* | |
* Hmap and Hset are hash based maps and sets | |
* implemented with: prime table sizes, open addressing, linear probing, | |
* robin hood hashing, and resize on probe limit. | |
* Htbl is the common code, it is not intended for external use. | |
* To avoid padding it uses an array of blocks each with BN (4) entries. | |
* But we still treat it as a single flat array. | |
* Within blocks keys, values, and distances are separate arrays to avoid padding. | |
* Uses uint16 for size so limited to 64k slots or about 48k elements. | |
* Keys and values are assumed to be small and so are passed by value. | |
* They are also moved by simple assignment | |
* so should not have complex constructors or assignment operators. | |
* NOTE: intended for use with garbage collection - does not run destructors. | |
* Hashing and key equality functions can be specified with a HfEq type | |
* or more "globally" by defining hashfn and keyeq for your type. | |
* Allows lookup by a different type from the key, | |
* as long as there are compatible hashfn(T) and keyeq(T, Key) | |
* See: https://probablydance.com/2017/02/26/i-wrote-the-fastest-hashtable/ | |
*/ | |
#include "except.h" | |
#include <utility> | |
#include <stdint.h> | |
using std::make_pair; | |
class Ostream; | |
struct HfEq | |
{ | |
template <typename Key> | |
static size_t hashfn(Key k) | |
{ return ::hashfn(k); } | |
template <typename X, typename Y> | |
static bool keyeq(X x, Y y) | |
{ return ::keyeq(x, y); } | |
}; | |
inline size_t hashfn(int k) | |
{ return k; } | |
inline bool keyeq(int x, int y) | |
{ return x == y; } | |
const int BN = 8; // number of entries per block, 8 should ensure no padding | |
static_assert((BN & (BN - 1)) == 0); // should be power of two | |
// internal base class for Hset and Hmap | |
template <typename Block, typename Key, typename Val, typename HE = HfEq> | |
class Htbl | |
{ | |
static_assert(sizeof(Block) <= BN * 17); // limit inline size | |
protected: | |
Htbl() : blocks(nullptr), isize(0) | |
{} | |
Htbl(int cap) | |
{ | |
cap = cap + cap / 8; // probelimit will usually hit first | |
verify(cap < primes[nprimes - 1]); | |
for (isize = 0; primes[isize] < cap; ++isize) | |
{ } | |
init(); | |
} | |
public: | |
// delete the key, returns true if found, false if not | |
bool del(Key k) | |
{ | |
if (!siz) | |
return false; | |
int i = index_for_key(k); | |
for (int d = 0; d <= distance(i); ++d, ++i) | |
if (HE::keyeq(k, key(i))) | |
{ | |
// shift following elements while it improves placement | |
for (; distance(i + 1) > 0; ++i) | |
pull(i); // move i+1 to i | |
set_distance(i, EMPTY); | |
set_key(i, {}); // help garbage collection | |
--siz; | |
return true; | |
} | |
return false; // not found | |
} | |
int size() | |
{ | |
return siz; | |
} | |
class iterator | |
{ | |
public: | |
iterator(Htbl* h, int i_, int n_) : htbl(h), i(i_), n(n_) | |
{} | |
auto operator*() | |
{ return htbl->key(i); } | |
iterator& operator++() | |
{ | |
for (++i; i < n; ++i) | |
if (htbl->distance(i) != EMPTY) | |
break; | |
return *this; | |
} | |
bool operator==(const iterator& other) const | |
{ return htbl == other.htbl && i == other.i; } | |
protected: | |
Htbl* htbl; | |
int i; | |
int n; | |
}; | |
iterator begin() | |
{ | |
iterator x(this, -1, nslots()); | |
return ++x; | |
} | |
iterator end() | |
{ | |
return iterator(this, nslots(), nslots()); | |
} | |
protected: | |
void init() | |
{ | |
verify(isize < nprimes); | |
blocks = new Block[nblocks()]; | |
} | |
int nslots() | |
{ | |
// allocate an extra probelimit slots so we don't need bounds checks | |
return primes[isize] + probelimit(); | |
} | |
int nblocks() | |
{ | |
return ((nslots() - 1) / BN) + 1; | |
} | |
void insert(Key k, Val v, bool overwrite) | |
{ | |
if (!blocks) | |
init(); | |
if (siz >= primes[isize] - primes[isize] / 8) | |
grow("load"); | |
for (;;) | |
{ | |
int i = index_for_key(k); | |
for (int d = 0; d < probelimit(); ++d, ++i) | |
{ | |
if (distance(i) == EMPTY) | |
{ | |
set_distance(i, d); | |
set_key(i, k); | |
set_val(i, v); | |
++siz; | |
return ; | |
} | |
if (HE::keyeq(k, key(i))) | |
{ | |
if (overwrite) | |
set_val(i, v); | |
return ; // found existing | |
} | |
// else collision | |
if (distance(i) < d) | |
swap(i, d, k, v); | |
} | |
grow("limit"); | |
} | |
} | |
// swap [i] and d,k,v | |
void swap(int i, int& d, Key& k, Val& v) | |
{ | |
int dtmp = distance(i); | |
set_distance(i, d); | |
d = dtmp; | |
Key ktmp = key(i); | |
set_key(i, k); | |
k = ktmp; | |
Val vtmp = val(i); | |
set_val(i, v); | |
v = vtmp; | |
} | |
// if key is found, returns index, else -1 | |
template <typename T> | |
int find(T k) | |
{ | |
if (!siz) | |
return -1; | |
int i = index_for_key(k); | |
for (int d = 0; d <= distance(i); ++d, ++i) | |
if (HE::keyeq(k, key(i))) | |
return i; | |
return -1; | |
} | |
// move [i+1] to [i] - used by delete | |
void pull(int i) | |
{ | |
set_distance(i, distance(i + 1) - 1); // distance decreases | |
set_key(i, key(i + 1)); | |
set_val(i, val(i + 1)); | |
} | |
void grow(const char* which) | |
{ | |
// check that we're not growing too much | |
verify(siz > primes[isize] / 4); | |
Block* oldblocks = blocks; | |
int oldnb = nblocks(); | |
++isize; | |
init(); | |
int oldsiz = siz; | |
siz = 0; | |
for (int b = 0; b < oldnb; ++b) | |
{ | |
Block& blk = oldblocks[b]; | |
for (int i = 0; i < BN; ++i) | |
if (blk.distance[i] != 0) // NOTE: unadjusted so empty is 0 | |
{ | |
fastput(blk.keys[i], blk.val(i)); | |
} | |
} | |
verify(siz == oldsiz); | |
memset(oldblocks, 0, oldnb * sizeof (Block)); // help garbage collection | |
} | |
void fastput(Key k, Val v) | |
{ | |
// should be the same as the core of insert | |
// but no duplicate checks and no growing | |
int i = index_for_key(k); | |
for (int d = 0; d < probelimit(); ++d, ++i) | |
{ | |
if (distance(i) == EMPTY) | |
{ | |
set_distance(i, d); | |
set_key(i, k); | |
set_val(i, v); | |
++siz; | |
return ; | |
} | |
if (distance(i) < d) | |
swap(i, d, k, v); | |
} | |
error("grow failed"); | |
} | |
// distance is stored offset by 1 so zeroed data is EMPTY (-1) | |
int8_t distance(int i) | |
{ return blocks[i / BN].distance[i % BN] - 1; } | |
void set_distance(int i, int8_t d) | |
{ blocks[i / BN].distance[i % BN] = d + 1; } | |
Key& key(int i) | |
{ return blocks[i / BN].keys[i % BN]; } | |
void set_key(int i, Key k) | |
{ blocks[i / BN].keys[i % BN] = k; } | |
Val val(int i) | |
{ return blocks[i / BN].val(i % BN); } | |
void set_val(int i, Val v) | |
{ return blocks[i / BN].set_val(i % BN, v); } | |
int probelimit() | |
{ return isize <= 1 ? 3 : isize + 4; } // log2(n) | |
static constexpr const int primes[] = { | |
16 - 3, // 13 + 3 (probelimit) = 16 | |
32 + 5, // 37 + 3 (probelimit) = 40 | |
64 - 3, | |
128 - 1, | |
256 + 1, | |
512 - 3, | |
1024 - 3, | |
2048 + 5, | |
4096 - 3, | |
8092 - 1, | |
16384 - 3, | |
32768 + 3, | |
65536 - 15 | |
}; | |
template <typename T> | |
int index_for_key(T k) | |
{ | |
size_t h = HE::hashfn(k); | |
// return h % primes[isize]; // BUT % is slow, % of constant is faster | |
switch (isize) | |
{ | |
case 0: return h % primes[0]; | |
case 1: return h % primes[1]; | |
case 2: return h % primes[2]; | |
case 3: return h % primes[3]; | |
case 4: return h % primes[4]; | |
case 5: return h % primes[5]; | |
case 6: return h % primes[6]; | |
case 7: return h % primes[7]; | |
case 8: return h % primes[8]; | |
case 9: return h % primes[9]; | |
case 10: return h % primes[10]; | |
case 11: return h % primes[11]; | |
case 12: return h % primes[12]; | |
default: unreachable(); | |
} | |
} | |
static const int nprimes = sizeof primes / sizeof(int); | |
static const int8_t EMPTY = -1; | |
Block* blocks; | |
uint16_t isize; | |
uint16_t siz = 0; // current number of items | |
}; | |
static_assert(sizeof(Htbl<int, int, int > ) == 8); | |
template <typename K> | |
struct HsetBlock | |
{ | |
bool val(int) | |
{ return true; } | |
void set_val(int, bool) | |
{} | |
int8_t distance[BN]; | |
K keys[BN]; | |
}; | |
// specifies bool for value, but val/set_val are nop | |
template <typename K, typename HE = HfEq> | |
class Hset : public Htbl<HsetBlock<K>, K, bool, HE> | |
{ | |
public: | |
using Tbl = Htbl<HsetBlock<K>, K, bool, HE>; | |
Hset() = default; | |
Hset(int cap) : Tbl(cap) | |
{} | |
template <typename T> | |
K* get(T k) | |
{ | |
int i = find(k); | |
return i < 0 ? nullptr : &key(i); | |
} | |
void put(K k) | |
{ insert(k, true, false); } | |
}; | |
template <typename K> | |
Ostream& operator<<(Ostream& os, Hset<K>& hset) | |
{ | |
os << "{"; | |
auto sep = ""; | |
for (auto k : hset) | |
{ | |
os << sep << k; | |
sep = ", "; | |
} | |
os << "}"; | |
return os; | |
} | |
template <typename K, typename V> | |
struct HmapBlock | |
{ | |
// ReSharper disable once Cpp | |
V val(int i) | |
{ return vals[i]; } | |
void set_val(int i, V v) | |
{ vals[i] = v; } | |
int8_t distance[BN]; | |
K keys[BN]; | |
V vals[BN]; | |
}; | |
template <typename K, typename V, typename HE = HfEq> | |
class Hmap : public Htbl<HmapBlock<K, V>, K, V, HE> | |
{ | |
public: | |
using Tbl = Htbl<HmapBlock<K, V>, K, V, HE>; | |
using Iter = Tbl::iterator; | |
Hmap() = default; | |
Hmap(int cap) : Tbl(cap) | |
{} | |
template <typename T> | |
V* get(T k) | |
{ | |
int i = find(k); | |
return i < 0 ? nullptr : &valref(i); | |
} | |
V& operator[](K k) | |
{ | |
insert(k, {}, false); | |
return *get(k); | |
} | |
void put(K k, V v) | |
{ | |
insert(k, v, true); | |
} | |
V& valref(int i) | |
{ return blocks[i / BN].vals[i % BN]; } | |
class iterator : public Iter | |
{ | |
public: | |
iterator(typename Iter it) : Iter(it) | |
{} | |
auto operator*() | |
{ return make_pair(Iter::operator*(), ((Hmap*) htbl)->val(i)); } | |
}; | |
iterator begin() | |
{ return iterator(Tbl::begin()); } | |
iterator end() | |
{ return iterator(Tbl::end()); } | |
}; | |
template <typename K, typename V> | |
Ostream& operator<<(Ostream& os, Hmap<K,V>& hmap) | |
{ | |
os << "{"; | |
auto sep = ""; | |
for (auto[k, v] : hmap) | |
{ | |
os << sep << k << ": " << v; | |
sep = ", "; | |
} | |
os << "}"; | |
return os; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment