Skip to content

Instantly share code, notes, and snippets.

@apmckinlay
Created January 7, 2018 23:03
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save apmckinlay/fab919d5c2b49825533f5117bd17939a to your computer and use it in GitHub Desktop.
Save apmckinlay/fab919d5c2b49825533f5117bd17939a to your computer and use it in GitHub Desktop.
hash table implementation
#pragma once
/*
* Hmap and Hset are hash based maps and sets
* implemented with: prime table sizes, open addressing, linear probing,
* robin hood hashing, and resize on probe limit.
* Htbl is the common code, it is not intended for external use.
* To avoid padding it uses an array of blocks each with BN (4) entries.
* But we still treat it as a single flat array.
* Within blocks keys, values, and distances are separate arrays to avoid padding.
* Uses uint16 for size so limited to 64k slots or about 48k elements.
* Keys and values are assumed to be small and so are passed by value.
* They are also moved by simple assignment
* so should not have complex constructors or assignment operators.
* NOTE: intended for use with garbage collection - does not run destructors.
* Hashing and key equality functions can be specified with a HfEq type
* or more "globally" by defining hashfn and keyeq for your type.
* Allows lookup by a different type from the key,
* as long as there are compatible hashfn(T) and keyeq(T, Key)
* See: https://probablydance.com/2017/02/26/i-wrote-the-fastest-hashtable/
*/
#include "except.h"
#include <utility>
#include <stdint.h>
using std::make_pair;
class Ostream;
struct HfEq
{
template <typename Key>
static size_t hashfn(Key k)
{ return ::hashfn(k); }
template <typename X, typename Y>
static bool keyeq(X x, Y y)
{ return ::keyeq(x, y); }
};
inline size_t hashfn(int k)
{ return k; }
inline bool keyeq(int x, int y)
{ return x == y; }
const int BN = 8; // number of entries per block, 8 should ensure no padding
static_assert((BN & (BN - 1)) == 0); // should be power of two
// internal base class for Hset and Hmap
template <typename Block, typename Key, typename Val, typename HE = HfEq>
class Htbl
{
static_assert(sizeof(Block) <= BN * 17); // limit inline size
protected:
Htbl() : blocks(nullptr), isize(0)
{}
Htbl(int cap)
{
cap = cap + cap / 8; // probelimit will usually hit first
verify(cap < primes[nprimes - 1]);
for (isize = 0; primes[isize] < cap; ++isize)
{ }
init();
}
public:
// delete the key, returns true if found, false if not
bool del(Key k)
{
if (!siz)
return false;
int i = index_for_key(k);
for (int d = 0; d <= distance(i); ++d, ++i)
if (HE::keyeq(k, key(i)))
{
// shift following elements while it improves placement
for (; distance(i + 1) > 0; ++i)
pull(i); // move i+1 to i
set_distance(i, EMPTY);
set_key(i, {}); // help garbage collection
--siz;
return true;
}
return false; // not found
}
int size()
{
return siz;
}
class iterator
{
public:
iterator(Htbl* h, int i_, int n_) : htbl(h), i(i_), n(n_)
{}
auto operator*()
{ return htbl->key(i); }
iterator& operator++()
{
for (++i; i < n; ++i)
if (htbl->distance(i) != EMPTY)
break;
return *this;
}
bool operator==(const iterator& other) const
{ return htbl == other.htbl && i == other.i; }
protected:
Htbl* htbl;
int i;
int n;
};
iterator begin()
{
iterator x(this, -1, nslots());
return ++x;
}
iterator end()
{
return iterator(this, nslots(), nslots());
}
protected:
void init()
{
verify(isize < nprimes);
blocks = new Block[nblocks()];
}
int nslots()
{
// allocate an extra probelimit slots so we don't need bounds checks
return primes[isize] + probelimit();
}
int nblocks()
{
return ((nslots() - 1) / BN) + 1;
}
void insert(Key k, Val v, bool overwrite)
{
if (!blocks)
init();
if (siz >= primes[isize] - primes[isize] / 8)
grow("load");
for (;;)
{
int i = index_for_key(k);
for (int d = 0; d < probelimit(); ++d, ++i)
{
if (distance(i) == EMPTY)
{
set_distance(i, d);
set_key(i, k);
set_val(i, v);
++siz;
return ;
}
if (HE::keyeq(k, key(i)))
{
if (overwrite)
set_val(i, v);
return ; // found existing
}
// else collision
if (distance(i) < d)
swap(i, d, k, v);
}
grow("limit");
}
}
// swap [i] and d,k,v
void swap(int i, int& d, Key& k, Val& v)
{
int dtmp = distance(i);
set_distance(i, d);
d = dtmp;
Key ktmp = key(i);
set_key(i, k);
k = ktmp;
Val vtmp = val(i);
set_val(i, v);
v = vtmp;
}
// if key is found, returns index, else -1
template <typename T>
int find(T k)
{
if (!siz)
return -1;
int i = index_for_key(k);
for (int d = 0; d <= distance(i); ++d, ++i)
if (HE::keyeq(k, key(i)))
return i;
return -1;
}
// move [i+1] to [i] - used by delete
void pull(int i)
{
set_distance(i, distance(i + 1) - 1); // distance decreases
set_key(i, key(i + 1));
set_val(i, val(i + 1));
}
void grow(const char* which)
{
// check that we're not growing too much
verify(siz > primes[isize] / 4);
Block* oldblocks = blocks;
int oldnb = nblocks();
++isize;
init();
int oldsiz = siz;
siz = 0;
for (int b = 0; b < oldnb; ++b)
{
Block& blk = oldblocks[b];
for (int i = 0; i < BN; ++i)
if (blk.distance[i] != 0) // NOTE: unadjusted so empty is 0
{
fastput(blk.keys[i], blk.val(i));
}
}
verify(siz == oldsiz);
memset(oldblocks, 0, oldnb * sizeof (Block)); // help garbage collection
}
void fastput(Key k, Val v)
{
// should be the same as the core of insert
// but no duplicate checks and no growing
int i = index_for_key(k);
for (int d = 0; d < probelimit(); ++d, ++i)
{
if (distance(i) == EMPTY)
{
set_distance(i, d);
set_key(i, k);
set_val(i, v);
++siz;
return ;
}
if (distance(i) < d)
swap(i, d, k, v);
}
error("grow failed");
}
// distance is stored offset by 1 so zeroed data is EMPTY (-1)
int8_t distance(int i)
{ return blocks[i / BN].distance[i % BN] - 1; }
void set_distance(int i, int8_t d)
{ blocks[i / BN].distance[i % BN] = d + 1; }
Key& key(int i)
{ return blocks[i / BN].keys[i % BN]; }
void set_key(int i, Key k)
{ blocks[i / BN].keys[i % BN] = k; }
Val val(int i)
{ return blocks[i / BN].val(i % BN); }
void set_val(int i, Val v)
{ return blocks[i / BN].set_val(i % BN, v); }
int probelimit()
{ return isize <= 1 ? 3 : isize + 4; } // log2(n)
static constexpr const int primes[] = {
16 - 3, // 13 + 3 (probelimit) = 16
32 + 5, // 37 + 3 (probelimit) = 40
64 - 3,
128 - 1,
256 + 1,
512 - 3,
1024 - 3,
2048 + 5,
4096 - 3,
8092 - 1,
16384 - 3,
32768 + 3,
65536 - 15
};
template <typename T>
int index_for_key(T k)
{
size_t h = HE::hashfn(k);
// return h % primes[isize]; // BUT % is slow, % of constant is faster
switch (isize)
{
case 0: return h % primes[0];
case 1: return h % primes[1];
case 2: return h % primes[2];
case 3: return h % primes[3];
case 4: return h % primes[4];
case 5: return h % primes[5];
case 6: return h % primes[6];
case 7: return h % primes[7];
case 8: return h % primes[8];
case 9: return h % primes[9];
case 10: return h % primes[10];
case 11: return h % primes[11];
case 12: return h % primes[12];
default: unreachable();
}
}
static const int nprimes = sizeof primes / sizeof(int);
static const int8_t EMPTY = -1;
Block* blocks;
uint16_t isize;
uint16_t siz = 0; // current number of items
};
static_assert(sizeof(Htbl<int, int, int > ) == 8);
template <typename K>
struct HsetBlock
{
bool val(int)
{ return true; }
void set_val(int, bool)
{}
int8_t distance[BN];
K keys[BN];
};
// specifies bool for value, but val/set_val are nop
template <typename K, typename HE = HfEq>
class Hset : public Htbl<HsetBlock<K>, K, bool, HE>
{
public:
using Tbl = Htbl<HsetBlock<K>, K, bool, HE>;
Hset() = default;
Hset(int cap) : Tbl(cap)
{}
template <typename T>
K* get(T k)
{
int i = find(k);
return i < 0 ? nullptr : &key(i);
}
void put(K k)
{ insert(k, true, false); }
};
template <typename K>
Ostream& operator<<(Ostream& os, Hset<K>& hset)
{
os << "{";
auto sep = "";
for (auto k : hset)
{
os << sep << k;
sep = ", ";
}
os << "}";
return os;
}
template <typename K, typename V>
struct HmapBlock
{
// ReSharper disable once Cpp
V val(int i)
{ return vals[i]; }
void set_val(int i, V v)
{ vals[i] = v; }
int8_t distance[BN];
K keys[BN];
V vals[BN];
};
template <typename K, typename V, typename HE = HfEq>
class Hmap : public Htbl<HmapBlock<K, V>, K, V, HE>
{
public:
using Tbl = Htbl<HmapBlock<K, V>, K, V, HE>;
using Iter = Tbl::iterator;
Hmap() = default;
Hmap(int cap) : Tbl(cap)
{}
template <typename T>
V* get(T k)
{
int i = find(k);
return i < 0 ? nullptr : &valref(i);
}
V& operator[](K k)
{
insert(k, {}, false);
return *get(k);
}
void put(K k, V v)
{
insert(k, v, true);
}
V& valref(int i)
{ return blocks[i / BN].vals[i % BN]; }
class iterator : public Iter
{
public:
iterator(typename Iter it) : Iter(it)
{}
auto operator*()
{ return make_pair(Iter::operator*(), ((Hmap*) htbl)->val(i)); }
};
iterator begin()
{ return iterator(Tbl::begin()); }
iterator end()
{ return iterator(Tbl::end()); }
};
template <typename K, typename V>
Ostream& operator<<(Ostream& os, Hmap<K,V>& hmap)
{
os << "{";
auto sep = "";
for (auto[k, v] : hmap)
{
os << sep << k << ": " << v;
sep = ", ";
}
os << "}";
return os;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment