Skip to content

Instantly share code, notes, and snippets.

What would you like to do?
// The two sweetspots are 8-bit and 4-bit tags. In the latter case you can fit 14 32-bit keys in
// a cacheline-sized bucket and still have one leftover byte for metadata.
// As for how to choose tags for particular problems, Google's Swiss table and Facebook's F14 table
// both use hash bits that weren't used for the bucket indexing. This is ideal from an entropy perspective
// but it can be better to use some particular feature of the key that you'd otherwise check against anyway.
// For example, for 32-bit keys (with a usable sentinel value) you could use the 8 low bits as the tag
// while storing the remaining 24 bits in the rest of the bucket; that fits 16 keys per bucket. Or if the keys
// are strings you could store the length as the discriminator: with an 8-bit tag, 0 means an empty slot,
// 1..254 means a string of that length, and 255 means a string of length 255 or longer. With a 4-bit tag
// you'd be able to perfectly discriminate string lengths of length 1..14 which covers 90% of identifiers
// in most programs, so that could be ideal for name interning in a compiler.
typedef __m128i uint8x16_t;
#if TAG_BITS == 8
uint8x16_t unpack(void *ptr) {
return _mm_loadu_epi8(ptr);
#elif TAG_BITS == 4
uint8x16_t unpack(void *ptr) {
_m128i tags = _mm_loadu_epi8(ptr);
_m128i tags0 = _mm_and_si128(tags, _mm_set_epi64x(0, 0x0F0F0F0F0F0F0F0Full));
_m128i tags1 = _mm_and_si128(_mm_srli_si128(tags, 4), _mm_set_epi64x(0, 0x0F0F0F0F0F0F0F0Full));
return _mm_unpacklo_epi8(tags0, tags1);
#error Tag must have 4 or 8 bits.
uint16_t match(uint8x16_t tags, uint8_t tag) {
return _mm_movemask_epi8(_mm_cmpeq_epi8(tags, _mm_set1_epi8(tag)));
uint16_t rotate(uint16_t mask, uint32_t i) {
return (mask >> i) | (mask << (BUCKET_KEYS - i));
uint16_t next_match(uint16_t mask) {
return mask & (mask - 1);
uint16_t match_index(uint16_t mask) {
return __builtin_ctz(mask);
#pragma align(64)
typedef struct {
uint8_t tags[16 / (8 / TAG_BITS)];
Key keys[BUCKET_KEYS]; // BUCKET_KEYS must be <= 16
} Bucket;
typedef struct {
Bucket *buckets;
uint32_t mask;
} Table;
INLINE // Inline the scalar fast path and allow the vectorized probe loop to be outlined.
int get(Table *table, uint32_t hash, uint8_t tag, Key key) {
uint32_t i = hash & table->mask, k = hash >> 28;
if (k >= BUCKET_KEYS) k -= BUCKET_KEYS; // One reduction is enough when k has clog2(BUCKET_KEYS) bits.
Bucket *b = &table->buckets[i];
// Scalar probe to reduce the latency for the hole-in-one case. If the tag isn't redundant you need to check that too.
if (likely(b->keys[k] == key)) return k + i * BUCKET_KEYS;
return probe(table, b, tag, key, i, k);
int probe(Table *table, Bucket *b, uint8_t tag, Key key, uint32_t i, uint32_t k) {
uint32_t d = 1;
for (;;) {
uint8x16_t t = unpack(b->tags);
// Intra-bucket linear probing with vectorized tag filtering.
for (uint32_t m = rotate(match(t, tag), k); likely(m != 0); m = next_match(m)) {
uint32_t j = match_index(m) + k;
if (unlikely(j >= BUCKET_KEYS)) {
// Handle wrap-around by tail duplication and loop rotation.
uint32_t k2 = k - BUCKET_KEYS;
for (;;) {
if (likely(b->keys[j] == key)) return j + i * BUCKET_KEYS;
m = next_match(m);
if (unlikely(m == 0)) goto not_found;
j = match_index(m) + k2;
if (likely(b->keys[j] == key)) return j + i * BUCKET_KEYS;
if (likely(match(t, EMPTY))) return -1;
i = (i + d++) & table->mask; // Inter-bucket triangular probing.
b = &table->buckets[i];
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment