pervognsen/simd_bucket_hash_discrim.c

## simd_bucket_hash_discrim.c
// The two sweetspots are 8-bit and 4-bit tags. In the latter case you can fit 14 32-bit keys in
// a cacheline-sized bucket and still have one leftover byte for metadata.

// As for how to choose tags for particular problems, Google's Swiss table and Facebook's F14 table
// both use hash bits that weren't used for the bucket indexing. This is ideal from an entropy perspective
// but it can be better to use some particular feature of the key that you'd otherwise check against anyway.
// For example, for 32-bit keys (with a usable sentinel value) you could use the 8 low bits as the tag
// while storing the remaining 24 bits in the rest of the bucket; that fits 16 keys per bucket. Or if the keys
// are strings you could store the length as the discriminator: with an 8-bit tag, 0 means an empty slot,
// 1..254 means a string of that length, and 255 means a string of length 255 or longer. With a 4-bit tag
// you'd be able to perfectly discriminate string lengths of length 1..14 which covers 90% of identifiers
// in most programs, so that could be ideal for name interning in a compiler.

typedef __m128i uint8x16_t;

#if TAG_BITS == 8
uint8x16_t unpack(void *ptr) {
    return _mm_loadu_epi8(ptr);
}
#elif TAG_BITS == 4
uint8x16_t unpack(void *ptr) {
    _m128i tags = _mm_loadu_epi8(ptr);
    _m128i tags0 = _mm_and_si128(tags, _mm_set_epi64x(0, 0x0F0F0F0F0F0F0F0Full));
    _m128i tags1 = _mm_and_si128(_mm_srli_si128(tags, 4), _mm_set_epi64x(0, 0x0F0F0F0F0F0F0F0Full));
    return _mm_unpacklo_epi8(tags0, tags1);
}
#else
#error Tag must have 4 or 8 bits.
#endif

uint16_t match(uint8x16_t tags, uint8_t tag) {
    return _mm_movemask_epi8(_mm_cmpeq_epi8(tags, _mm_set1_epi8(tag)));
}

uint16_t rotate(uint16_t mask, uint32_t i) {
    return (mask >> i) | (mask << (BUCKET_KEYS - i));
}

uint16_t next_match(uint16_t mask) {
    return mask & (mask - 1);
}

uint16_t match_index(uint16_t mask) {
    return __builtin_ctz(mask);
}

#pragma align(64)
typedef struct {
    uint8_t tags[16 / (8 / TAG_BITS)];
    Key keys[BUCKET_KEYS]; // BUCKET_KEYS must be <= 16
} Bucket;

typedef struct {
    Bucket *buckets;
    uint32_t mask;
} Table;

INLINE // Inline the scalar fast path and allow the vectorized probe loop to be outlined.
int get(Table *table, uint32_t hash, uint8_t tag, Key key) {
    uint32_t i = hash & table->mask, k = hash >> 28;
    if (k >= BUCKET_KEYS) k -= BUCKET_KEYS; // One reduction is enough when k has clog2(BUCKET_KEYS) bits.
    Bucket *b = &table->buckets[i];
    // Scalar probe to reduce the latency for the hole-in-one case. If the tag isn't redundant you need to check that too.
    if (likely(b->keys[k] == key)) return k + i * BUCKET_KEYS;
    return probe(table, b, tag, key, i, k);
}

int probe(Table *table, Bucket *b, uint8_t tag, Key key, uint32_t i, uint32_t k) {
    uint32_t d = 1;
    for (;;) {
        uint8x16_t t = unpack(b->tags);
        // Intra-bucket linear probing with vectorized tag filtering.
        for (uint32_t m = rotate(match(t, tag), k); likely(m != 0); m = next_match(m)) {
            uint32_t j = match_index(m) + k;
            if (unlikely(j >= BUCKET_KEYS)) {
                // Handle wrap-around by tail duplication and loop rotation.
                j -= BUCKET_KEYS;
                uint32_t k2 = k - BUCKET_KEYS;
                for (;;) {
                    if (likely(b->keys[j] == key)) return j + i * BUCKET_KEYS;
                    m = next_match(m);
                    if (unlikely(m == 0)) goto not_found;
                    j = match_index(m) + k2;
                }
            }
           if (likely(b->keys[j] == key)) return j + i * BUCKET_KEYS;
        }
        not_found:
        if (likely(match(t, EMPTY))) return -1;
        i = (i + d++) & table->mask; // Inter-bucket triangular probing.
        b = &table->buckets[i];
    }
}
	// The two sweetspots are 8-bit and 4-bit tags. In the latter case you can fit 14 32-bit keys in
	// a cacheline-sized bucket and still have one leftover byte for metadata.

	// As for how to choose tags for particular problems, Google's Swiss table and Facebook's F14 table
	// both use hash bits that weren't used for the bucket indexing. This is ideal from an entropy perspective
	// but it can be better to use some particular feature of the key that you'd otherwise check against anyway.
	// For example, for 32-bit keys (with a usable sentinel value) you could use the 8 low bits as the tag
	// while storing the remaining 24 bits in the rest of the bucket; that fits 16 keys per bucket. Or if the keys
	// are strings you could store the length as the discriminator: with an 8-bit tag, 0 means an empty slot,
	// 1..254 means a string of that length, and 255 means a string of length 255 or longer. With a 4-bit tag
	// you'd be able to perfectly discriminate string lengths of length 1..14 which covers 90% of identifiers
	// in most programs, so that could be ideal for name interning in a compiler.

	typedef __m128i uint8x16_t;

	#if TAG_BITS == 8
	uint8x16_t unpack(void *ptr) {
	return _mm_loadu_epi8(ptr);
	}
	#elif TAG_BITS == 4
	uint8x16_t unpack(void *ptr) {
	_m128i tags = _mm_loadu_epi8(ptr);
	_m128i tags0 = _mm_and_si128(tags, _mm_set_epi64x(0, 0x0F0F0F0F0F0F0F0Full));
	_m128i tags1 = _mm_and_si128(_mm_srli_si128(tags, 4), _mm_set_epi64x(0, 0x0F0F0F0F0F0F0F0Full));
	return _mm_unpacklo_epi8(tags0, tags1);
	}
	#else
	#error Tag must have 4 or 8 bits.
	#endif

	uint16_t match(uint8x16_t tags, uint8_t tag) {
	return _mm_movemask_epi8(_mm_cmpeq_epi8(tags, _mm_set1_epi8(tag)));
	}

	uint16_t rotate(uint16_t mask, uint32_t i) {
	return (mask >> i) \| (mask << (BUCKET_KEYS - i));
	}

	uint16_t next_match(uint16_t mask) {
	return mask & (mask - 1);
	}

	uint16_t match_index(uint16_t mask) {
	return __builtin_ctz(mask);
	}

	#pragma align(64)
	typedef struct {
	uint8_t tags[16 / (8 / TAG_BITS)];
	Key keys[BUCKET_KEYS]; // BUCKET_KEYS must be <= 16
	} Bucket;

	typedef struct {
	Bucket *buckets;
	uint32_t mask;
	} Table;

	INLINE // Inline the scalar fast path and allow the vectorized probe loop to be outlined.
	int get(Table *table, uint32_t hash, uint8_t tag, Key key) {
	uint32_t i = hash & table->mask, k = hash >> 28;
	if (k >= BUCKET_KEYS) k -= BUCKET_KEYS; // One reduction is enough when k has clog2(BUCKET_KEYS) bits.
	Bucket *b = &table->buckets[i];
	// Scalar probe to reduce the latency for the hole-in-one case. If the tag isn't redundant you need to check that too.
	if (likely(b->keys[k] == key)) return k + i * BUCKET_KEYS;
	return probe(table, b, tag, key, i, k);
	}

	int probe(Table table, Bucket b, uint8_t tag, Key key, uint32_t i, uint32_t k) {
	uint32_t d = 1;
	for (;;) {
	uint8x16_t t = unpack(b->tags);
	// Intra-bucket linear probing with vectorized tag filtering.
	for (uint32_t m = rotate(match(t, tag), k); likely(m != 0); m = next_match(m)) {
	uint32_t j = match_index(m) + k;
	if (unlikely(j >= BUCKET_KEYS)) {
	// Handle wrap-around by tail duplication and loop rotation.
	j -= BUCKET_KEYS;
	uint32_t k2 = k - BUCKET_KEYS;
	for (;;) {
	if (likely(b->keys[j] == key)) return j + i * BUCKET_KEYS;
	m = next_match(m);
	if (unlikely(m == 0)) goto not_found;
	j = match_index(m) + k2;
	}
	}
	if (likely(b->keys[j] == key)) return j + i * BUCKET_KEYS;
	}
	not_found:
	if (likely(match(t, EMPTY))) return -1;
	i = (i + d++) & table->mask; // Inter-bucket triangular probing.
	b = &table->buckets[i];
	}
	}