Skip to content

Instantly share code, notes, and snippets.

@kprotty
Created August 8, 2023 15:28
Show Gist options
  • Save kprotty/67e591fbaa2d3f23764032d19315a0ad to your computer and use it in GitHub Desktop.
Save kprotty/67e591fbaa2d3f23764032d19315a0ad to your computer and use it in GitHub Desktop.
#include <cstdint>
#include <cstddef>
#include <cstring>
#include <cassert>
#include <x86intrin.h>
typedef struct {
__m256i state[2];
__m256i counter;
__m256i output;
} shishua_rng;
static inline void shishua_rng_init(shishua_rng* rng) {
uint64_t s0[4] = {0xb16739c6ab008a82ULL, 0xb9b1d61724fd10caULL, 0x700ce2d32bbd85f4ULL, 0x2517e0dbea93e94cULL};
uint64_t s1[4] = {0x8fb19220c769a0f4ULL, 0xf8853f8d9e74c5a2ULL, 0x56f8e1aee7550718ULL, 0x71ccfaab44595151ULL};
uint64_t count[4] = {28, 20, 12, 4};
rng->state[0] = _mm256_loadu_si256((const __m256i*)&s0);
rng->state[1] = _mm256_loadu_si256((const __m256i*)&s1);
rng->counter = _mm256_loadu_si256((const __m256i*)&count);
rng->output = rng->state[1];
}
static inline void shishua_rng_update(shishua_rng *rng) {
rng->state[1] = _mm256_add_epi64(rng->state[1], rng->counter);
rng->counter = _mm256_add_epi64(rng->counter, _mm256_set_epi64x(1, 3, 5, 7));
__m256i u0 = _mm256_srli_epi64(rng->state[0], 1);
__m256i u1 = _mm256_srli_epi64(rng->state[0], 3);
__m256i t0 = _mm256_permutevar8x32_epi32(rng->state[0], _mm256_set_epi32(4, 3, 2, 1, 0, 7, 6, 5));
__m256i t1 = _mm256_permutevar8x32_epi32(rng->state[1], _mm256_set_epi32(2, 1, 0, 7, 6, 5, 4, 3));
rng->state[0] = _mm256_add_epi64(t0, u0);
rng->state[1] = _mm256_add_epi64(t1, u1);
rng->output = _mm256_xor_si256(t1, u0);
}
static inline uint32_t avalanche(uint64_t x) {
return (x * 11400714819323198485ULL) >> 32;
}
static inline void nh_round(__m256i *acc, __m256i m, __m256i r) {
__m256i mixed = _mm256_add_epi32(m, r);
__m256i hi = _mm256_srli_epi64(mixed, 32);
__m256i merged = _mm256_mul_epu32(mixed, hi);
*acc = _mm256_add_epi64(*acc, merged);
}
uint32_t nh_hash(const char *ptr, size_t len) {
shishua_rng rng;
shishua_rng_init(&rng);
if (__builtin_expect(len <= 16, 1)) {
if (__builtin_expect(len <= 3, 1)) {
if (len == 0) return 0;
uint32_t word;
char bytes[4] = {ptr[0], ptr[len / 2], ptr[len - 1], (char)len};
memcpy(&word, &bytes, 4);
return avalanche(word);
}
uint32_t m[4];
memcpy(&m[0], ptr, 4);
memcpy(&m[1], ptr + ((len / 8) * 2), 4);
memcpy(&m[2], ptr + len - 4, 4);
memcpy(&m[3], ptr + len - 4 - ((len / 8) * 2), 4);
uint32_t r[4];
memcpy(&r, &rng.output, sizeof(r));
uint64_t a = ((uint64_t)(m[0] + r[0])) * (m[1] + r[1]);
uint64_t b = ((uint64_t)(m[2] + r[2])) * (m[3] + r[3]);
return avalanche(a + b);
}
__m256i m, acc = _mm256_setzero_si256();
while (len > 32) {
m = _mm256_loadu_si256((const __m256i*)ptr);
nh_round(&acc, m, rng.output);
shishua_rng_update(&rng);
ptr += 32;
len -= 32;
}
__m128i a = _mm_loadu_si128((const __m128i*)ptr);
__m128i b = _mm_loadu_si128((const __m128i*)(ptr + len - 16));
nh_round(&acc, _mm256_set_m128i(a, b), rng.output);
// __builtin_reduce_add
a = _mm256_extracti128_si256(acc, 0);
b = _mm256_extracti128_si256(acc, 1);
a = _mm_add_epi64(a, b);
uint64_t result = 0;
result += _mm_extract_epi64(a, 0);
result += _mm_extract_epi64(a, 1);
return avalanche(result);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment