Last active
August 4, 2022 17:00
-
-
Save skeeto/0ec21f9c6d9ea4e72ac6da3c4fe29ce0 to your computer and use it in GitHub Desktop.
Hash set demonstration
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// cc -g -maes -o example example.c | |
// https://old.reddit.com/r/C_Programming/comments/w71gmw | |
#include <assert.h> | |
#include <stdint.h> | |
#include <stdio.h> | |
#include <string.h> | |
#if __AES__ || (_MSC_VER && _M_AMD64) | |
# include <smmintrin.h> | |
# include <wmmintrin.h> | |
#endif | |
#define MAX 50 | |
#define HT_LEN 256 | |
#define STR(s) (struct str){s} | |
struct str { unsigned char buf[16]; }; | |
// Compute a 32-bit hash from a 16-byte input string. | |
static uint32_t | |
hash(struct str s) | |
{ | |
#if __AES__ || (_MSC_VER && _M_AMD64) | |
// "Encrypt" the string with two rounds of AES | |
__m128i h, k; | |
k = _mm_loadu_si128((void *)s.buf); | |
h = _mm_aesenc_si128(k, k); | |
h = _mm_aesenc_si128(h, k); | |
return _mm_extract_epi32(h, 0); | |
#else | |
// Portable fallback hash | |
uint64_t a, b; | |
memcpy(&a, s.buf+0, 8); | |
memcpy(&b, s.buf+8, 8); | |
a *= 1111111111111111111; | |
a ^= a >> 33; | |
a += b; | |
a *= 1111111111111111111; | |
a ^= a >> 33; | |
return (uint32_t)a; | |
#endif | |
} | |
// Returns the next hash table index for this hash. Use negative i for | |
// the first call. | |
static int | |
ht_next(uint32_t hash, int i) | |
{ | |
int mask = HT_LEN - 1; | |
int step = (hash>>16 | 1) & mask; | |
return i<0 ? (int)(hash&mask) : (i + step)&mask; | |
} | |
#define COMPUTE_INIT {{{{0}}}, 0, {0}, 0} | |
struct compute { | |
struct str stack[MAX]; | |
int stack_len; | |
int ht[HT_LEN]; | |
int ht_len; | |
}; | |
// Clear the hash table and rebuild it from the stack. | |
static void | |
compute_rehash(struct compute *c) | |
{ | |
// TODO: Consider rekeying the hash table by using a counter as a | |
// hash key and incrementing it upon rehash. This would help break | |
// out of pathological, high collision input sets, whether malicious | |
// or not. | |
memset(c->ht, 0, sizeof(c->ht)); | |
for (int si = 0; si < c->stack_len; si++) { | |
uint32_t h = hash(c->stack[si]); | |
for (int i = -1;;) { | |
i = ht_next(h, i); | |
if (!c->ht[i]) { | |
c->ht[i] = si + 1; | |
break; | |
} | |
} | |
} | |
c->ht_len = c->stack_len; | |
} | |
// Try to push a new string onto the stack, returning 1 on success. | |
static int | |
compute_push(struct compute *c, struct str s) | |
{ | |
assert(c->stack_len < MAX); | |
assert(s.buf[15] == 0); | |
uint32_t h = hash(s); | |
int tombstone = -1; | |
for (int i = -1;;) { | |
i = ht_next(h, i); | |
int si = c->ht[i] - 1; | |
// Empty slot? | |
if (si == -1) { | |
c->stack[c->stack_len++] = s; | |
if (tombstone < 0) { | |
if (c->ht_len > HT_LEN/2) { | |
compute_rehash(c); | |
} else { | |
c->ht[i] = c->stack_len; | |
c->ht_len++; | |
} | |
} else { | |
c->ht[tombstone] = c->stack_len; | |
} | |
return 1; | |
} | |
// Already present? | |
if (si < c->stack_len && !memcmp(s.buf, c->stack[si].buf, 16)) { | |
return 0; | |
} | |
// First tombstone found? | |
if (tombstone < 0 && si >= c->stack_len) { | |
tombstone = i; | |
} | |
} | |
} | |
// Print the current compute state (for debugging). | |
static void | |
compute_print(struct compute *c) | |
{ | |
puts("stack:"); | |
for (int i = 0; i < c->stack_len; i++) { | |
printf("\t%d\t%s\n", i, c->stack[i].buf); | |
} | |
puts("hashtable:"); | |
for (int i = 0; i < HT_LEN; i++) { | |
if (c->ht[i]) { | |
char *note = c->ht[i]<=c->stack_len ? "" : " (tombstone)"; | |
printf("\t%d\t%s%s\n", i, c->stack[c->ht[i]-1].buf, note); | |
} | |
} | |
putchar('\n'); | |
} | |
// Remove the most recently-added item from the compute state. | |
static void | |
compute_backtrack(struct compute *c) | |
{ | |
assert(c->stack_len); | |
c->stack_len--; | |
} | |
int main(void) | |
{ | |
struct compute c = COMPUTE_INIT; | |
struct str test[] = { | |
{"hello"}, | |
{"world"}, | |
{"hello"}, | |
{"world"}, | |
{"example"}, | |
}; | |
for (int i = 0; i < (int)(sizeof(test)/sizeof(*test)); i++) { | |
printf("push(%s)\t= %d\n", test[i].buf, compute_push(&c, test[i])); | |
} | |
compute_print(&c); | |
puts("backtrack()"); | |
compute_backtrack(&c); | |
compute_print(&c); | |
puts("rehash()"); | |
compute_rehash(&c); | |
compute_print(&c); | |
puts("backtrack()"); | |
compute_backtrack(&c); | |
compute_print(&c); | |
// "copped" collides with "world" with HT_LEN==256 and little endian | |
struct str collision = STR("copped"); | |
printf("push(%s)\t= %d\n", collision.buf, compute_push(&c, collision)); | |
compute_print(&c); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment