#include <stdlib.h>
#include <stdio.h>
#include <malloc.h>
#include <stdint.h>
#include <string.h>
#include <inttypes.h>
#include <stdbool.h>

#include "wyhash.h" // https://github.com/wangyi-fudan/wymlp/blob/master/wyhash.h

#define VALIDATE 0

#define HASH_BUCKET_PAGE_SIZE 8192
#define NUMBER_OF_HASH_BUCKET_PIECES 127
#define PIECE_BUCKET_BUFFER_SIZE 63
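
// Layout: each bucket is exactly one 8KB page, split into a 64-byte header
// (padded via the union below) and 127 pieces of 64 bytes each:
//     64 + 127 * 64 = 8192 == HASH_BUCKET_PAGE_SIZE
// A piece holds a one-byte length followed by up to 63 bytes of
// varint-encoded key/value pairs.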
typedef struct hash_bucket_piece {
    uint8_t bytes_used;
    uint8_t data[PIECE_BUCKET_BUFFER_SIZE];
} hash_bucket_piece_t;

typedef struct hash_bucket {
    union {
        struct {
            uint64_t number_of_entries;
            uint8_t depth;
            bool seen;
        };
        uint8_t _padding[64];
    };
    hash_bucket_piece_t pieces[NUMBER_OF_HASH_BUCKET_PIECES];
} hash_bucket_t;

typedef struct hash_directory {
    uint64_t number_of_entries;
    uint32_t number_of_buckets;
    uint32_t directory_pages;
    uint8_t depth;
    hash_bucket_t* buckets[]; // flexible array member, sized by directory_pages
} hash_directory_t;

typedef struct hash_ctx {
    void* (*allocate_page)(uint32_t n);
    void (*release_page)(void* p);
    hash_directory_t* dir;
} hash_ctx_t;
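
// The finalizer step of splitmix64. Note that in this test program the keys
// are used directly as hashes (see hash_table_bucket_number), so hash() is
// currently unused.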
uint64_t hash(uint64_t x) {
    x = (x ^ (x >> 30)) * UINT64_C(0xbf58476d1ce4e5b9);
    x = (x ^ (x >> 27)) * UINT64_C(0x94d049bb133111eb);
    x = x ^ (x >> 31);
    return x;
}
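
// LEB128-style varints: 7 bits of payload per byte, least significant group
// first; the high bit marks a continuation. For example, 300 (0b100101100)
// encodes as 0xAC 0x02. A uint64_t needs at most 10 bytes.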
void varint_decode(uint8_t** buf, uint64_t* val) {
    uint64_t result = 0;
    uint32_t shift = 0;
    uint8_t* ptr = *buf;
    uint8_t cur = 0;
    do {
        cur = *ptr++;
        result |= (uint64_t)(cur & 0x7f) << shift;
        shift += 7;
    } while (cur & 0x80);
    *val = result;
    *buf = ptr;
}

void varint_encode(uint64_t val, uint8_t** buf) {
    uint8_t* ptr = *buf;
    while (val >= 0x80) {
        *ptr++ = ((uint8_t)val | 0x80);
        val >>= 7;
    }
    *ptr++ = (uint8_t)(val);
    *buf = ptr;
}
void print_bits(FILE* fd, uint64_t v, int depth)
{
    uint8_t* b = (uint8_t*)&v;
    uint8_t byte;
    int i, j;
    int bits = sizeof(uint64_t) * 8;
    bool foundNonZero = false;
    for (i = sizeof(uint64_t) - 1; i >= 0; i--)
    {
        for (j = 7; j >= 0; j--)
        {
            bits--;
            byte = (b[i] >> j) & 1;
            if (byte) {
                foundNonZero = true;
            }
            // skip leading zeros, but always print at least 8 bits and
            // everything within the low `depth` bits
            if (!foundNonZero && bits > depth && bits > 8)
                continue;
            fprintf(fd, "%u", byte);
            if (bits == depth)
                fprintf(fd, " ");
        }
    }
}
void print_hash_stats(hash_ctx_t* ctx) {
    printf("Depth: %u - Entries: %" PRIu64 ", Buckets: %u\n", ctx->dir->depth, ctx->dir->number_of_entries, ctx->dir->number_of_buckets);
    for (size_t i = 0; i < ctx->dir->number_of_buckets; i++)
        ctx->dir->buckets[i]->seen = false;
    uint32_t min = 64, max = 0, total = 0, empties = 0;
    uint64_t sum = 0;
    for (size_t i = 0; i < ctx->dir->number_of_buckets; i++)
    {
        hash_bucket_t* b = ctx->dir->buckets[i];
        if (b->seen)
            continue;
        b->seen = true; // mark it, so a bucket shared by several directory entries counts once
        size_t total_used = 0;
        for (size_t j = 0; j < NUMBER_OF_HASH_BUCKET_PIECES; j++) {
            total_used += b->pieces[j].bytes_used;
            if (b->pieces[j].bytes_used)
                min = min < b->pieces[j].bytes_used ? min : b->pieces[j].bytes_used;
            else
                empties++;
            max = max > b->pieces[j].bytes_used ? max : b->pieces[j].bytes_used;
            sum += b->pieces[j].bytes_used;
            total++;
        }
        //printf("%p - Depth: %u, Entries: %" PRIu64 ", Size: %zu\n", b, b->depth, b->number_of_entries, total_used);
    }
    printf("Total: %u, Min: %u, Max: %u, Sum: %" PRIu64 ", Empties: %u, Avg: %f\n", total, min, max, sum, empties, sum / (float)total);
}
void print_bucket(FILE* fd, hash_bucket_t* b, uint32_t idx) {
    size_t total_used = 0;
    for (size_t i = 0; i < NUMBER_OF_HASH_BUCKET_PIECES; i++) {
        total_used += b->pieces[i].bytes_used;
    }
    fprintf(fd, "\tbucket_%p [label=\"Depth: %u, Entries: %" PRIu64 ", Size: %zu, Index: %u\\l--------\\l", b, b->depth, b->number_of_entries, total_used, idx);
    for (size_t i = 0; i < NUMBER_OF_HASH_BUCKET_PIECES; i++) {
        hash_bucket_piece_t* p = &b->pieces[i];
        uint8_t* buf = p->data;
        uint8_t* end = p->data + p->bytes_used;
        while (buf < end)
        {
            uint64_t k, v;
            varint_decode(&buf, &k);
            varint_decode(&buf, &v);
            print_bits(fd, k, b->depth);
            fprintf(fd, " \\| %4" PRIu64 " = %4" PRIu64 "\\l", k, v);
        }
    }
    fprintf(fd, "\"]\n");
}
void print_dir_graphviz_to_file(FILE* fd, hash_ctx_t* ctx) {
    fprintf(fd, "digraph hash {\n\tnode[shape = record ]; \n");
    fprintf(fd, "\ttable [label=\"Depth: %u, Size: %u\\lPages: %u, Entries: %" PRIu64 "\\l\"]\n", ctx->dir->depth, ctx->dir->number_of_buckets, ctx->dir->directory_pages, ctx->dir->number_of_entries);
    fprintf(fd, "\tbuckets [label=\"");
    for (size_t i = 0; i < ctx->dir->number_of_buckets; i++)
    {
        if (i != 0)
            fprintf(fd, "|");
        fprintf(fd, "<bucket_%zu> %zu - %p ", i, i, ctx->dir->buckets[i]);
        ctx->dir->buckets[i]->seen = false;
    }
    fprintf(fd, "\"]\n");
    for (size_t i = 0; i < ctx->dir->number_of_buckets; i++)
    {
        if (ctx->dir->buckets[i]->seen)
            continue;
        ctx->dir->buckets[i]->seen = true;
        print_bucket(fd, ctx->dir->buckets[i], (uint32_t)i);
    }
    for (size_t i = 0; i < ctx->dir->number_of_buckets; i++) {
        fprintf(fd, "\tbuckets:bucket_%zu -> bucket_%p;\n", i, ctx->dir->buckets[i]);
    }
    fprintf(fd, "\ttable->buckets;\n}\n");
}

void print_dir_graphviz(hash_ctx_t* ctx) {
    print_dir_graphviz_to_file(stdout, ctx);
}
void write_dir_graphviz(hash_ctx_t* ctx, const char* prefix) {
    static int counter = 0;
    char buffer[256];
    memset(buffer, 0, sizeof buffer);
    counter++;
    sprintf_s(buffer, sizeof buffer, "%s-%i.txt", prefix, counter);
    FILE* f = NULL;
    errno_t err = fopen_s(&f, buffer, "w");
    if (err)
    {
        printf("Failed to open %s - %i\n", buffer, err);
        return;
    }
    print_dir_graphviz_to_file(f, ctx);
    if (f != NULL)
        fclose(f);
    char* copy = _strdup(buffer);
    memset(buffer, 0, sizeof buffer);
    sprintf_s(buffer, sizeof buffer, "C:\\Users\\ayende\\Downloads\\graphviz-2.38\\release\\bin\\dot.exe -Tsvg %s > %s.svg", copy, copy);
    system(buffer);
    memset(buffer, 0, sizeof buffer);
    sprintf_s(buffer, sizeof buffer, "%s.svg", copy);
    system(buffer);
    free(copy);
}
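
// Directory lookup: the low `depth` bits of the key select a directory slot.
// Several slots may point at the same bucket when the bucket's local depth is
// smaller than the directory's global depth, the standard extendible hashing
// scheme.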
uint32_t hash_table_bucket_number(hash_ctx_t* ctx, uint64_t h) {
    return (uint32_t)(h & (((uint64_t)1 << ctx->dir->depth) - 1));
}

size_t hash_table_get_directory_capacity(hash_ctx_t* ctx) {
    // how many bucket pointers fit in the currently allocated directory pages
    return ((ctx->dir->directory_pages * HASH_BUCKET_PAGE_SIZE) - sizeof(hash_directory_t)) / sizeof(hash_bucket_t*);
}
hash_bucket_t* create_hash_bucket(hash_ctx_t* ctx) {
    hash_bucket_t* b = ctx->allocate_page(1);
    if (b == NULL)
        return NULL;
    memset(b, 0, sizeof(hash_bucket_t));
    b->depth = ctx->dir->depth;
    return b;
}
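
// Lookup: pick the bucket from the directory, pick a piece inside the bucket
// by key % 127, then scan that piece's varint-encoded key/value pairs
// linearly. A piece is at most 63 bytes, so the scan is short.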
bool hash_table_get(hash_ctx_t* ctx, uint64_t key, uint64_t* value) {
    uint32_t bucket_idx = hash_table_bucket_number(ctx, key);
    hash_bucket_t* b = ctx->dir->buckets[bucket_idx];
    uint32_t piece_idx = (uint32_t)(key % NUMBER_OF_HASH_BUCKET_PIECES);
    hash_bucket_piece_t* p = &b->pieces[piece_idx];
    uint8_t* buf = p->data;
    uint8_t* end = p->data + p->bytes_used;
    while (buf < end)
    {
        uint64_t k, v;
        varint_decode(&buf, &k);
        varint_decode(&buf, &v);
        if (k == key) {
            *value = v;
            return true;
        }
    }
    return false;
}
bool hash_table_piece_append_kv(hash_bucket_t* cur, uint64_t key, uint64_t value) {
    hash_bucket_piece_t* p = &cur->pieces[key % NUMBER_OF_HASH_BUCKET_PIECES];
    uint8_t buffer[20]; // two varints, up to 10 bytes each
    uint8_t* write_buf = buffer;
    varint_encode(key, &write_buf);
    varint_encode(value, &write_buf);
    uint8_t size = (uint8_t)(write_buf - buffer);
    if (size + p->bytes_used > PIECE_BUCKET_BUFFER_SIZE) {
        return false;
    }
    memcpy(p->data + p->bytes_used, buffer, size);
    p->bytes_used += size;
    cur->number_of_entries++;
    return true;
}
void validate_bucket(hash_ctx_t* ctx, hash_bucket_t* tmp) {
#if VALIDATE
    uint64_t mask = ((uint64_t)1 << tmp->depth) - 1;
    uint64_t first = 0;
    bool has_first = false;
    for (size_t i = 0; i < NUMBER_OF_HASH_BUCKET_PIECES; i++)
    {
        uint8_t* buf = tmp->pieces[i].data;
        uint8_t* end = buf + tmp->pieces[i].bytes_used;
        while (buf < end)
        {
            uint64_t k, v;
            varint_decode(&buf, &k);
            varint_decode(&buf, &v);
            if (has_first == false)
            {
                first = k;
                has_first = true;
            }
            // every key in a bucket must agree on the low `depth` bits
            if ((k & mask) != (first & mask)) {
                write_dir_graphviz(ctx, "problem");
                printf("Problem\n");
            }
        }
    }
#endif
}
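
// Insert/update. The flow is:
//   1. Try to update the value in place (or remove the old entry if the new
//      encoding has a different size).
//   2. If the piece has room, append the new entry and return.
//   3. Otherwise split: if the bucket's depth already equals the directory's
//      depth, double the directory first; then redistribute the bucket's
//      entries between the old bucket and a new one based on the next bit of
//      the key, and retry the insert.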
bool hash_table_put(hash_ctx_t* ctx, uint64_t key, uint64_t value) {
    uint32_t bucket_idx = hash_table_bucket_number(ctx, key);
    hash_bucket_t* b = ctx->dir->buckets[bucket_idx];
    uint32_t piece_idx = (uint32_t)(key % NUMBER_OF_HASH_BUCKET_PIECES);
    hash_bucket_piece_t* p = &b->pieces[piece_idx];

    uint8_t tmp_buffer[20]; // each varint can take up to 10 bytes
    uint8_t* buf_end = tmp_buffer;
    varint_encode(key, &buf_end);
    varint_encode(value, &buf_end);
    ptrdiff_t encoded_size = buf_end - tmp_buffer;

    uint8_t* buf = p->data;
    uint8_t* end = p->data + p->bytes_used;
    while (buf < end)
    {
        uint64_t k, v;
        uint8_t* cur_buf_start = buf;
        varint_decode(&buf, &k);
        varint_decode(&buf, &v);
        if (k == key) {
            if (v == value)
                return true; // nothing to do, the value is already there
            ptrdiff_t diff = buf - cur_buf_start;
            if (diff == encoded_size) {
                // the new value fits exactly where the old one was, overwrite it
                memcpy(cur_buf_start, tmp_buffer, encoded_size);
                validate_bucket(ctx, b);
                return true;
            }
            // sizes differ: remove the old entry, then fall through to append
            memmove(cur_buf_start, buf, end - buf);
            p->bytes_used -= (uint8_t)diff;
            b->number_of_entries--;
            ctx->dir->number_of_entries--;
            break;
        }
    }
    if (p->bytes_used + encoded_size <= PIECE_BUCKET_BUFFER_SIZE) {
        memcpy(p->data + p->bytes_used, tmp_buffer, encoded_size);
        p->bytes_used += (uint8_t)encoded_size;
        b->number_of_entries++;
        ctx->dir->number_of_entries++;
        validate_bucket(ctx, b);
        return true; // there was room in the piece, done
    }
    // there is no room here, need to expand
    if (ctx->dir->depth == b->depth) {
        hash_directory_t* new_dir;
        // note: the capacity is measured in bucket pointers, not bytes
        if (ctx->dir->number_of_buckets * 2 > hash_table_get_directory_capacity(ctx)) {
            // have to increase the actual allocated memory here
            new_dir = ctx->allocate_page(ctx->dir->directory_pages * 2);
            if (new_dir == NULL)
                return false;
            size_t dir_size = ctx->dir->directory_pages * HASH_BUCKET_PAGE_SIZE;
            memcpy(new_dir, ctx->dir, dir_size);
            new_dir->directory_pages *= 2;
        }
        else {
            new_dir = ctx->dir; // there is enough space to double without allocating
        }
        // the second half of the doubled directory starts as a mirror of the first
        size_t buckets_size = ctx->dir->number_of_buckets * sizeof(hash_bucket_t*);
        memcpy((uint8_t*)new_dir->buckets + buckets_size, (uint8_t*)ctx->dir->buckets, buckets_size);
        new_dir->depth++;
        new_dir->number_of_buckets *= 2;
        if (new_dir != ctx->dir) {
            ctx->release_page(ctx->dir);
            ctx->dir = new_dir;
        }
    }
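    // For example: at depth 2 the directory has 4 entries, indexed by the low
    // 2 bits of the key. After doubling, depth is 3 and entries 4..7 point at
    // the same buckets as entries 0..3, until a split separates them.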
    //write_dir_graphviz(ctx, "BEFORE");
    hash_bucket_t* n = create_hash_bucket(ctx);
    if (!n)
        return false;
    hash_bucket_t* tmp = ctx->allocate_page(1);
    if (!tmp) {
        ctx->release_page(n);
        // no need to release the ctx->dir we allocated, it was wired
        // properly to the table and will be freed with the whole table
        return false;
    }
    memcpy(tmp, b, HASH_BUCKET_PAGE_SIZE);
    memset(b, 0, sizeof(hash_bucket_t));
    memset(n, 0, sizeof(hash_bucket_t));
    n->depth = b->depth = tmp->depth + 1;
    uint32_t bit = 1 << tmp->depth; // the bit that distinguishes b from n
#if VALIDATE
    uint64_t mask = (bit >> 1) - 1;
    bool has_first = false;
    uint64_t first_key = 0;
#endif
    for (size_t i = 0; i < NUMBER_OF_HASH_BUCKET_PIECES; i++)
    {
        uint8_t* buf = tmp->pieces[i].data;
        uint8_t* end = buf + tmp->pieces[i].bytes_used;
        while (buf < end)
        {
            uint64_t k, v;
            varint_decode(&buf, &k);
            varint_decode(&buf, &v);
#if VALIDATE
            if (!has_first) {
                first_key = k;
                has_first = true;
            }
            if ((first_key & mask) != (k & mask)) {
                printf("mismatch!: %" PRIu64 " != %" PRIu64 "\n", first_key, k);
            }
#endif
            hash_bucket_t* cur = k & bit ? n : b;
            bool success = hash_table_piece_append_kv(cur, k, v);
#if VALIDATE
            if (!success)
                printf("Can't split a page properly? Impossible\n");
#endif
        }
    }
    ctx->release_page(tmp);
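    // Every directory slot that used to point at the old bucket shares the
    // old bucket's index pattern in its low bits; walk those slots and point
    // each at b or n depending on the newly significant bit.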
    for (size_t i = key & (bit - 1); i < ctx->dir->number_of_buckets; i += bit)
    {
        ctx->dir->buckets[i] = i & bit ? n : b;
    }
    // now we can add the new value in...
    {
        hash_bucket_t* cur = key & bit ? n : b;
        if (hash_table_piece_append_kv(cur, key, value)) {
            ctx->dir->number_of_entries++;
        }
        else {
            // the target piece may still be full after the split; retry,
            // which may split again
            if (!hash_table_put(ctx, key, value)) {
#if VALIDATE
                printf("Unable to recursively add? That really should never happen.\n");
#endif
                return false;
            }
        }
    }
    validate_bucket(ctx, n);
    validate_bucket(ctx, b);
    return true;
}
bool hash_table_init(hash_ctx_t* ctx) {
    ctx->dir = ctx->allocate_page(1);
    if (ctx->dir == NULL)
        return false;
    memset(ctx->dir, 0, HASH_BUCKET_PAGE_SIZE);
    ctx->dir->number_of_entries = 0;
    ctx->dir->number_of_buckets = 2;
    ctx->dir->directory_pages = 1;
    ctx->dir->depth = 1;
    ctx->dir->buckets[0] = create_hash_bucket(ctx);
    ctx->dir->buckets[1] = create_hash_bucket(ctx);
    if (!ctx->dir->buckets[0] || !ctx->dir->buckets[1]) {
        ctx->release_page(ctx->dir->buckets[0]);
        ctx->release_page(ctx->dir->buckets[1]);
        ctx->release_page(ctx->dir);
        return false;
    }
    return true;
}
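
// Page allocator used by the test: allocate_page(n) must return n contiguous
// pages of HASH_BUCKET_PAGE_SIZE bytes, aligned to the page size, and
// release_page must accept NULL.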
int allocations = 0;

void* allocate_8k_page(uint32_t n) { // pages are HASH_BUCKET_PAGE_SIZE (8KB), not 4KB
    allocations++;
    return _aligned_malloc(HASH_BUCKET_PAGE_SIZE * n, HASH_BUCKET_PAGE_SIZE);
}

void release_8k_page(void* p) {
    if (!p)
        return;
    allocations--;
    _aligned_free(p);
}
int main()
{
    srand(19324);
    uint32_t const size = 1000000;
    uint64_t* keys = malloc(size * sizeof(uint64_t));
    uint64_t* values = malloc(size * sizeof(uint64_t));
    if (!keys || !values)
        return -1;
    for (size_t i = 0; i < size; i++)
    {
        keys[i] = i; // wygrand();
        values[i] = i + 100; // wygrand();
    }
    struct hash_ctx ctx = { allocate_8k_page, release_8k_page };
    if (!hash_table_init(&ctx)) {
        printf("Failed to init\n");
        return -1;
    }
    uint64_t first_key = keys[0];
    uint64_t first_value = values[0];
    for (size_t i = 0; i < size; i++)
    {
        uint64_t v;
        if (!hash_table_put(&ctx, keys[i], values[i])) {
            printf("Failed to put %zu\n", i);
            write_dir_graphviz(&ctx, "PUT-ERR");
            return -1;
        }
        if (keys[i] == first_key)
            first_value = values[i];
        // sanity check on every iteration: the first key must stay reachable
        if (!hash_table_get(&ctx, first_key, &v) || v != first_value) {
            printf("Failed to get 0 on %zu\n", i);
            write_dir_graphviz(&ctx, "PUT-ERR");
        }
    }
    //write_dir_graphviz(&ctx, "FINAL");
    for (size_t x = 0; x < size; x++)
    {
        uint64_t val;
        if (!hash_table_get(&ctx, keys[x], &val)) {
            printf("Failed to get %zu\n", x);
            write_dir_graphviz(&ctx, "GET-ERR");
            return -1;
        }
        if (val != values[x]) {
            printf("mismatch on %zu - %" PRIu64 " - %" PRIu64 ", %" PRIu64 " \n", x, keys[x], val, values[x]);
        }
    }
    print_hash_stats(&ctx);
    printf("%f MB\n", ((double)allocations * (double)HASH_BUCKET_PAGE_SIZE) / 1024.0 / 1024.0);
    return 0;
}