Created
February 29, 2024 07:28
-
-
Save m1lkweed/66a2aea3f32c35b1f5c20af1c4bf4e6e to your computer and use it in GitHub Desktop.
HyperLogLog in C
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// (c)m1lkweed 2024 GPLv3+ | |
#include <stdio.h> | |
#include <stdint.h> | |
#include <stddef.h> | |
#include <stdlib.h> // for rand() | |
uint64_t hash(void *in_data, size_t len){ | |
char *data = in_data; | |
uint64_t h = 0x100; | |
for(size_t i = 0; i < len; ++i){ | |
h ^= data[i]; | |
h *= 1111111111111111111u; | |
} | |
return h; | |
} | |
// Approximately returns the number of unique entries in O(n) time | |
size_t hyperloglog(size_t nelems, size_t elem_size, void *data, uint64_t(*hash)(void*, size_t)){ | |
// We want to note the longest run of trailing zeros but we don't want an unlucky | |
// hash to inflate our guess, so we spread the value into one of 16 buckets and get | |
// the harmonic mean. | |
uint8_t buckets[16] = {}; | |
for(size_t i = 0; i < nelems; i += elem_size){ | |
uint64_t res = hash(data + i, elem_size); | |
uint8_t ctz = __builtin_ctz(res); | |
if(buckets[(res >> 60)] < ctz) | |
buckets[(res >> 60)] = ctz; // Use first 4 bits as index | |
} | |
// Harmonic mean is the recip. of all elems divided by # of elems (16 here), | |
// probability of hash is 1/(2**n); simplifies to (2**n)/16 | |
float t[16] = { | |
(1 << buckets[0x0]), (1 << buckets[0x1]), (1 << buckets[0x2]), (1 << buckets[0x3]), | |
(1 << buckets[0x4]), (1 << buckets[0x5]), (1 << buckets[0x6]), (1 << buckets[0x7]), | |
(1 << buckets[0x8]), (1 << buckets[0x9]), (1 << buckets[0xA]), (1 << buckets[0xB]), | |
(1 << buckets[0xC]), (1 << buckets[0xD]), (1 << buckets[0xE]), (1 << buckets[0xF]) | |
}; | |
return ( | |
t[0x0] + t[0x1] + t[0x2] + t[0x3] + t[0x4] + t[0x5] + t[0x6] + t[0x7] + | |
t[0x8] + t[0x9] + t[0xA] + t[0xB] + t[0xC] + t[0xD] + t[0xE] + t[0xF] | |
) / 16; | |
} | |
int main(){ | |
float data[1000000] = {}; | |
for(int i = 0; i < 1000000; ++i) | |
data[i] = rand() % 1000000; // I beg you, do not use rand() | |
printf("%zu / 1000000\n", hyperloglog(1000000, sizeof(float), data, hash)); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment