Skip to content

Instantly share code, notes, and snippets.

@m1lkweed
Created February 29, 2024 07:28
Show Gist options
  • Save m1lkweed/66a2aea3f32c35b1f5c20af1c4bf4e6e to your computer and use it in GitHub Desktop.
Save m1lkweed/66a2aea3f32c35b1f5c20af1c4bf4e6e to your computer and use it in GitHub Desktop.
HyperLogLog in C
// (c)m1lkweed 2024 GPLv3+
#include <stdio.h>
#include <stdint.h>
#include <stddef.h>
#include <stdlib.h> // for rand()
uint64_t hash(void *in_data, size_t len){
char *data = in_data;
uint64_t h = 0x100;
for(size_t i = 0; i < len; ++i){
h ^= data[i];
h *= 1111111111111111111u;
}
return h;
}
// Approximately returns the number of unique entries in O(n) time
size_t hyperloglog(size_t nelems, size_t elem_size, void *data, uint64_t(*hash)(void*, size_t)){
// We want to note the longest run of trailing zeros but we don't want an unlucky
// hash to inflate our guess, so we spread the value into one of 16 buckets and get
// the harmonic mean.
uint8_t buckets[16] = {};
for(size_t i = 0; i < nelems; i += elem_size){
uint64_t res = hash(data + i, elem_size);
uint8_t ctz = __builtin_ctz(res);
if(buckets[(res >> 60)] < ctz)
buckets[(res >> 60)] = ctz; // Use first 4 bits as index
}
// Harmonic mean is the recip. of all elems divided by # of elems (16 here),
// probability of hash is 1/(2**n); simplifies to (2**n)/16
float t[16] = {
(1 << buckets[0x0]), (1 << buckets[0x1]), (1 << buckets[0x2]), (1 << buckets[0x3]),
(1 << buckets[0x4]), (1 << buckets[0x5]), (1 << buckets[0x6]), (1 << buckets[0x7]),
(1 << buckets[0x8]), (1 << buckets[0x9]), (1 << buckets[0xA]), (1 << buckets[0xB]),
(1 << buckets[0xC]), (1 << buckets[0xD]), (1 << buckets[0xE]), (1 << buckets[0xF])
};
return (
t[0x0] + t[0x1] + t[0x2] + t[0x3] + t[0x4] + t[0x5] + t[0x6] + t[0x7] +
t[0x8] + t[0x9] + t[0xA] + t[0xB] + t[0xC] + t[0xD] + t[0xE] + t[0xF]
) / 16;
}
int main(){
float data[1000000] = {};
for(int i = 0; i < 1000000; ++i)
data[i] = rand() % 1000000; // I beg you, do not use rand()
printf("%zu / 1000000\n", hyperloglog(1000000, sizeof(float), data, hash));
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment