Last active
November 18, 2020 15:28
-
-
Save vurtun/6d793c8cf097dfaaffaae02ded14b2da to your computer and use it in GitHub Desktop.
hyperlog
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#define HL_HASH32_COUNT 4 | |
#define HL_SPACE(NUM_REG_BITS) ((1 << NUM_REG_BITS) + 1) | |
static int | |
ilog2(int n) { | |
// clang-format off | |
#define lt(n) n,n,n,n, n,n,n,n, n,n,n,n ,n,n,n,n | |
static const char tbl[256] = {0,0,1,1,2,2,2,2,3,3,3,3, | |
3,3,3,3,lt(4),lt(5),lt(5),lt(6),lt(6),lt(6),lt(6), | |
lt(7),lt(7),lt(7),lt(7),lt(7),lt(7),lt(7),lt(7) | |
}; int tt, t; | |
if ((tt = (n >> 16))) | |
return (t = (tt >> 8)) ? 24+tbl[t]: 16+tbl[tt]; | |
else return (t = (n >> 8)) ? 8+tbl[t]: tbl[n]; | |
#undef lt | |
// clang-format on | |
} | |
static void | |
hl_init(unsigned char *log, size_t size) { | |
memset(log, 0, size); | |
log[0] = ilog2(size - 1); | |
} | |
static void | |
hl_add(unsigned char *logc, unsigned *hash4) { | |
unsigned char *log = logc + 1; | |
size_t size = 1u << logc[0]; | |
assert(size > 0); | |
assert(size & (size - 1) == 0u); | |
unsigned reg_idx = hash4[3] & (size - 1); | |
// use the other 96 bits as our "unique number" corresponding to our object. | |
// Note that we don't use the high 32 bits that we've already used for the | |
// register index because that would bias our results. Certain values seen | |
// would ONLY get tracked by specific registers. That's ok though that we are | |
// only using 96 bits because that is still an astronomically large value. | |
// Most HyperLogLog implementations use only 32 bits, while google uses 64 | |
// bits. We are doing even larger with 96 bits. 2^(bits) = how many unique | |
// items you can track. | |
// | |
// 2^32 = 4 billion <--- what most people use | |
// 2^64 = 18 billion billion <--- what google uses | |
// 2^96 = 79 billion billion billion <--- what we are using | |
// 2^128 = 340 billion billion billion billion <--- way huger than anyone | |
// needs. Beyond astronomical. | |
// Figure out where the highest 1 bit is | |
unsigned long bit_set = 0; | |
if (_BitScanForward(&bit_set, hash4[0])) { | |
bit_set += 1; | |
} else if (_BitScanForward(&bit_set, hash4[1])) { | |
bit_set += 32 + 1; | |
} else if (_BitScanForward(&bit_set, hash4[2])) { | |
bit_set += 64 + 1; | |
} | |
// store the highest seen value for that register | |
assert(bit_set < 256); | |
unsigned char val = (unsigned char)bit_set; | |
if (log[reg_idx] < val) { | |
log[reg_idx] = val; | |
} | |
} | |
static unsigned | |
hl_empty_reg(const unsigned char *log) { | |
unsigned ret = 0; | |
size_t size = 1u << log[0]; | |
for (size_t i = 0u; i < size; ++i) { | |
if (!log[i + 1]) { | |
ret++; | |
} | |
} | |
return ret; | |
} | |
static float | |
hl_cnt(const unsigned char *logc) { | |
size_t size = 1u << logc[0]; | |
unsigned char *log = log_con + 1; | |
// calculate sum | |
float sum = 0.0f; | |
for (size_t i = 0u; i < size; ++i) { | |
sum += pow(2.0f, -(float)log[i]); | |
} | |
// calculate dv estimate | |
const float c_alpha = 0.7213f / (1.0f + 1.079f / size); | |
float dv_est = c_alpha * ((float)size / sum) * (float)size; | |
// small range correction | |
if (dv_est < 5.0f / 2.0f * (float)size) { | |
// if no empty registers, use the estimate we already have | |
unsigned empty_reg = hl_empty_reg(log, size); | |
if (empty_reg == 0) { | |
return dv_est; | |
} | |
// balls and bins correction | |
return (float)size * log((float)size / (float)empty_reg); | |
} | |
// large range correction | |
if (dv_est > 143165576.533f) { // 2^32 / 30 | |
return -pow(2.0f, 32.0f) * log(1.0f - dv_est / pow(2.0f, 32.0f)); | |
} | |
return dv_est; | |
} | |
static void | |
hl_union(unsigned char *ab_union, const unsigned char *a, | |
const unsigned char *b) { | |
// To make a union between two hyperloglog objects that have the same number | |
// of registers and use the same hash, just take the maximum of each register | |
// between the two objects. This operation is "lossless" in that you end up | |
// with the same registers as if you actually had another object that tracked | |
// the items each individual object tracked. | |
assert(a[0] == b[0]); | |
size_t size = 1u << a[0]; | |
for (size_t i = 0u; i < size; ++i) { | |
ab_union[i + 1] = max(a[i + 1], b[i + 1]); | |
} | |
ab_union[0] = a[0]; | |
} | |
static float | |
hl_cnt_union(const unsigned char *a, const unsigned char *b) { | |
assert(a[0] == b[0]); | |
unsigned char ab_union[HL_SPACE(a[0])]; // VLA :( | |
hl_union(ab_union, a, b); | |
return hl_cnt(ab_union); | |
} | |
static float | |
hl_intersect_cnt(float a_cnt, float b_cnt, float ab_union_cnt) { | |
// We have to use the inclusion-exclusion principle to get an intersection | |
// estimate count(Intersection(A,B)) = (count(A) + count(B)) - | |
// count(Union(A,B)) | |
// http://en.wikipedia.org/wiki/Inclusion%E2%80%93exclusion_principle | |
return a_cnt + b_cnt - ab_union_cnt; | |
} | |
static float | |
hl_cnt_intersect(const unsigned char *a, const unsigned char *b) { | |
assert(a[0] == b[0]); | |
unsigned char ab_union[HL_SPACE(a[0])]; // VLA :( | |
hl_union(ab_union, a, b); | |
float a_cnt = hl_cnt(a); | |
float b_cnt = hl_cnt(b); | |
float ab_union_cnt = hl_cnt(ab_union); | |
return hl_intersect_cnt(a_cnt, b_cnt, ab_union_cnt); | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
inline uint32_t | |
getblock32 (const uint32_t * p, int i){ | |
return p[i]; | |
} | |
inline uint32_t | |
fmix32 (uint32_t h) { | |
h ^= h >> 16; | |
h *= 0x85ebca6b; | |
h ^= h >> 13; | |
h *= 0xc2b2ae35; | |
h ^= h >> 16; | |
return h; | |
} | |
static void | |
MurmurHash3_x86_128 (const void * key, const int len, unsigned seed, unsigned *out){ | |
#define ROTL32(x,y) _rotl(x,y) | |
const unsigned char *data = (const unsigned char*)key; | |
const int nblocks = len / 16; | |
unsigned h1 = seed; | |
unsigned h2 = seed; | |
unsigned h3 = seed; | |
unsigned h4 = seed; | |
const unsigned c1 = 0x239b961b; | |
const unsigned c2 = 0xab0e9789; | |
const unsigned c3 = 0x38b34ae5; | |
const unsigned c4 = 0xa1e38b93; | |
// body | |
const unsigned * blocks = (const unsigned *)(data + nblocks * 16); | |
for (int i = -nblocks; i; i++) { | |
unsigned k1 = getblock32(blocks, i * 4 + 0); | |
unsigned k2 = getblock32(blocks, i * 4 + 1); | |
unsigned k3 = getblock32(blocks, i * 4 + 2); | |
unsigned k4 = getblock32(blocks, i * 4 + 3); | |
k1 *= c1; k1 = ROTL32(k1, 15); k1 *= c2; h1 ^= k1; | |
h1 = ROTL32(h1, 19); h1 += h2; h1 = h1 * 5 + 0x561ccd1b; | |
k2 *= c2; k2 = ROTL32(k2, 16); k2 *= c3; h2 ^= k2; | |
h2 = ROTL32(h2, 17); h2 += h3; h2 = h2 * 5 + 0x0bcaa747; | |
k3 *= c3; k3 = ROTL32(k3, 17); k3 *= c4; h3 ^= k3; | |
h3 = ROTL32(h3, 15); h3 += h4; h3 = h3 * 5 + 0x96cd1c35; | |
k4 *= c4; k4 = ROTL32(k4, 18); k4 *= c1; h4 ^= k4; | |
h4 = ROTL32(h4, 13); h4 += h1; h4 = h4 * 5 + 0x32ac3b17; | |
} | |
// tail | |
const unsigned char * tail = (const unsigned char*)(data + nblocks * 16); | |
unsigned k1 = 0, k2 = 0, k3 = 0, k4 = 0; | |
switch (len & 15) { | |
case 15: k4 ^= tail[14] << 16; | |
case 14: k4 ^= tail[13] << 8; | |
case 13: k4 ^= tail[12] << 0; | |
k4 *= c4; k4 = ROTL32(k4, 18); k4 *= c1; h4 ^= k4; | |
case 12: k3 ^= tail[11] << 24; | |
case 11: k3 ^= tail[10] << 16; | |
case 10: k3 ^= tail[9] << 8; | |
case 9: k3 ^= tail[8] << 0; | |
k3 *= c3; k3 = ROTL32(k3, 17); k3 *= c4; h3 ^= k3; | |
case 8: k2 ^= tail[7] << 24; | |
case 7: k2 ^= tail[6] << 16; | |
case 6: k2 ^= tail[5] << 8; | |
case 5: k2 ^= tail[4] << 0; | |
k2 *= c2; k2 = ROTL32(k2, 16); k2 *= c3; h2 ^= k2; | |
case 4: k1 ^= tail[3] << 24; | |
case 3: k1 ^= tail[2] << 16; | |
case 2: k1 ^= tail[1] << 8; | |
case 1: k1 ^= tail[0] << 0; | |
k1 *= c1; k1 = ROTL32(k1, 15); k1 *= c2; h1 ^= k1; | |
}; | |
// finalization | |
h1 ^= len; h2 ^= len; h3 ^= len; h4 ^= len; | |
h1 += h2; h1 += h3; h1 += h4; | |
h2 += h1; h3 += h1; h4 += h1; | |
h1 = fmix32(h1); | |
h2 = fmix32(h2); | |
h3 = fmix32(h3); | |
h4 = fmix32(h4); | |
h1 += h2; h1 += h3; h1 += h4; | |
h2 += h1; h3 += h1; h4 += h1; | |
out[0] = h1; | |
out[1] = h2; | |
out[2] = h3; | |
out[3] = h4; | |
} | |
static void | |
hash_bytes(unsigned* ret4, const void *key, size_t len) { | |
static const size_t c_minLen = 16; | |
static const unsigned c_seed = 0x2e715b3d; | |
if (len < c_minLen) { | |
unsigned char buf[c_minLen]; | |
for (size_t i = 0; i < len; ++i) | |
buf[i] = ((unsigned char*)key)[i]; | |
for (size_t i = len; i < c_minLen; ++i) | |
buf[i] = buf[i%len] + i; | |
MurmurHash3_x86_128(buf, c_minLen, c_seed, ret); | |
} else MurmurHash3_x86_128(key, len, c_seed, ret); | |
} | |
static void | |
hl_add_murmur3(unsigned char* hl, const void *key, size_t len) { | |
unsigned hash[HL_HASH32_COUNT]; | |
hash_bytes(hash, key, len); | |
hl_add(hl, hash); | |
} | |
int | |
main(void) { | |
unsigned char log[HL_SPACE(10)]; | |
hl_init(log, sizeof(log)); | |
hl_add_murmur3(log, ...); | |
hl_add_murmur3(log, ...); | |
hl_add_murmur3(log, ...); | |
hl_add_murmur3(log, ...); | |
hl_add_murmur3(log, ...); | |
float cnt = hl_cnt(log); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment