Last active
August 11, 2018 02:08
-
-
Save skeeto/2995079c02b8839d5a45108f25d632bc to your computer and use it in GitHub Desktop.
MurmurHash3 vs. hashCode
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
#include <stdlib.h> | |
#include <stdint.h> | |
unsigned | |
hashCode(const unsigned short *s, size_t len) | |
{ | |
unsigned hash = 0; | |
for (size_t i = 0; i < len; i++) | |
hash = 31 * hash + s[i]; | |
return hash; | |
} | |
uint32_t | |
murmurhash3(const void *key, uint32_t len, uint32_t seed) | |
{ | |
uint32_t hash = seed; | |
uint32_t nblocks = len / 4; | |
const uint32_t *blocks = (const uint32_t *)key; | |
for (uint32_t i = 0; i < nblocks; i++) { | |
uint32_t k = blocks[i]; | |
k *= UINT32_C(0xcc9e2d51); | |
k = (k << 15) | (k >> 17); | |
k *= UINT32_C(0x1b873593); | |
hash ^= k; | |
hash = ((hash << 13) | (hash >> 19)) * 5 + UINT32_C(0xe6546b64); | |
} | |
const uint8_t *tail = (const uint8_t *)key + nblocks * 4; | |
uint32_t k1 = 0; | |
switch (len & UINT32_C(3)) { | |
case 3: k1 ^= tail[2] << 16; /* FALLTHROUGH */ | |
case 2: k1 ^= tail[1] << 8; /* FALLTHROUGH */ | |
case 1: k1 ^= tail[0]; | |
k1 *= UINT32_C(0xcc9e2d51); | |
k1 = (k1 << 15) | (k1 >> 17); | |
k1 *= UINT32_C(0x1b873593); | |
hash ^= k1; | |
} | |
hash ^= len; | |
hash ^= hash >> 16; | |
hash *= UINT32_C(0x85ebca6b); | |
hash ^= hash >> 13; | |
hash *= UINT32_C(0xc2b2ae35); | |
hash ^= hash >> 16; | |
return hash; | |
} | |
unsigned | |
dumb(unsigned char *key, size_t len) | |
{ | |
unsigned hash = 0; | |
for (size_t i = 0; i < len; i++) { | |
hash += key[i]; | |
hash ^= hash >> 16; | |
hash *= 0xa871304d; | |
hash ^= hash >> 16; | |
} | |
return hash; | |
} | |
unsigned char * | |
utf8_decode(unsigned char *s, long *c) | |
{ | |
unsigned char *next; | |
if (s[0] < 0x80) { | |
*c = s[0]; | |
next = s + 1; | |
} else if ((s[0] & 0xe0) == 0xc0) { | |
*c = ((long)(s[0] & 0x1f) << 6) | | |
((long)(s[1] & 0x3f) << 0); | |
next = s + 2; | |
} else if ((s[0] & 0xf0) == 0xe0) { | |
*c = ((long)(s[0] & 0x0f) << 12) | | |
((long)(s[1] & 0x3f) << 6) | | |
((long)(s[2] & 0x3f) << 0); | |
next = s + 3; | |
} else if ((s[0] & 0xf8) == 0xf0 && (s[0] <= 0xf4)) { | |
*c = ((long)(s[0] & 0x07) << 18) | | |
((long)(s[1] & 0x3f) << 12) | | |
((long)(s[2] & 0x3f) << 6) | | |
((long)(s[3] & 0x3f) << 0); | |
next = s + 4; | |
} else { | |
*c = -1; // invalid | |
next = s + 1; // skip this byte | |
} | |
if (*c >= 0xd800 && *c <= 0xdfff) | |
*c = -1; // surrogate half | |
return next; | |
} | |
int | |
main(void) | |
{ | |
char word[256]; | |
long collisions_hashcode = 0; | |
long collisions_murmurhash3 = 0; | |
long collisions_dumb = 0; | |
unsigned long long *seen_hashcode, *seen_murmurhash3, *seen_dumb; | |
seen_hashcode = calloc(1ULL << 26, sizeof(unsigned long long)); | |
seen_murmurhash3 = calloc(1ULL << 26, sizeof(unsigned long long)); | |
seen_dumb = calloc(1ULL << 26, sizeof(unsigned long long)); | |
while (fgets(word, sizeof(word), stdin)) { | |
/* Convert to UTF-16 */ | |
long c; | |
int len = 0; | |
unsigned short string[256]; | |
unsigned char *p = (unsigned char *)word; | |
for (;;) { | |
p = utf8_decode(p, &c); | |
if (c == -1 || c == '\n' || c == 0) | |
break; | |
string[len++] = c; | |
} | |
/* Hash as UTF-16 */ | |
unsigned hash_hc = hashCode(string, len); | |
if ((seen_hashcode[hash_hc / 64] >> (hash_hc % 64)) & 1) { | |
collisions_hashcode++; | |
} else { | |
seen_hashcode[hash_hc / 64] |= 1ULL << (hash_hc % 64); | |
} | |
/* Hash as UTF-8 */ | |
unsigned hash_mh3 = murmurhash3(word, (char *)p - word, 0); | |
if ((seen_murmurhash3[hash_mh3 / 64] >> (hash_mh3 % 64)) & 1) { | |
collisions_murmurhash3++; | |
} else { | |
seen_murmurhash3[hash_mh3 / 64] |= 1ULL << (hash_mh3 % 64); | |
} | |
/* Hash as UTF-8 */ | |
unsigned hash_dumb = dumb((unsigned char *)word, (char *)p - word); | |
if ((seen_dumb[hash_dumb / 64] >> (hash_dumb % 64)) & 1) { | |
collisions_dumb++; | |
} else { | |
seen_dumb[hash_dumb / 64] |= 1ULL << (hash_dumb % 64); | |
} | |
} | |
printf("hashCode %ld\n", collisions_hashcode); | |
printf("MurmurHash3 %ld\n", collisions_murmurhash3); | |
printf("dumb %ld\n", collisions_dumb); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment