Skip to content

Instantly share code, notes, and snippets.

@skeeto
Last active August 11, 2018 02:08
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save skeeto/2995079c02b8839d5a45108f25d632bc to your computer and use it in GitHub Desktop.
Save skeeto/2995079c02b8839d5a45108f25d632bc to your computer and use it in GitHub Desktop.
MurmurHash3 vs. hashCode
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
unsigned
hashCode(const unsigned short *s, size_t len)
{
unsigned hash = 0;
for (size_t i = 0; i < len; i++)
hash = 31 * hash + s[i];
return hash;
}
uint32_t
murmurhash3(const void *key, uint32_t len, uint32_t seed)
{
uint32_t hash = seed;
uint32_t nblocks = len / 4;
const uint32_t *blocks = (const uint32_t *)key;
for (uint32_t i = 0; i < nblocks; i++) {
uint32_t k = blocks[i];
k *= UINT32_C(0xcc9e2d51);
k = (k << 15) | (k >> 17);
k *= UINT32_C(0x1b873593);
hash ^= k;
hash = ((hash << 13) | (hash >> 19)) * 5 + UINT32_C(0xe6546b64);
}
const uint8_t *tail = (const uint8_t *)key + nblocks * 4;
uint32_t k1 = 0;
switch (len & UINT32_C(3)) {
case 3: k1 ^= tail[2] << 16; /* FALLTHROUGH */
case 2: k1 ^= tail[1] << 8; /* FALLTHROUGH */
case 1: k1 ^= tail[0];
k1 *= UINT32_C(0xcc9e2d51);
k1 = (k1 << 15) | (k1 >> 17);
k1 *= UINT32_C(0x1b873593);
hash ^= k1;
}
hash ^= len;
hash ^= hash >> 16;
hash *= UINT32_C(0x85ebca6b);
hash ^= hash >> 13;
hash *= UINT32_C(0xc2b2ae35);
hash ^= hash >> 16;
return hash;
}
unsigned
dumb(unsigned char *key, size_t len)
{
unsigned hash = 0;
for (size_t i = 0; i < len; i++) {
hash += key[i];
hash ^= hash >> 16;
hash *= 0xa871304d;
hash ^= hash >> 16;
}
return hash;
}
unsigned char *
utf8_decode(unsigned char *s, long *c)
{
unsigned char *next;
if (s[0] < 0x80) {
*c = s[0];
next = s + 1;
} else if ((s[0] & 0xe0) == 0xc0) {
*c = ((long)(s[0] & 0x1f) << 6) |
((long)(s[1] & 0x3f) << 0);
next = s + 2;
} else if ((s[0] & 0xf0) == 0xe0) {
*c = ((long)(s[0] & 0x0f) << 12) |
((long)(s[1] & 0x3f) << 6) |
((long)(s[2] & 0x3f) << 0);
next = s + 3;
} else if ((s[0] & 0xf8) == 0xf0 && (s[0] <= 0xf4)) {
*c = ((long)(s[0] & 0x07) << 18) |
((long)(s[1] & 0x3f) << 12) |
((long)(s[2] & 0x3f) << 6) |
((long)(s[3] & 0x3f) << 0);
next = s + 4;
} else {
*c = -1; // invalid
next = s + 1; // skip this byte
}
if (*c >= 0xd800 && *c <= 0xdfff)
*c = -1; // surrogate half
return next;
}
int
main(void)
{
char word[256];
long collisions_hashcode = 0;
long collisions_murmurhash3 = 0;
long collisions_dumb = 0;
unsigned long long *seen_hashcode, *seen_murmurhash3, *seen_dumb;
seen_hashcode = calloc(1ULL << 26, sizeof(unsigned long long));
seen_murmurhash3 = calloc(1ULL << 26, sizeof(unsigned long long));
seen_dumb = calloc(1ULL << 26, sizeof(unsigned long long));
while (fgets(word, sizeof(word), stdin)) {
/* Convert to UTF-16 */
long c;
int len = 0;
unsigned short string[256];
unsigned char *p = (unsigned char *)word;
for (;;) {
p = utf8_decode(p, &c);
if (c == -1 || c == '\n' || c == 0)
break;
string[len++] = c;
}
/* Hash as UTF-16 */
unsigned hash_hc = hashCode(string, len);
if ((seen_hashcode[hash_hc / 64] >> (hash_hc % 64)) & 1) {
collisions_hashcode++;
} else {
seen_hashcode[hash_hc / 64] |= 1ULL << (hash_hc % 64);
}
/* Hash as UTF-8 */
unsigned hash_mh3 = murmurhash3(word, (char *)p - word, 0);
if ((seen_murmurhash3[hash_mh3 / 64] >> (hash_mh3 % 64)) & 1) {
collisions_murmurhash3++;
} else {
seen_murmurhash3[hash_mh3 / 64] |= 1ULL << (hash_mh3 % 64);
}
/* Hash as UTF-8 */
unsigned hash_dumb = dumb((unsigned char *)word, (char *)p - word);
if ((seen_dumb[hash_dumb / 64] >> (hash_dumb % 64)) & 1) {
collisions_dumb++;
} else {
seen_dumb[hash_dumb / 64] |= 1ULL << (hash_dumb % 64);
}
}
printf("hashCode %ld\n", collisions_hashcode);
printf("MurmurHash3 %ld\n", collisions_murmurhash3);
printf("dumb %ld\n", collisions_dumb);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment