skeeto/hash.c

## hash.c
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

unsigned
hashCode(const unsigned short *s, size_t len)
{
    unsigned hash = 0;
    for (size_t i = 0; i < len; i++)
        hash = 31 * hash + s[i];
    return hash;
}

uint32_t
murmurhash3(const void *key, uint32_t len, uint32_t seed)
{
    uint32_t hash = seed;
    uint32_t nblocks = len / 4;
    const uint32_t *blocks = (const uint32_t *)key;
    for (uint32_t i = 0; i < nblocks; i++) {
        uint32_t k = blocks[i];
        k *= UINT32_C(0xcc9e2d51);
        k = (k << 15) | (k >> 17);
        k *= UINT32_C(0x1b873593);
        hash ^= k;
        hash = ((hash << 13) | (hash >> 19)) * 5 + UINT32_C(0xe6546b64);
    }
    const uint8_t *tail = (const uint8_t *)key + nblocks * 4;
    uint32_t k1 = 0;
    switch (len & UINT32_C(3)) {
        case 3: k1 ^= tail[2] << 16; /* FALLTHROUGH */
        case 2: k1 ^= tail[1] << 8;  /* FALLTHROUGH */
        case 1: k1 ^= tail[0];
                k1 *= UINT32_C(0xcc9e2d51);
                k1 = (k1 << 15) | (k1 >> 17);
                k1 *= UINT32_C(0x1b873593);
                hash ^= k1;
    }
    hash ^= len;
    hash ^= hash >> 16;
    hash *= UINT32_C(0x85ebca6b);
    hash ^= hash >> 13;
    hash *= UINT32_C(0xc2b2ae35);
    hash ^= hash >> 16;
    return hash;
}

unsigned
dumb(unsigned char *key, size_t len)
{
    unsigned hash = 0;
    for (size_t i = 0; i < len; i++) {
        hash += key[i];
        hash ^= hash >> 16;
        hash *= 0xa871304d;
        hash ^= hash >> 16;
    }
    return hash;
}

unsigned char *
utf8_decode(unsigned char *s, long *c)
{
    unsigned char *next;
    if (s[0] < 0x80) {
        *c = s[0];
        next = s + 1;
    } else if ((s[0] & 0xe0) == 0xc0) {
        *c = ((long)(s[0] & 0x1f) <<  6) |
             ((long)(s[1] & 0x3f) <<  0);
        next = s + 2;
    } else if ((s[0] & 0xf0) == 0xe0) {
        *c = ((long)(s[0] & 0x0f) << 12) |
             ((long)(s[1] & 0x3f) <<  6) |
             ((long)(s[2] & 0x3f) <<  0);
        next = s + 3;
    } else if ((s[0] & 0xf8) == 0xf0 && (s[0] <= 0xf4)) {
        *c = ((long)(s[0] & 0x07) << 18) |
             ((long)(s[1] & 0x3f) << 12) |
             ((long)(s[2] & 0x3f) <<  6) |
             ((long)(s[3] & 0x3f) <<  0);
        next = s + 4;
    } else {
        *c = -1; // invalid
        next = s + 1; // skip this byte
    }
    if (*c >= 0xd800 && *c <= 0xdfff)
        *c = -1; // surrogate half
    return next;
}


int
main(void)
{
    char word[256];
    long collisions_hashcode = 0;
    long collisions_murmurhash3 = 0;
    long collisions_dumb = 0;
    unsigned long long *seen_hashcode, *seen_murmurhash3, *seen_dumb;
    seen_hashcode    = calloc(1ULL << 26, sizeof(unsigned long long));
    seen_murmurhash3 = calloc(1ULL << 26, sizeof(unsigned long long));
    seen_dumb        = calloc(1ULL << 26, sizeof(unsigned long long));
    while (fgets(word, sizeof(word), stdin)) {
        /* Convert to UTF-16 */
        long c;
        int len = 0;
        unsigned short string[256];
        unsigned char *p = (unsigned char *)word;
        for (;;) {
            p = utf8_decode(p, &c);
            if (c == -1 || c == '\n' || c == 0)
                break;
            string[len++] = c;
        }

        /* Hash as UTF-16 */
        unsigned hash_hc  = hashCode(string, len);
        if ((seen_hashcode[hash_hc / 64] >> (hash_hc % 64)) & 1) {
            collisions_hashcode++;
        } else {
            seen_hashcode[hash_hc / 64] |= 1ULL << (hash_hc % 64);
        }

        /* Hash as UTF-8 */
        unsigned hash_mh3 = murmurhash3(word, (char *)p - word, 0);
        if ((seen_murmurhash3[hash_mh3 / 64] >> (hash_mh3 % 64)) & 1) {
            collisions_murmurhash3++;
        } else {
            seen_murmurhash3[hash_mh3 / 64] |= 1ULL << (hash_mh3 % 64);
        }

        /* Hash as UTF-8 */
        unsigned hash_dumb = dumb((unsigned char *)word, (char *)p - word);
        if ((seen_dumb[hash_dumb / 64] >> (hash_dumb % 64)) & 1) {
            collisions_dumb++;
        } else {
            seen_dumb[hash_dumb / 64] |= 1ULL << (hash_dumb % 64);
        }
    }
    printf("hashCode    %ld\n", collisions_hashcode);
    printf("MurmurHash3 %ld\n", collisions_murmurhash3);
    printf("dumb        %ld\n", collisions_dumb);
}
	#include <stdio.h>
	#include <stdlib.h>
	#include <stdint.h>

	unsigned
	hashCode(const unsigned short *s, size_t len)
	{
	unsigned hash = 0;
	for (size_t i = 0; i < len; i++)
	hash = 31 * hash + s[i];
	return hash;
	}

	uint32_t
	murmurhash3(const void *key, uint32_t len, uint32_t seed)
	{
	uint32_t hash = seed;
	uint32_t nblocks = len / 4;
	const uint32_t blocks = (const uint32_t )key;
	for (uint32_t i = 0; i < nblocks; i++) {
	uint32_t k = blocks[i];
	k *= UINT32_C(0xcc9e2d51);
	k = (k << 15) \| (k >> 17);
	k *= UINT32_C(0x1b873593);
	hash ^= k;
	hash = ((hash << 13) \| (hash >> 19)) * 5 + UINT32_C(0xe6546b64);
	}
	const uint8_t tail = (const uint8_t )key + nblocks * 4;
	uint32_t k1 = 0;
	switch (len & UINT32_C(3)) {
	case 3: k1 ^= tail[2] << 16; /* FALLTHROUGH */
	case 2: k1 ^= tail[1] << 8; /* FALLTHROUGH */
	case 1: k1 ^= tail[0];
	k1 *= UINT32_C(0xcc9e2d51);
	k1 = (k1 << 15) \| (k1 >> 17);
	k1 *= UINT32_C(0x1b873593);
	hash ^= k1;
	}
	hash ^= len;
	hash ^= hash >> 16;
	hash *= UINT32_C(0x85ebca6b);
	hash ^= hash >> 13;
	hash *= UINT32_C(0xc2b2ae35);
	hash ^= hash >> 16;
	return hash;
	}

	unsigned
	dumb(unsigned char *key, size_t len)
	{
	unsigned hash = 0;
	for (size_t i = 0; i < len; i++) {
	hash += key[i];
	hash ^= hash >> 16;
	hash *= 0xa871304d;
	hash ^= hash >> 16;
	}
	return hash;
	}

	unsigned char *
	utf8_decode(unsigned char s, long c)
	{
	unsigned char *next;
	if (s[0] < 0x80) {
	*c = s[0];
	next = s + 1;
	} else if ((s[0] & 0xe0) == 0xc0) {
	*c = ((long)(s[0] & 0x1f) << 6) \|
	((long)(s[1] & 0x3f) << 0);
	next = s + 2;
	} else if ((s[0] & 0xf0) == 0xe0) {
	*c = ((long)(s[0] & 0x0f) << 12) \|
	((long)(s[1] & 0x3f) << 6) \|
	((long)(s[2] & 0x3f) << 0);
	next = s + 3;
	} else if ((s[0] & 0xf8) == 0xf0 && (s[0] <= 0xf4)) {
	*c = ((long)(s[0] & 0x07) << 18) \|
	((long)(s[1] & 0x3f) << 12) \|
	((long)(s[2] & 0x3f) << 6) \|
	((long)(s[3] & 0x3f) << 0);
	next = s + 4;
	} else {
	*c = -1; // invalid
	next = s + 1; // skip this byte
	}
	if (c >= 0xd800 && c <= 0xdfff)
	*c = -1; // surrogate half
	return next;
	}


	int
	main(void)
	{
	char word[256];
	long collisions_hashcode = 0;
	long collisions_murmurhash3 = 0;
	long collisions_dumb = 0;
	unsigned long long seen_hashcode, seen_murmurhash3, *seen_dumb;
	seen_hashcode = calloc(1ULL << 26, sizeof(unsigned long long));
	seen_murmurhash3 = calloc(1ULL << 26, sizeof(unsigned long long));
	seen_dumb = calloc(1ULL << 26, sizeof(unsigned long long));
	while (fgets(word, sizeof(word), stdin)) {
	/* Convert to UTF-16 */
	long c;
	int len = 0;
	unsigned short string[256];
	unsigned char p = (unsigned char )word;
	for (;;) {
	p = utf8_decode(p, &c);
	if (c == -1 \|\| c == '\n' \|\| c == 0)
	break;
	string[len++] = c;
	}

	/* Hash as UTF-16 */
	unsigned hash_hc = hashCode(string, len);
	if ((seen_hashcode[hash_hc / 64] >> (hash_hc % 64)) & 1) {
	collisions_hashcode++;
	} else {
	seen_hashcode[hash_hc / 64] \|= 1ULL << (hash_hc % 64);
	}

	/* Hash as UTF-8 */
	unsigned hash_mh3 = murmurhash3(word, (char *)p - word, 0);
	if ((seen_murmurhash3[hash_mh3 / 64] >> (hash_mh3 % 64)) & 1) {
	collisions_murmurhash3++;
	} else {
	seen_murmurhash3[hash_mh3 / 64] \|= 1ULL << (hash_mh3 % 64);
	}

	/* Hash as UTF-8 */
	unsigned hash_dumb = dumb((unsigned char )word, (char )p - word);
	if ((seen_dumb[hash_dumb / 64] >> (hash_dumb % 64)) & 1) {
	collisions_dumb++;
	} else {
	seen_dumb[hash_dumb / 64] \|= 1ULL << (hash_dumb % 64);
	}
	}
	printf("hashCode %ld\n", collisions_hashcode);
	printf("MurmurHash3 %ld\n", collisions_murmurhash3);
	printf("dumb %ld\n", collisions_dumb);
	}