Skip to content

Instantly share code, notes, and snippets.

@cellularmitosis
Last active August 20, 2020 22:01
Show Gist options
  • Save cellularmitosis/158fe46f1cad5368be56e08dd3550e7c to your computer and use it in GitHub Desktop.
Save cellularmitosis/158fe46f1cad5368be56e08dd3550e7c to your computer and use it in GitHub Desktop.
Hash table in C, part 3: hash collisions

Blog 2020/8/20

<- previous | index | next ->

Hash table in C, part 3: hash collisions

Let's learn how to implement a hash table in C! (See also part 1 and part 2).

In order to handle hash collisions, we can do a byte-for-byte comparison of the keys. To do this, each slot needs to store a reference to the key as well as its length.

hash3

 struct _HTSlot {
+    void* keyp;
+    size_t keyLength;
     size_t keyHash;
     void* valuep;
     struct _HTSlot* nextp;
 };
typedef struct _HTSlot HTSlot;

Get and set

HashTableGet and HashTableSet are both updated to perform a full byte-for-byte comparison of the keys.

-void* HashTableGet(HashTable* htp, size_t keyHash) {
+void* HashTableGet(HashTable* htp, void* keyp, size_t keyLength,
+    size_t keyHash
+) {
     size_t slotIndex = keyHash % htp->capacity;
     HTSlot* cursorp = htp->slotspp[slotIndex];
     while (cursorp != NULL) {
-        if (cursorp->keyHash == keyHash) {
+        if (cursorp->keyHash == keyHash
+            && cursorp->keyLength == keyLength
+            && memcmp(cursorp->keyp, keyp, keyLength) == 0
+        ) {
             return cursorp->valuep;
         } else {
             cursorp = cursorp->nextp;
             continue;
         }
     }
     return NULL;
 }

Additionally, HashTableSet is updated to store the key pointer and key length.

-void HashTableSet(HashTable* htp, size_t keyHash, void* valuep) {
+void HashTableSet(HashTable* htp, void* keyp, size_t keyLength,
+    size_t keyHash, void* valuep
+) {
+    size_t slotIndex = keyHash % htp->capacity;
+    HTSlot* cursorp = htp->slotspp[slotIndex];
+    while (cursorp != NULL) {
+        if (cursorp->keyHash == keyHash
+            && cursorp->keyLength == keyLength
+            && memcmp(cursorp->keyp, keyp, keyLength) == 0
+        ) {
             cursorp->valuep = valuep;
             if (cursorp->valuep == NULL && valuep != NULL) {
                 htp->population++;
             } else if (cursorp->valuep != NULL && valuep == NULL) {
                 htp->population--;
             }
             return;
         } else {
             cursorp = cursorp->nextp;
             continue;
         }
     }
 
     // no existing match?  append a new slot to the list.
     if (valuep != NULL) {
         HTSlot* newSlotp = _HTSlotMake();
+        newSlotp->keyp = keyp;
+        newSlotp->keyLength = keyLength;
         newSlotp->keyHash = keyHash;
         newSlotp->valuep = valuep;
         if (htp->slotspp[slotIndex] == NULL) {
             htp->slotspp[slotIndex] = newSlotp;
         } else {
             HTSlot* lastp = htp->slotspp[slotIndex];
             while (lastp->nextp != NULL) {
                 lastp = lastp->nextp;
                 continue;
             }
             lastp->nextp = newSlotp;
         }
         htp->population++;
     }
 }

Hash collisions mitigated!

Our hash table now handles hash collisions!

./tests
testMake... OK
testFree... OK
testSet... OK
testGet... OK
testSlotCollision... OK
testHashCollision... OK

However, this is a bit of a performance hit, as every operation performs a full memcmp of the keys. In a future part, we will look at adding a slightly unsafe set of functions which address this performance hit.

Next time

In part 4, we will look at automatically resizing the array of slot pointers.

#include "HashTable.h"
#include <stdio.h> // fprintf
#include <stdlib.h> // exit
#include <string.h> // memset
#include <stdint.h> // uint8_t
#include <stdbool.h> // true
#include <assert.h> // assert
void* dmalloc(size_t size) {
void* p = malloc(size);
if (p == NULL) {
perror("Error: malloc failed: ");
exit(1);
}
memset(p, 0, size);
return p;
}
#define malloc(x) dont_use_malloc_use_dmalloc
size_t terribleHash(void* datap, size_t length) {
size_t sum = 0;
for (size_t i=0; i < length; i++) {
uint8_t* byte = datap + i;
sum += *byte;
continue;
}
return sum;
}
static HTSlot* _HTSlotMake() {
HTSlot* sp = dmalloc(sizeof(HTSlot));
return sp;
}
HashTable* HashTableMake(size_t capacity) {
HashTable* htp = dmalloc(sizeof(HashTable));
htp->capacity = capacity;
htp->slotspp = dmalloc(sizeof(void*) * capacity);
return htp;
}
void HashTableFree(HashTable* htp) {
for (size_t i=0; i < htp->capacity; i++) {
HTSlot* slotp = htp->slotspp[i];
while (slotp != NULL) {
HTSlot* tmp = slotp;
free(slotp);
slotp = tmp->nextp;
continue;
}
}
free(htp->slotspp);
free(htp);
}
void* HashTableGet(HashTable* htp, void* keyp, size_t keyLength,
size_t keyHash
) {
size_t slotIndex = keyHash % htp->capacity;
HTSlot* cursorp = htp->slotspp[slotIndex];
while (cursorp != NULL) {
if (cursorp->keyHash == keyHash
&& cursorp->keyLength == keyLength
&& memcmp(cursorp->keyp, keyp, keyLength) == 0
) {
return cursorp->valuep;
} else {
cursorp = cursorp->nextp;
continue;
}
}
return NULL;
}
void HashTableSet(HashTable* htp, void* keyp, size_t keyLength,
size_t keyHash, void* valuep
) {
size_t slotIndex = keyHash % htp->capacity;
HTSlot* cursorp = htp->slotspp[slotIndex];
while (cursorp != NULL) {
if (cursorp->keyHash == keyHash
&& cursorp->keyLength == keyLength
&& memcmp(cursorp->keyp, keyp, keyLength) == 0
) {
if (cursorp->valuep == NULL && valuep != NULL) {
htp->population++;
} else if (cursorp->valuep != NULL && valuep == NULL) {
htp->population--;
}
cursorp->valuep = valuep;
return;
} else {
cursorp = cursorp->nextp;
continue;
}
}
// no existing match? append a new slot to the list.
if (valuep != NULL) {
HTSlot* newSlotp = _HTSlotMake();
newSlotp->keyp = keyp;
newSlotp->keyLength = keyLength;
newSlotp->keyHash = keyHash;
newSlotp->valuep = valuep;
if (htp->slotspp[slotIndex] == NULL) {
htp->slotspp[slotIndex] = newSlotp;
} else {
HTSlot* lastp = htp->slotspp[slotIndex];
while (lastp->nextp != NULL) {
lastp = lastp->nextp;
continue;
}
lastp->nextp = newSlotp;
}
htp->population++;
}
}
#ifndef _HASHTABLE_H_
#define _HASHTABLE_H_
#include <unistd.h> // size_t
size_t terribleHash(void* datap, size_t length);
struct _HTSlot {
void* keyp;
size_t keyLength;
size_t keyHash;
void* valuep;
struct _HTSlot* nextp;
};
typedef struct _HTSlot HTSlot;
struct _HashTable {
HTSlot** slotspp;
size_t capacity;
size_t population;
};
typedef struct _HashTable HashTable;
HashTable* HashTableMake(size_t capacity);
void HashTableFree(HashTable* htp);
void* HashTableGet(HashTable* htp, void* keyp, size_t keyLength,
size_t keyHash
);
void HashTableSet(HashTable* htp, void* keyp, size_t keyLength, size_t keyHash,
void* valuep
);
#endif
default: tests
./tests
tests: HashTable.o tests.c
gcc -g -std=c99 -Wall -Werror HashTable.o tests.c -o tests
HashTable.o: HashTable.h HashTable.c
gcc -g -std=c99 -Wall -Werror -c HashTable.c
clean:
rm -f HashTable.o tests
.PHONY: default clean
#include "HashTable.h"
#include <assert.h> // assert
#include <stdio.h> // printf
#include <string.h> // strlen, strcmp
void testMake() {
printf("%s... ", __FUNCTION__);
size_t capacity = 3;
HashTable* htp = HashTableMake(capacity);
assert(htp != NULL);
assert(htp->capacity == 3);
printf("OK\n");
}
void testFree() {
printf("%s... ", __FUNCTION__);
size_t capacity = 3;
HashTable* htp = HashTableMake(capacity);
HashTableFree(htp);
printf("OK\n");
}
void testSet() {
printf("%s... ", __FUNCTION__);
size_t capacity = 3;
HashTable* htp = HashTableMake(capacity);
char* keyp = "hello";
size_t keyLength = strlen(keyp) + 1;
size_t keyHash = terribleHash(keyp, keyLength);
char* valuep = "world";
HashTableSet(htp, keyp, keyLength, keyHash, valuep);
assert(htp->population == 1);
printf("OK\n");
}
void testGet() {
printf("%s... ", __FUNCTION__);
size_t capacity = 3;
HashTable* htp = HashTableMake(capacity);
char* keyp = "hello";
size_t keyLength = strlen(keyp) + 1;
size_t keyHash = terribleHash(keyp, keyLength);
char* valuep = "world";
HashTableSet(htp, keyp, keyLength, keyHash, valuep);
char* gotp = HashTableGet(htp, keyp, keyLength, keyHash);
assert(strcmp(valuep, gotp) == 0);
printf("OK\n");
}
void testSlotCollision() {
printf("%s... ", __FUNCTION__);
size_t capacity = 1;
HashTable* htp = HashTableMake(capacity);
char* key1p = "hello";
size_t key1Length = strlen(key1p) + 1;
size_t key1Hash = terribleHash(key1p, key1Length);
char* value1p = "world";
HashTableSet(htp, key1p, key1Length, key1Hash, value1p);
assert(htp->population == 1);
char* key2p = "foo";
size_t key2Length = strlen(key2p) + 1;
size_t key2Hash = terribleHash(key2p, key2Length);
char* value2p = "bar";
HashTableSet(htp, key2p, key2Length, key2Hash, value2p);
assert(htp->population == 2);
char* got1p = HashTableGet(htp, key1p, key1Length, key1Hash);
assert(strcmp(value1p, got1p) == 0);
char* got2p = HashTableGet(htp, key2p, key2Length, key2Hash);
assert(strcmp(value2p, got2p) == 0);
printf("OK\n");
}
void testHashCollision() {
printf("%s... ", __FUNCTION__);
size_t capacity = 1;
HashTable* htp = HashTableMake(capacity);
char* key1p = "ab";
size_t key1Length = strlen(key1p) + 1;
size_t key1Hash = terribleHash(key1p, key1Length);
char* value1p = "cd";
HashTableSet(htp, key1p, key1Length, key1Hash, value1p);
assert(htp->population == 1);
char* key2p = "ba";
size_t key2Length = strlen(key2p) + 1;
size_t key2Hash = terribleHash(key2p, key2Length);
char* value2p = "dc";
HashTableSet(htp, key2p, key2Length, key2Hash, value2p);
assert(htp->population == 2);
char* got1p = HashTableGet(htp, key1p, key1Length, key1Hash);
assert(strcmp(value1p, got1p) == 0);
char* got2p = HashTableGet(htp, key2p, key2Length, key2Hash);
assert(strcmp(value2p, got2p) == 0);
printf("OK\n");
}
int main(int argc, char** argv) {
testMake();
testFree();
testSet();
testGet();
testSlotCollision();
testHashCollision();
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment