Skip to content

Instantly share code, notes, and snippets.

@cellularmitosis
Last active May 10, 2023 02:46
Show Gist options
  • Save cellularmitosis/9db34df37d3df91709ad3e4faf93c417 to your computer and use it in GitHub Desktop.
Save cellularmitosis/9db34df37d3df91709ad3e4faf93c417 to your computer and use it in GitHub Desktop.
Hash table in C, part 1: a humble beginning

Blog 2020/8/20

<- previous | index | next ->

Hash table in C, part 1: a humble beginning

Let's learn how to implement a hash table in C!

The basic concept of a hash table is to store key-value relationships in an array of slots. A hashing function is used to turn the key into a slot index.

Because the core of the data structure is an array, all of the hash table operations are O(1) time.

The simplest place to start is a humble array of void* pointers.

hash1

As we will see, this is not a good hash table, but it makes for a good conceptual starting place.

The HashTable structure

Our initial hash table structure has:

  • a pointer to an array of pointers to values
  • the length of that array
  • how many slots in the array are occupied
struct _HashTable {
    void** slotspp;
    size_t capacity;
    size_t population;
};
typedef struct _HashTable HashTable;

A couple of functions to create and destory a HashTable.

HashTable* HashTableMake(size_t capacity) {
    HashTable* htp = dmalloc(sizeof(HashTable));
    htp->capacity = capacity;
    htp->slotspp = dmalloc(sizeof(void*) * capacity);
    return htp;
}
void HashTableFree(HashTable* htp) {
    free(htp->slotspp);
    free(htp);
}

Minutiae: exit on malloc failure

For now, we aren't interested in being pedantic with error handling. If malloc fails, just bail.

void* dmalloc(size_t size) {
    void* p = malloc(size);
    if (p == NULL) {
        perror("Error: malloc failed: ");
        exit(1);
    }
    memset(p, 0, size);
    return p;
}
#define malloc(x) dont_use_malloc_use_dmalloc

The core HashTable API: get and set

Here we see the core of the hash table concept: values are accessed using a key's hash value, which is modded by the size of the array.

void* HashTableGet(HashTable* htp, size_t keyHash) {
    size_t slotIndex = keyHash % htp->capacity;
    return htp->slotspp[slotIndex];
}

Setting a value is essentially the same, with some additional population bookkeeping.

void HashTableSet(HashTable* htp, size_t keyHash, void* valuep) {
    size_t slotIndex = keyHash % htp->capacity;
    if (htp->slotspp[slotIndex] == NULL && valuep != NULL) {
        htp->population++;
    } else if (htp->slotspp[slotIndex] != NULL && valuep == NULL) {
        htp->population--;
    }
    htp->slotspp[slotIndex] = valuep;
}

The hashing function

A hashing function reduces a piece of data (a key) to an integer.

We'll start with the simplest possible hashing function: just sum all of the bytes into an integer. Let's name this function after its performance: terrible!

size_t terribleHash(void* datap, size_t length) {
    size_t sum = 0;
    for (size_t i=0; i < length; i++) {
        uint8_t* byte = datap + i;
        sum += *byte;
        continue;
    }
    return sum;
}

testSet, testGet

Let's write a few tests to ensure setting and getting values works correctly.

void testSet() {
    printf("%s... ", __FUNCTION__);
    size_t capacity = 3;
    HashTable* htp = HashTableMake(capacity);
    char* keyp = "hello";
    char* valuep = "world";
    size_t keyHash = terribleHash(keyp, strlen(keyp)+1);
    HashTableSet(htp, keyHash, valuep);
    assert(htp->population == 1);
    printf("OK\n");
}
void testGet() {
    printf("%s... ", __FUNCTION__);
    size_t capacity = 3;
    HashTable* htp = HashTableMake(capacity);
    char* keyp = "hello";
    char* valuep = "world";
    size_t keyHash = terribleHash(keyp, strlen(keyp)+1);
    HashTableSet(htp, keyHash, valuep);
    char* gotp = HashTableGet(htp, keyHash);
    assert(strcmp(valuep, gotp) == 0);
    printf("OK\n");
}

Slot collisions: the limitation of this design

Our next test illustrates the most obvious limitation of this design: if two keys hash to the same slot, we have no way to mitigate this "slot collision".

The easiest way to trigger this problem is by creating a hash table with capacity 1, then attempting to store two values. Storing the second value causes the first value to be lost.

void testSlotCollision() {
    printf("%s... ", __FUNCTION__);
    size_t capacity = 1;
    HashTable* htp = HashTableMake(capacity);

    char* key1p = "hello";
    char* value1p = "world";
    size_t key1Hash = terribleHash(key1p, strlen(key1p)+1);
    HashTableSet(htp, key1Hash, value1p);
    assert(htp->population == 1);

    char* key2p = "foo";
    char* value2p = "bar";
    size_t key2Hash = terribleHash(key2p, strlen(key2p)+1);
    HashTableSet(htp, key2Hash, value2p);
    assert(htp->population == 2);

    char* got1p = HashTableGet(htp, key1Hash);
    assert(strcmp(value1p, got1p) == 0);

    char* got2p = HashTableGet(htp, key2Hash);
    assert(strcmp(value2p, got2p) == 0);

    printf("OK\n");
}
tests: tests.c:64: testSlotCollision: Assertion `htp->population == 2' failed.

Next time

In part 2, we'll introduce the concept of chaining to resolve this.

#include "HashTable.h"
#include <stdio.h> // fprintf
#include <stdlib.h> // exit
#include <string.h> // memset
#include <stdint.h> // uint8_t
void* dmalloc(size_t size) {
void* p = malloc(size);
if (p == NULL) {
perror("Error: malloc failed: ");
exit(1);
}
memset(p, 0, size);
return p;
}
#define malloc(x) dont_use_malloc_use_dmalloc
size_t terribleHash(void* datap, size_t length) {
size_t sum = 0;
for (size_t i=0; i < length; i++) {
uint8_t* byte = datap + i;
sum += *byte;
continue;
}
return sum;
}
HashTable* HashTableMake(size_t capacity) {
HashTable* htp = dmalloc(sizeof(HashTable));
htp->capacity = capacity;
htp->slotspp = dmalloc(sizeof(void*) * capacity);
return htp;
}
void HashTableFree(HashTable* htp) {
free(htp->slotspp);
free(htp);
}
void* HashTableGet(HashTable* htp, size_t keyHash) {
size_t slotIndex = keyHash % htp->capacity;
return htp->slotspp[slotIndex];
}
void HashTableSet(HashTable* htp, size_t keyHash, void* valuep) {
size_t slotIndex = keyHash % htp->capacity;
if (htp->slotspp[slotIndex] == NULL && valuep != NULL) {
htp->population++;
} else if (htp->slotspp[slotIndex] != NULL && valuep == NULL) {
htp->population--;
}
htp->slotspp[slotIndex] = valuep;
}
#ifndef _HASHTABLE_H_
#define _HASHTABLE_H_
#include <unistd.h> // size_t
size_t terribleHash(void* datap, size_t length);
struct _HashTable {
void** slotspp;
size_t capacity;
size_t population;
};
typedef struct _HashTable HashTable;
HashTable* HashTableMake(size_t capacity);
void HashTableFree(HashTable* htp);
void* HashTableGet(HashTable* htp, size_t keyHash);
void HashTableSet(HashTable* htp, size_t keyHash, void* valuep);
#endif
default: tests
./tests
tests: HashTable.o tests.c
gcc -g -std=c99 -Wall -Werror HashTable.o tests.c -o tests
HashTable.o: HashTable.h HashTable.c
gcc -g -std=c99 -Wall -Werror -c HashTable.c
clean:
rm -f HashTable.o tests
.PHONY: default clean
#include "HashTable.h"
#include <assert.h> // assert
#include <stdio.h> // printf
#include <string.h> // strlen, strcmp
void testMake() {
printf("%s... ", __FUNCTION__);
size_t capacity = 3;
HashTable* htp = HashTableMake(capacity);
assert(htp != NULL);
assert(htp->capacity == 3);
printf("OK\n");
}
void testFree() {
printf("%s... ", __FUNCTION__);
size_t capacity = 3;
HashTable* htp = HashTableMake(capacity);
HashTableFree(htp);
printf("OK\n");
}
void testSet() {
printf("%s... ", __FUNCTION__);
size_t capacity = 3;
HashTable* htp = HashTableMake(capacity);
char* keyp = "hello";
char* valuep = "world";
size_t keyHash = terribleHash(keyp, strlen(keyp)+1);
HashTableSet(htp, keyHash, valuep);
assert(htp->population == 1);
printf("OK\n");
}
void testGet() {
printf("%s... ", __FUNCTION__);
size_t capacity = 3;
HashTable* htp = HashTableMake(capacity);
char* keyp = "hello";
char* valuep = "world";
size_t keyHash = terribleHash(keyp, strlen(keyp)+1);
HashTableSet(htp, keyHash, valuep);
char* gotp = HashTableGet(htp, keyHash);
assert(strcmp(valuep, gotp) == 0);
printf("OK\n");
}
void testSlotCollision() {
printf("%s... ", __FUNCTION__);
size_t capacity = 1;
HashTable* htp = HashTableMake(capacity);
char* key1p = "hello";
char* value1p = "world";
size_t key1Hash = terribleHash(key1p, strlen(key1p)+1);
HashTableSet(htp, key1Hash, value1p);
assert(htp->population == 1);
char* key2p = "foo";
char* value2p = "bar";
size_t key2Hash = terribleHash(key2p, strlen(key2p)+1);
HashTableSet(htp, key2Hash, value2p);
assert(htp->population == 2);
char* got1p = HashTableGet(htp, key1Hash);
assert(strcmp(value1p, got1p) == 0);
char* got2p = HashTableGet(htp, key2Hash);
assert(strcmp(value2p, got2p) == 0);
printf("OK\n");
}
int main(int argc, char** argv) {
testMake();
testFree();
testSet();
testGet();
testSlotCollision();
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment