H2CO3/SpnMap.hpp

## SpnMap.hpp
//
// SpnMap: Sparkling's hash table, in C++
// Created by Arpad Goretity on 13/01/2015
//

#include <vector>
#include <utility>
#include <cstdint>
#include <climits>
#include <cassert>
#include <exception>


#if SIZE_MAX > 0xffffffffu
#if SIZE_MAX >= 0x2fffffffffffu
#define ALL_SIZES_FIT 1
#else
#define ALL_SIZES_FIT 0
#endif // SIZE_MAX >= 0x2fffffffffffu
#else
#define ALL_SIZES_FIT 0
#endif // SIZE_MAX > 0xffffffffu


template <typename KeyType, typename ValueType, typename HashFn = std::hash<KeyType>>
struct SpnMap {
protected:
	struct Bucket {
		KeyType key;
		ValueType value;
		Bucket *next;
		bool occupied;
	};

	static const std::vector<std::size_t> sizes;

	std::size_t sizeindex;       // index into 'sizes' array (see below)
	std::size_t keycount;        // number of keys in the table
	std::vector<Bucket> buckets; // actual array for key-value pairs


	Bucket *find_key(Bucket *head, const KeyType &key) const {
		Bucket *node = head;
		assert(node != nullptr);

		do {
			if (node->occupied and node->key == key) {
				return node;
			}

			node = node->next;
		} while (node != nullptr);

		return nullptr;
	}

	Bucket *find_next_empty(std::vector<Bucket> &buckets, std::size_t headidx, std::size_t allocsize) const {
		std::size_t i = headidx, n = allocsize;

		assert(headidx < allocsize);

		while (n--) {
			Bucket *node = &buckets[i];

			if (not node->occupied) {
				assert(node->next == nullptr);
				return node;
			}

			i++;

			// wrap around end of table
			if (i >= allocsize) {
				i = 0;
			}
		}

		assert("No empty slot found, infinite loop detected! Fatal error" == 0);
		return nullptr;
	}

	ValueType &set_value(const KeyType &key, const ValueType &val) {
		// Step 0: check degenerate cases and avoid division by 0
		if (sizes[sizeindex] == 0) {
			// else we will need to make room for the value
			expand_or_rehash(true);
		}

		std::size_t hash = HashFn{}(key);
		std::size_t index = hash % sizes[sizeindex];

		// Step 1: We search for a bucket that already contains the
		// key, becaue we need to replace the old value if it exists.
		Bucket *bucket = find_key(&buckets[index], key);

		// If key is already found in the table:
		if (bucket != nullptr) {
			// The count of the keys is not changed here,
			// since we are re-using an existing key if possible.
			// So, don't touch key, just replace the value
			bucket->value = val;
			return bucket->value;
		}

		// Step 2: Otherwise we'll need to insert it, so we increase
		// the counts and take ownership of the key and the value.
		keycount++;

		// When the load factor crosses 1 / phi (aka 5 / 8), we do
		// a complete rehash. If it's only the number of keys that
		// exceeded the maximal load factor, but the value count is
		// smaller than the maximal load factor, then the bucket
		// vector is not actually grown. Instead, the non-nil values
		// are reinserted into a table of the same size, freeing up
		// unused keys.
		//
		// If, however, the number of non-nil values is also greater
		// than the maximal load factor, then the array of vectors
		// is expanded before rehashing.
		//
		// This operation invalidates 'index' and 'hm->buckets'...
		if (8 * keycount > 5 * sizes[sizeindex]) {
			expand_or_rehash(false);
		}

		// ...so we just re-compute them after the reallocation.
		index = hash % sizes[sizeindex];
		Bucket *home = &buckets[index];

		// If the home position of the key is empty, we are done
		if (not home->occupied) {
			assert(home->next == nullptr);

			home->key = key;
			home->value = val;
			home->occupied = true;
			return home->value;
		}

		// Else a collision happened, so we pick an empty/unused bucket
		// near 'index'. We then fill it in and link it into the chain.
		Bucket *fresh = find_next_empty(buckets, index, sizes[sizeindex]);
		assert(fresh != nullptr);
		assert(fresh != home); // avoid accidental circular self-references
		assert(fresh->next == nullptr);
		assert(not fresh->occupied);

		// set key and value
		fresh->key = key;
		fresh->value = val;
		fresh->occupied = true;

		// Insert fresh bucket into 2nd place of list
		fresh->next = home->next;
		home->next = fresh;
		return fresh->value;
	}

	void expand_or_rehash(bool always_expand) {
		std::vector<Bucket> oldbuckets = std::move(buckets);
		std::size_t oldsize = oldbuckets.size();
		assert(oldsize == sizes[sizeindex]);

		// check if there are indeed more values than healthy,
		// or it is just that too many keys have been deleted
		if (8 * keycount > 5 * oldsize || always_expand) {
			sizeindex++;

			if (sizeindex >= sizes.size()) {
				throw std::runtime_error("exceeded maximal size of SpnMap");
			}
		}

		// the new bucket array starts out all empty
		std::vector<Bucket> newbuckets(sizes[sizeindex]);
		std::size_t newsize = newbuckets.size();

		// When expanding, our situation is a little bit better
		// than a full-fledged insert, since we know that we are
		// inserting into an empty table. Consequently, we don't
		// have to check for already-existing keys, since we
		// explicitly disallow duplicates during the insertion.
		// We don't need to check for non-existent keys either,
		// because they are trivially filtered out within the loop.
		for (std::size_t i = 0; i < oldsize; i++) {
			if (not oldbuckets[i].occupied) {
				continue;
			}

			// find slot for new key-value pair
			std::size_t index = HashFn{}(oldbuckets[i].key) % newsize;
			Bucket *home = &newbuckets[index];

			// if it's empty yet, we insert it and we're done
			if (not home->occupied) {
				assert(home->next == nullptr);

				home->key   = std::move(oldbuckets[i].key);
				home->value = std::move(oldbuckets[i].value);
				home->occupied = true;
				continue;
			}

			// else we find a nearby empty slot and link it into the list
			Bucket *bucket = find_next_empty(newbuckets, index, newsize);

			assert(bucket != nullptr);
			assert(bucket != home); // avoid circular references
			assert(bucket->next == nullptr);
			assert(not bucket->occupied);

			// perform insertion
			bucket->key   = std::move(oldbuckets[i].key);
			bucket->value = std::move(oldbuckets[i].value);
			bucket->occupied = true;

			// link it in
			bucket->next = home->next;
			home->next = bucket;
		}

		buckets = std::move(newbuckets);
	}

public:
	ValueType &operator[](const KeyType &key) {
		std::size_t allocsize = buckets.size();
		assert(allocsize == sizes[sizeindex]);

		// avoid division by 0 - an empty map has no values anyway
		if (allocsize == 0) {
			return set_value(key, {});
		}

		// compute hash and get candidate bucket
		std::size_t index = HashFn{}(key) % allocsize;
		Bucket *head = &buckets[index]; // not nullptr
		Bucket *node = find_key(head, key);
		return node ? node->value : set_value(key, {});
	}


	SpnMap() : sizeindex(0), keycount(0) {}

	std::size_t size() const { return keycount; }
};


// std::size_t spn_hashmap_next(SpnHashMap *hm, std::size_t cursor, SpnValue *key, SpnValue *val)
// {
// 	std::size_t size = sizes[hm->sizeindex];
// 	std::size_t i;
//
// 	for (i = cursor; i < size; i++) {
// 		Bucket *bucket = &hm->buckets[i];
//
// 		if (notnil(&bucket->value)) {
// 			*key = bucket->key;
// 			*val = bucket->value;
// 			return i + 1;
// 		}
// 	}
//
// 	return 0;
// }

// Hash table allocation sizes. These are primes of which
// the ratio asymptotically approaches phi, the golden ratio.
template <typename KeyType, typename ValueType, typename HashFn>
const std::vector<std::size_t> SpnMap<KeyType, ValueType, HashFn>::sizes {
	0x0,             0x3,             0x7,             0xD,
	0x17,            0x29,            0x47,            0x7F,
	0xBF,            0xFB,            0x17F,           0x277,
	0x43F,           0x6BB,           0xAF3,           0x11AB,
	0x1CB7,          0x2EB7,          0x4BF7,          0x79FF,
	0xC5FB,          0x13FFF,         0x205FF,         0x345F7,
	0x549EF,         0x88FD5,         0xDD9EF,         0x1669FF,
	0x2441FF,        0x3AABFF,        0x5EEDFF,        0x9999F5,
	0xF887FF,        0x19221FB,       0x28AA9D9,       0x41CCBE5,
	0x6A777F7,       0xAC443EF,       0x116BB9EF,      0x1C2FFDF3,
	0x2D9BB9FD,      0x49CBB7EF,      0x77676FE5,      0xC13327FF,
#if ALL_SIZES_FIT
	0x1389A97E3,     0x1F9CDBDF5,     0x3326855D7,     0x52C3613FF,
	0x85E9E69EF,     0xD8AD47DF9,     0x15E972E7D5,    0x23744765E9,
	0x395DBA4BFB,    0x5CD201B1F7,    0x962FBBFDFF,    0xF301BDADF7,
	0x1893179ABF3,   0x27C333759E9,   0x40564B105F1,   0x68197E85FF9,
	0xA86FC9967E5,   0x11089481C7E9,  0x1B8F911B2DFB,  0x2C98259CF549
#endif // ALL_SIZES_FIT
};

## test.cpp
#include <string>
#include <iostream>
#include <fstream>
#include <unordered_map>
#include <chrono>

#include "SpnMap.hpp"

// for measuring time

template<typename T = std::chrono::milliseconds>
struct Stopwatch {
	template<typename F, typename ...Args>
	static typename T::rep measure(F func, Args&&... args) {
		auto start = std::chrono::steady_clock::now();

		func(std::forward<Args>(args)...);

		auto duration = std::chrono::duration_cast<T>(std::chrono::steady_clock::now() - start);

		return duration.count();
	}
};

int main()
{
	SpnMap<std::string, int> spn_hm;
	std::unordered_map<std::string, int> std_um;

	std::ifstream f("random.txt");
	std::vector<std::string> keys(std::istream_iterator<std::string> { f }, std::istream_iterator<std::string>{});

	const int N = 10000;


	auto dt_spn_hm = Stopwatch<>::measure([&] {
		for (int i = 0; i < N; i++) {
			for (const auto &key : keys) {
				spn_hm[key] += i;
			}
		}
	});

	auto dt_std_um = Stopwatch<>::measure([&] {
		for (int i = 0; i < N; i++) {
			for (const auto &key : keys) {
				std_um[key] += i;
			}
		}
	});


	std::cout << "Sparkling: " << dt_spn_hm << std::endl;
	std::cout << "std::unordered_map: " << dt_std_um << std::endl;

	std::cout << "Enter a key: ";
	std::string key;
	std::getline(std::cin, key);

	std::cout << "spn_hm[key] = " << spn_hm[key] << std::endl;
	std::cout << "std_um[key] = " << std_um[key] << std::endl;

	return 0;
}
	//
	// SpnMap: Sparkling's hash table, in C++
	// Created by Arpad Goretity on 13/01/2015
	//

	#include <vector>
	#include <utility>
	#include <cstdint>
	#include <climits>
	#include <cassert>
	#include <exception>


	#if SIZE_MAX > 0xffffffffu
	#if SIZE_MAX >= 0x2fffffffffffu
	#define ALL_SIZES_FIT 1
	#else
	#define ALL_SIZES_FIT 0
	#endif // SIZE_MAX >= 0x2fffffffffffu
	#else
	#define ALL_SIZES_FIT 0
	#endif // SIZE_MAX > 0xffffffffu


	template <typename KeyType, typename ValueType, typename HashFn = std::hash<KeyType>>
	struct SpnMap {
	protected:
	struct Bucket {
	KeyType key;
	ValueType value;
	Bucket *next;
	bool occupied;
	};

	static const std::vector<std::size_t> sizes;

	std::size_t sizeindex; // index into 'sizes' array (see below)
	std::size_t keycount; // number of keys in the table
	std::vector<Bucket> buckets; // actual array for key-value pairs


	Bucket find_key(Bucket head, const KeyType &key) const {
	Bucket *node = head;
	assert(node != nullptr);

	do {
	if (node->occupied and node->key == key) {
	return node;
	}

	node = node->next;
	} while (node != nullptr);

	return nullptr;
	}

	Bucket *find_next_empty(std::vector<Bucket> &buckets, std::size_t headidx, std::size_t allocsize) const {
	std::size_t i = headidx, n = allocsize;

	assert(headidx < allocsize);

	while (n--) {
	Bucket *node = &buckets[i];

	if (not node->occupied) {
	assert(node->next == nullptr);
	return node;
	}

	i++;

	// wrap around end of table
	if (i >= allocsize) {
	i = 0;
	}
	}

	assert("No empty slot found, infinite loop detected! Fatal error" == 0);
	return nullptr;
	}

	ValueType &set_value(const KeyType &key, const ValueType &val) {
	// Step 0: check degenerate cases and avoid division by 0
	if (sizes[sizeindex] == 0) {
	// else we will need to make room for the value
	expand_or_rehash(true);
	}

	std::size_t hash = HashFn{}(key);
	std::size_t index = hash % sizes[sizeindex];

	// Step 1: We search for a bucket that already contains the
	// key, becaue we need to replace the old value if it exists.
	Bucket *bucket = find_key(&buckets[index], key);

	// If key is already found in the table:
	if (bucket != nullptr) {
	// The count of the keys is not changed here,
	// since we are re-using an existing key if possible.
	// So, don't touch key, just replace the value
	bucket->value = val;
	return bucket->value;
	}

	// Step 2: Otherwise we'll need to insert it, so we increase
	// the counts and take ownership of the key and the value.
	keycount++;

	// When the load factor crosses 1 / phi (aka 5 / 8), we do
	// a complete rehash. If it's only the number of keys that
	// exceeded the maximal load factor, but the value count is
	// smaller than the maximal load factor, then the bucket
	// vector is not actually grown. Instead, the non-nil values
	// are reinserted into a table of the same size, freeing up
	// unused keys.
	//
	// If, however, the number of non-nil values is also greater
	// than the maximal load factor, then the array of vectors
	// is expanded before rehashing.
	//
	// This operation invalidates 'index' and 'hm->buckets'...
	if (8 * keycount > 5 * sizes[sizeindex]) {
	expand_or_rehash(false);
	}

	// ...so we just re-compute them after the reallocation.
	index = hash % sizes[sizeindex];
	Bucket *home = &buckets[index];

	// If the home position of the key is empty, we are done
	if (not home->occupied) {
	assert(home->next == nullptr);

	home->key = key;
	home->value = val;
	home->occupied = true;
	return home->value;
	}

	// Else a collision happened, so we pick an empty/unused bucket
	// near 'index'. We then fill it in and link it into the chain.
	Bucket *fresh = find_next_empty(buckets, index, sizes[sizeindex]);
	assert(fresh != nullptr);
	assert(fresh != home); // avoid accidental circular self-references
	assert(fresh->next == nullptr);
	assert(not fresh->occupied);

	// set key and value
	fresh->key = key;
	fresh->value = val;
	fresh->occupied = true;

	// Insert fresh bucket into 2nd place of list
	fresh->next = home->next;
	home->next = fresh;
	return fresh->value;
	}

	void expand_or_rehash(bool always_expand) {
	std::vector<Bucket> oldbuckets = std::move(buckets);
	std::size_t oldsize = oldbuckets.size();
	assert(oldsize == sizes[sizeindex]);

	// check if there are indeed more values than healthy,
	// or it is just that too many keys have been deleted
	if (8 * keycount > 5 * oldsize \|\| always_expand) {
	sizeindex++;

	if (sizeindex >= sizes.size()) {
	throw std::runtime_error("exceeded maximal size of SpnMap");
	}
	}

	// the new bucket array starts out all empty
	std::vector<Bucket> newbuckets(sizes[sizeindex]);
	std::size_t newsize = newbuckets.size();

	// When expanding, our situation is a little bit better
	// than a full-fledged insert, since we know that we are
	// inserting into an empty table. Consequently, we don't
	// have to check for already-existing keys, since we
	// explicitly disallow duplicates during the insertion.
	// We don't need to check for non-existent keys either,
	// because they are trivially filtered out within the loop.
	for (std::size_t i = 0; i < oldsize; i++) {
	if (not oldbuckets[i].occupied) {
	continue;
	}

	// find slot for new key-value pair
	std::size_t index = HashFn{}(oldbuckets[i].key) % newsize;
	Bucket *home = &newbuckets[index];

	// if it's empty yet, we insert it and we're done
	if (not home->occupied) {
	assert(home->next == nullptr);

	home->key = std::move(oldbuckets[i].key);
	home->value = std::move(oldbuckets[i].value);
	home->occupied = true;
	continue;
	}

	// else we find a nearby empty slot and link it into the list
	Bucket *bucket = find_next_empty(newbuckets, index, newsize);

	assert(bucket != nullptr);
	assert(bucket != home); // avoid circular references
	assert(bucket->next == nullptr);
	assert(not bucket->occupied);

	// perform insertion
	bucket->key = std::move(oldbuckets[i].key);
	bucket->value = std::move(oldbuckets[i].value);
	bucket->occupied = true;

	// link it in
	bucket->next = home->next;
	home->next = bucket;
	}

	buckets = std::move(newbuckets);
	}

	public:
	ValueType &operator[](const KeyType &key) {
	std::size_t allocsize = buckets.size();
	assert(allocsize == sizes[sizeindex]);

	// avoid division by 0 - an empty map has no values anyway
	if (allocsize == 0) {
	return set_value(key, {});
	}

	// compute hash and get candidate bucket
	std::size_t index = HashFn{}(key) % allocsize;
	Bucket *head = &buckets[index]; // not nullptr
	Bucket *node = find_key(head, key);
	return node ? node->value : set_value(key, {});
	}


	SpnMap() : sizeindex(0), keycount(0) {}

	std::size_t size() const { return keycount; }
	};


	// std::size_t spn_hashmap_next(SpnHashMap hm, std::size_t cursor, SpnValue key, SpnValue *val)
	// {
	// std::size_t size = sizes[hm->sizeindex];
	// std::size_t i;
	//
	// for (i = cursor; i < size; i++) {
	// Bucket *bucket = &hm->buckets[i];
	//
	// if (notnil(&bucket->value)) {
	// *key = bucket->key;
	// *val = bucket->value;
	// return i + 1;
	// }
	// }
	//
	// return 0;
	// }

	// Hash table allocation sizes. These are primes of which
	// the ratio asymptotically approaches phi, the golden ratio.
	template <typename KeyType, typename ValueType, typename HashFn>
	const std::vector<std::size_t> SpnMap<KeyType, ValueType, HashFn>::sizes {
	0x0, 0x3, 0x7, 0xD,
	0x17, 0x29, 0x47, 0x7F,
	0xBF, 0xFB, 0x17F, 0x277,
	0x43F, 0x6BB, 0xAF3, 0x11AB,
	0x1CB7, 0x2EB7, 0x4BF7, 0x79FF,
	0xC5FB, 0x13FFF, 0x205FF, 0x345F7,
	0x549EF, 0x88FD5, 0xDD9EF, 0x1669FF,
	0x2441FF, 0x3AABFF, 0x5EEDFF, 0x9999F5,
	0xF887FF, 0x19221FB, 0x28AA9D9, 0x41CCBE5,
	0x6A777F7, 0xAC443EF, 0x116BB9EF, 0x1C2FFDF3,
	0x2D9BB9FD, 0x49CBB7EF, 0x77676FE5, 0xC13327FF,
	#if ALL_SIZES_FIT
	0x1389A97E3, 0x1F9CDBDF5, 0x3326855D7, 0x52C3613FF,
	0x85E9E69EF, 0xD8AD47DF9, 0x15E972E7D5, 0x23744765E9,
	0x395DBA4BFB, 0x5CD201B1F7, 0x962FBBFDFF, 0xF301BDADF7,
	0x1893179ABF3, 0x27C333759E9, 0x40564B105F1, 0x68197E85FF9,
	0xA86FC9967E5, 0x11089481C7E9, 0x1B8F911B2DFB, 0x2C98259CF549
	#endif // ALL_SIZES_FIT
	};
	#include <string>
	#include <iostream>
	#include <fstream>
	#include <unordered_map>
	#include <chrono>

	#include "SpnMap.hpp"

	// for measuring time

	template<typename T = std::chrono::milliseconds>
	struct Stopwatch {
	template<typename F, typename ...Args>
	static typename T::rep measure(F func, Args&&... args) {
	auto start = std::chrono::steady_clock::now();

	func(std::forward<Args>(args)...);

	auto duration = std::chrono::duration_cast<T>(std::chrono::steady_clock::now() - start);

	return duration.count();
	}
	};

	int main()
	{
	SpnMap<std::string, int> spn_hm;
	std::unordered_map<std::string, int> std_um;

	std::ifstream f("random.txt");
	std::vector<std::string> keys(std::istream_iterator<std::string> { f }, std::istream_iterator<std::string>{});

	const int N = 10000;


	auto dt_spn_hm = Stopwatch<>::measure([&] {
	for (int i = 0; i < N; i++) {
	for (const auto &key : keys) {
	spn_hm[key] += i;
	}
	}
	});

	auto dt_std_um = Stopwatch<>::measure([&] {
	for (int i = 0; i < N; i++) {
	for (const auto &key : keys) {
	std_um[key] += i;
	}
	}
	});


	std::cout << "Sparkling: " << dt_spn_hm << std::endl;
	std::cout << "std::unordered_map: " << dt_std_um << std::endl;

	std::cout << "Enter a key: ";
	std::string key;
	std::getline(std::cin, key);

	std::cout << "spn_hm[key] = " << spn_hm[key] << std::endl;
	std::cout << "std_um[key] = " << std_um[key] << std::endl;

	return 0;
	}