Skip to content

Instantly share code, notes, and snippets.

@H2CO3
Created January 13, 2015 18:20
Show Gist options
  • Star 3 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save H2CO3/3cb1ce182f704fdf5876 to your computer and use it in GitHub Desktop.
Save H2CO3/3cb1ce182f704fdf5876 to your computer and use it in GitHub Desktop.
SpnHashMap, rewritten in C++
//
// SpnMap: Sparkling's hash table, in C++
// Created by Arpad Goretity on 13/01/2015
//
#include <vector>
#include <utility>
#include <cstdint>
#include <climits>
#include <cassert>
#include <exception>
#if SIZE_MAX > 0xffffffffu
#if SIZE_MAX >= 0x2fffffffffffu
#define ALL_SIZES_FIT 1
#else
#define ALL_SIZES_FIT 0
#endif // SIZE_MAX >= 0x2fffffffffffu
#else
#define ALL_SIZES_FIT 0
#endif // SIZE_MAX > 0xffffffffu
template <typename KeyType, typename ValueType, typename HashFn = std::hash<KeyType>>
struct SpnMap {
protected:
struct Bucket {
KeyType key;
ValueType value;
Bucket *next;
bool occupied;
};
static const std::vector<std::size_t> sizes;
std::size_t sizeindex; // index into 'sizes' array (see below)
std::size_t keycount; // number of keys in the table
std::vector<Bucket> buckets; // actual array for key-value pairs
Bucket *find_key(Bucket *head, const KeyType &key) const {
Bucket *node = head;
assert(node != nullptr);
do {
if (node->occupied and node->key == key) {
return node;
}
node = node->next;
} while (node != nullptr);
return nullptr;
}
Bucket *find_next_empty(std::vector<Bucket> &buckets, std::size_t headidx, std::size_t allocsize) const {
std::size_t i = headidx, n = allocsize;
assert(headidx < allocsize);
while (n--) {
Bucket *node = &buckets[i];
if (not node->occupied) {
assert(node->next == nullptr);
return node;
}
i++;
// wrap around end of table
if (i >= allocsize) {
i = 0;
}
}
assert("No empty slot found, infinite loop detected! Fatal error" == 0);
return nullptr;
}
ValueType &set_value(const KeyType &key, const ValueType &val) {
// Step 0: check degenerate cases and avoid division by 0
if (sizes[sizeindex] == 0) {
// else we will need to make room for the value
expand_or_rehash(true);
}
std::size_t hash = HashFn{}(key);
std::size_t index = hash % sizes[sizeindex];
// Step 1: We search for a bucket that already contains the
// key, becaue we need to replace the old value if it exists.
Bucket *bucket = find_key(&buckets[index], key);
// If key is already found in the table:
if (bucket != nullptr) {
// The count of the keys is not changed here,
// since we are re-using an existing key if possible.
// So, don't touch key, just replace the value
bucket->value = val;
return bucket->value;
}
// Step 2: Otherwise we'll need to insert it, so we increase
// the counts and take ownership of the key and the value.
keycount++;
// When the load factor crosses 1 / phi (aka 5 / 8), we do
// a complete rehash. If it's only the number of keys that
// exceeded the maximal load factor, but the value count is
// smaller than the maximal load factor, then the bucket
// vector is not actually grown. Instead, the non-nil values
// are reinserted into a table of the same size, freeing up
// unused keys.
//
// If, however, the number of non-nil values is also greater
// than the maximal load factor, then the array of vectors
// is expanded before rehashing.
//
// This operation invalidates 'index' and 'hm->buckets'...
if (8 * keycount > 5 * sizes[sizeindex]) {
expand_or_rehash(false);
}
// ...so we just re-compute them after the reallocation.
index = hash % sizes[sizeindex];
Bucket *home = &buckets[index];
// If the home position of the key is empty, we are done
if (not home->occupied) {
assert(home->next == nullptr);
home->key = key;
home->value = val;
home->occupied = true;
return home->value;
}
// Else a collision happened, so we pick an empty/unused bucket
// near 'index'. We then fill it in and link it into the chain.
Bucket *fresh = find_next_empty(buckets, index, sizes[sizeindex]);
assert(fresh != nullptr);
assert(fresh != home); // avoid accidental circular self-references
assert(fresh->next == nullptr);
assert(not fresh->occupied);
// set key and value
fresh->key = key;
fresh->value = val;
fresh->occupied = true;
// Insert fresh bucket into 2nd place of list
fresh->next = home->next;
home->next = fresh;
return fresh->value;
}
void expand_or_rehash(bool always_expand) {
std::vector<Bucket> oldbuckets = std::move(buckets);
std::size_t oldsize = oldbuckets.size();
assert(oldsize == sizes[sizeindex]);
// check if there are indeed more values than healthy,
// or it is just that too many keys have been deleted
if (8 * keycount > 5 * oldsize || always_expand) {
sizeindex++;
if (sizeindex >= sizes.size()) {
throw std::runtime_error("exceeded maximal size of SpnMap");
}
}
// the new bucket array starts out all empty
std::vector<Bucket> newbuckets(sizes[sizeindex]);
std::size_t newsize = newbuckets.size();
// When expanding, our situation is a little bit better
// than a full-fledged insert, since we know that we are
// inserting into an empty table. Consequently, we don't
// have to check for already-existing keys, since we
// explicitly disallow duplicates during the insertion.
// We don't need to check for non-existent keys either,
// because they are trivially filtered out within the loop.
for (std::size_t i = 0; i < oldsize; i++) {
if (not oldbuckets[i].occupied) {
continue;
}
// find slot for new key-value pair
std::size_t index = HashFn{}(oldbuckets[i].key) % newsize;
Bucket *home = &newbuckets[index];
// if it's empty yet, we insert it and we're done
if (not home->occupied) {
assert(home->next == nullptr);
home->key = std::move(oldbuckets[i].key);
home->value = std::move(oldbuckets[i].value);
home->occupied = true;
continue;
}
// else we find a nearby empty slot and link it into the list
Bucket *bucket = find_next_empty(newbuckets, index, newsize);
assert(bucket != nullptr);
assert(bucket != home); // avoid circular references
assert(bucket->next == nullptr);
assert(not bucket->occupied);
// perform insertion
bucket->key = std::move(oldbuckets[i].key);
bucket->value = std::move(oldbuckets[i].value);
bucket->occupied = true;
// link it in
bucket->next = home->next;
home->next = bucket;
}
buckets = std::move(newbuckets);
}
public:
ValueType &operator[](const KeyType &key) {
std::size_t allocsize = buckets.size();
assert(allocsize == sizes[sizeindex]);
// avoid division by 0 - an empty map has no values anyway
if (allocsize == 0) {
return set_value(key, {});
}
// compute hash and get candidate bucket
std::size_t index = HashFn{}(key) % allocsize;
Bucket *head = &buckets[index]; // not nullptr
Bucket *node = find_key(head, key);
return node ? node->value : set_value(key, {});
}
SpnMap() : sizeindex(0), keycount(0) {}
std::size_t size() const { return keycount; }
};
// std::size_t spn_hashmap_next(SpnHashMap *hm, std::size_t cursor, SpnValue *key, SpnValue *val)
// {
// std::size_t size = sizes[hm->sizeindex];
// std::size_t i;
//
// for (i = cursor; i < size; i++) {
// Bucket *bucket = &hm->buckets[i];
//
// if (notnil(&bucket->value)) {
// *key = bucket->key;
// *val = bucket->value;
// return i + 1;
// }
// }
//
// return 0;
// }
// Hash table allocation sizes. These are primes of which
// the ratio asymptotically approaches phi, the golden ratio.
template <typename KeyType, typename ValueType, typename HashFn>
const std::vector<std::size_t> SpnMap<KeyType, ValueType, HashFn>::sizes {
0x0, 0x3, 0x7, 0xD,
0x17, 0x29, 0x47, 0x7F,
0xBF, 0xFB, 0x17F, 0x277,
0x43F, 0x6BB, 0xAF3, 0x11AB,
0x1CB7, 0x2EB7, 0x4BF7, 0x79FF,
0xC5FB, 0x13FFF, 0x205FF, 0x345F7,
0x549EF, 0x88FD5, 0xDD9EF, 0x1669FF,
0x2441FF, 0x3AABFF, 0x5EEDFF, 0x9999F5,
0xF887FF, 0x19221FB, 0x28AA9D9, 0x41CCBE5,
0x6A777F7, 0xAC443EF, 0x116BB9EF, 0x1C2FFDF3,
0x2D9BB9FD, 0x49CBB7EF, 0x77676FE5, 0xC13327FF,
#if ALL_SIZES_FIT
0x1389A97E3, 0x1F9CDBDF5, 0x3326855D7, 0x52C3613FF,
0x85E9E69EF, 0xD8AD47DF9, 0x15E972E7D5, 0x23744765E9,
0x395DBA4BFB, 0x5CD201B1F7, 0x962FBBFDFF, 0xF301BDADF7,
0x1893179ABF3, 0x27C333759E9, 0x40564B105F1, 0x68197E85FF9,
0xA86FC9967E5, 0x11089481C7E9, 0x1B8F911B2DFB, 0x2C98259CF549
#endif // ALL_SIZES_FIT
};
#include <string>
#include <iostream>
#include <fstream>
#include <unordered_map>
#include <chrono>
#include "SpnMap.hpp"
// for measuring time
template<typename T = std::chrono::milliseconds>
struct Stopwatch {
template<typename F, typename ...Args>
static typename T::rep measure(F func, Args&&... args) {
auto start = std::chrono::steady_clock::now();
func(std::forward<Args>(args)...);
auto duration = std::chrono::duration_cast<T>(std::chrono::steady_clock::now() - start);
return duration.count();
}
};
int main()
{
SpnMap<std::string, int> spn_hm;
std::unordered_map<std::string, int> std_um;
std::ifstream f("random.txt");
std::vector<std::string> keys(std::istream_iterator<std::string> { f }, std::istream_iterator<std::string>{});
const int N = 10000;
auto dt_spn_hm = Stopwatch<>::measure([&] {
for (int i = 0; i < N; i++) {
for (const auto &key : keys) {
spn_hm[key] += i;
}
}
});
auto dt_std_um = Stopwatch<>::measure([&] {
for (int i = 0; i < N; i++) {
for (const auto &key : keys) {
std_um[key] += i;
}
}
});
std::cout << "Sparkling: " << dt_spn_hm << std::endl;
std::cout << "std::unordered_map: " << dt_std_um << std::endl;
std::cout << "Enter a key: ";
std::string key;
std::getline(std::cin, key);
std::cout << "spn_hm[key] = " << spn_hm[key] << std::endl;
std::cout << "std_um[key] = " << std_um[key] << std::endl;
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment