Created
January 13, 2015 18:20
-
-
Save H2CO3/3cb1ce182f704fdf5876 to your computer and use it in GitHub Desktop.
SpnHashMap, rewritten in C++
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// | |
// SpnMap: Sparkling's hash table, in C++ | |
// Created by Arpad Goretity on 13/01/2015 | |
// | |
#include <vector> | |
#include <utility> | |
#include <cstdint> | |
#include <climits> | |
#include <cassert> | |
#include <exception> | |
#if SIZE_MAX > 0xffffffffu | |
#if SIZE_MAX >= 0x2fffffffffffu | |
#define ALL_SIZES_FIT 1 | |
#else | |
#define ALL_SIZES_FIT 0 | |
#endif // SIZE_MAX >= 0x2fffffffffffu | |
#else | |
#define ALL_SIZES_FIT 0 | |
#endif // SIZE_MAX > 0xffffffffu | |
template <typename KeyType, typename ValueType, typename HashFn = std::hash<KeyType>> | |
struct SpnMap { | |
protected: | |
struct Bucket { | |
KeyType key; | |
ValueType value; | |
Bucket *next; | |
bool occupied; | |
}; | |
static const std::vector<std::size_t> sizes; | |
std::size_t sizeindex; // index into 'sizes' array (see below) | |
std::size_t keycount; // number of keys in the table | |
std::vector<Bucket> buckets; // actual array for key-value pairs | |
Bucket *find_key(Bucket *head, const KeyType &key) const { | |
Bucket *node = head; | |
assert(node != nullptr); | |
do { | |
if (node->occupied and node->key == key) { | |
return node; | |
} | |
node = node->next; | |
} while (node != nullptr); | |
return nullptr; | |
} | |
Bucket *find_next_empty(std::vector<Bucket> &buckets, std::size_t headidx, std::size_t allocsize) const { | |
std::size_t i = headidx, n = allocsize; | |
assert(headidx < allocsize); | |
while (n--) { | |
Bucket *node = &buckets[i]; | |
if (not node->occupied) { | |
assert(node->next == nullptr); | |
return node; | |
} | |
i++; | |
// wrap around end of table | |
if (i >= allocsize) { | |
i = 0; | |
} | |
} | |
assert("No empty slot found, infinite loop detected! Fatal error" == 0); | |
return nullptr; | |
} | |
ValueType &set_value(const KeyType &key, const ValueType &val) { | |
// Step 0: check degenerate cases and avoid division by 0 | |
if (sizes[sizeindex] == 0) { | |
// else we will need to make room for the value | |
expand_or_rehash(true); | |
} | |
std::size_t hash = HashFn{}(key); | |
std::size_t index = hash % sizes[sizeindex]; | |
// Step 1: We search for a bucket that already contains the | |
// key, becaue we need to replace the old value if it exists. | |
Bucket *bucket = find_key(&buckets[index], key); | |
// If key is already found in the table: | |
if (bucket != nullptr) { | |
// The count of the keys is not changed here, | |
// since we are re-using an existing key if possible. | |
// So, don't touch key, just replace the value | |
bucket->value = val; | |
return bucket->value; | |
} | |
// Step 2: Otherwise we'll need to insert it, so we increase | |
// the counts and take ownership of the key and the value. | |
keycount++; | |
// When the load factor crosses 1 / phi (aka 5 / 8), we do | |
// a complete rehash. If it's only the number of keys that | |
// exceeded the maximal load factor, but the value count is | |
// smaller than the maximal load factor, then the bucket | |
// vector is not actually grown. Instead, the non-nil values | |
// are reinserted into a table of the same size, freeing up | |
// unused keys. | |
// | |
// If, however, the number of non-nil values is also greater | |
// than the maximal load factor, then the array of vectors | |
// is expanded before rehashing. | |
// | |
// This operation invalidates 'index' and 'hm->buckets'... | |
if (8 * keycount > 5 * sizes[sizeindex]) { | |
expand_or_rehash(false); | |
} | |
// ...so we just re-compute them after the reallocation. | |
index = hash % sizes[sizeindex]; | |
Bucket *home = &buckets[index]; | |
// If the home position of the key is empty, we are done | |
if (not home->occupied) { | |
assert(home->next == nullptr); | |
home->key = key; | |
home->value = val; | |
home->occupied = true; | |
return home->value; | |
} | |
// Else a collision happened, so we pick an empty/unused bucket | |
// near 'index'. We then fill it in and link it into the chain. | |
Bucket *fresh = find_next_empty(buckets, index, sizes[sizeindex]); | |
assert(fresh != nullptr); | |
assert(fresh != home); // avoid accidental circular self-references | |
assert(fresh->next == nullptr); | |
assert(not fresh->occupied); | |
// set key and value | |
fresh->key = key; | |
fresh->value = val; | |
fresh->occupied = true; | |
// Insert fresh bucket into 2nd place of list | |
fresh->next = home->next; | |
home->next = fresh; | |
return fresh->value; | |
} | |
void expand_or_rehash(bool always_expand) { | |
std::vector<Bucket> oldbuckets = std::move(buckets); | |
std::size_t oldsize = oldbuckets.size(); | |
assert(oldsize == sizes[sizeindex]); | |
// check if there are indeed more values than healthy, | |
// or it is just that too many keys have been deleted | |
if (8 * keycount > 5 * oldsize || always_expand) { | |
sizeindex++; | |
if (sizeindex >= sizes.size()) { | |
throw std::runtime_error("exceeded maximal size of SpnMap"); | |
} | |
} | |
// the new bucket array starts out all empty | |
std::vector<Bucket> newbuckets(sizes[sizeindex]); | |
std::size_t newsize = newbuckets.size(); | |
// When expanding, our situation is a little bit better | |
// than a full-fledged insert, since we know that we are | |
// inserting into an empty table. Consequently, we don't | |
// have to check for already-existing keys, since we | |
// explicitly disallow duplicates during the insertion. | |
// We don't need to check for non-existent keys either, | |
// because they are trivially filtered out within the loop. | |
for (std::size_t i = 0; i < oldsize; i++) { | |
if (not oldbuckets[i].occupied) { | |
continue; | |
} | |
// find slot for new key-value pair | |
std::size_t index = HashFn{}(oldbuckets[i].key) % newsize; | |
Bucket *home = &newbuckets[index]; | |
// if it's empty yet, we insert it and we're done | |
if (not home->occupied) { | |
assert(home->next == nullptr); | |
home->key = std::move(oldbuckets[i].key); | |
home->value = std::move(oldbuckets[i].value); | |
home->occupied = true; | |
continue; | |
} | |
// else we find a nearby empty slot and link it into the list | |
Bucket *bucket = find_next_empty(newbuckets, index, newsize); | |
assert(bucket != nullptr); | |
assert(bucket != home); // avoid circular references | |
assert(bucket->next == nullptr); | |
assert(not bucket->occupied); | |
// perform insertion | |
bucket->key = std::move(oldbuckets[i].key); | |
bucket->value = std::move(oldbuckets[i].value); | |
bucket->occupied = true; | |
// link it in | |
bucket->next = home->next; | |
home->next = bucket; | |
} | |
buckets = std::move(newbuckets); | |
} | |
public: | |
ValueType &operator[](const KeyType &key) { | |
std::size_t allocsize = buckets.size(); | |
assert(allocsize == sizes[sizeindex]); | |
// avoid division by 0 - an empty map has no values anyway | |
if (allocsize == 0) { | |
return set_value(key, {}); | |
} | |
// compute hash and get candidate bucket | |
std::size_t index = HashFn{}(key) % allocsize; | |
Bucket *head = &buckets[index]; // not nullptr | |
Bucket *node = find_key(head, key); | |
return node ? node->value : set_value(key, {}); | |
} | |
SpnMap() : sizeindex(0), keycount(0) {} | |
std::size_t size() const { return keycount; } | |
}; | |
// std::size_t spn_hashmap_next(SpnHashMap *hm, std::size_t cursor, SpnValue *key, SpnValue *val) | |
// { | |
// std::size_t size = sizes[hm->sizeindex]; | |
// std::size_t i; | |
// | |
// for (i = cursor; i < size; i++) { | |
// Bucket *bucket = &hm->buckets[i]; | |
// | |
// if (notnil(&bucket->value)) { | |
// *key = bucket->key; | |
// *val = bucket->value; | |
// return i + 1; | |
// } | |
// } | |
// | |
// return 0; | |
// } | |
// Hash table allocation sizes. These are primes of which | |
// the ratio asymptotically approaches phi, the golden ratio. | |
template <typename KeyType, typename ValueType, typename HashFn> | |
const std::vector<std::size_t> SpnMap<KeyType, ValueType, HashFn>::sizes { | |
0x0, 0x3, 0x7, 0xD, | |
0x17, 0x29, 0x47, 0x7F, | |
0xBF, 0xFB, 0x17F, 0x277, | |
0x43F, 0x6BB, 0xAF3, 0x11AB, | |
0x1CB7, 0x2EB7, 0x4BF7, 0x79FF, | |
0xC5FB, 0x13FFF, 0x205FF, 0x345F7, | |
0x549EF, 0x88FD5, 0xDD9EF, 0x1669FF, | |
0x2441FF, 0x3AABFF, 0x5EEDFF, 0x9999F5, | |
0xF887FF, 0x19221FB, 0x28AA9D9, 0x41CCBE5, | |
0x6A777F7, 0xAC443EF, 0x116BB9EF, 0x1C2FFDF3, | |
0x2D9BB9FD, 0x49CBB7EF, 0x77676FE5, 0xC13327FF, | |
#if ALL_SIZES_FIT | |
0x1389A97E3, 0x1F9CDBDF5, 0x3326855D7, 0x52C3613FF, | |
0x85E9E69EF, 0xD8AD47DF9, 0x15E972E7D5, 0x23744765E9, | |
0x395DBA4BFB, 0x5CD201B1F7, 0x962FBBFDFF, 0xF301BDADF7, | |
0x1893179ABF3, 0x27C333759E9, 0x40564B105F1, 0x68197E85FF9, | |
0xA86FC9967E5, 0x11089481C7E9, 0x1B8F911B2DFB, 0x2C98259CF549 | |
#endif // ALL_SIZES_FIT | |
}; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <string> | |
#include <iostream> | |
#include <fstream> | |
#include <unordered_map> | |
#include <chrono> | |
#include "SpnMap.hpp" | |
// for measuring time | |
template<typename T = std::chrono::milliseconds> | |
struct Stopwatch { | |
template<typename F, typename ...Args> | |
static typename T::rep measure(F func, Args&&... args) { | |
auto start = std::chrono::steady_clock::now(); | |
func(std::forward<Args>(args)...); | |
auto duration = std::chrono::duration_cast<T>(std::chrono::steady_clock::now() - start); | |
return duration.count(); | |
} | |
}; | |
int main() | |
{ | |
SpnMap<std::string, int> spn_hm; | |
std::unordered_map<std::string, int> std_um; | |
std::ifstream f("random.txt"); | |
std::vector<std::string> keys(std::istream_iterator<std::string> { f }, std::istream_iterator<std::string>{}); | |
const int N = 10000; | |
auto dt_spn_hm = Stopwatch<>::measure([&] { | |
for (int i = 0; i < N; i++) { | |
for (const auto &key : keys) { | |
spn_hm[key] += i; | |
} | |
} | |
}); | |
auto dt_std_um = Stopwatch<>::measure([&] { | |
for (int i = 0; i < N; i++) { | |
for (const auto &key : keys) { | |
std_um[key] += i; | |
} | |
} | |
}); | |
std::cout << "Sparkling: " << dt_spn_hm << std::endl; | |
std::cout << "std::unordered_map: " << dt_std_um << std::endl; | |
std::cout << "Enter a key: "; | |
std::string key; | |
std::getline(std::cin, key); | |
std::cout << "spn_hm[key] = " << spn_hm[key] << std::endl; | |
std::cout << "std_um[key] = " << std_um[key] << std::endl; | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment