Skip to content

Instantly share code, notes, and snippets.

@zeux
Last active September 13, 2018 14:00
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save zeux/e271d172b820e67bebd565a9cd13de30 to your computer and use it in GitHub Desktop.
Save zeux/e271d172b820e67bebd565a9cd13de30 to your computer and use it in GitHub Desktop.
If hashing is too slow this might mean your hash table implementation is very bad.
// g++ -O3 -o uniquevalues uniquevalues.cpp -std=c++11 -Wall -Wextra -DNDEBUG
// results for N = 10000000:
// distinct_count_stdhash(values,N) : 1340.096 cycles per operation (best) 1340.096 cycles per operation (avg)
// distinct_count_goodhash(values,N) : 137.917 cycles per operation (best) 137.917 cycles per operation (avg)
// distinct_count_sort(values,N) : 258.619 cycles per operation (best) 258.619 cycles per operation (avg)
#include <iostream>
#include <algorithm>
#include <unordered_set>
#include <vector>
#include <stdint.h>
#include <stdio.h>
#include <assert.h>
#include "benchmark.h"
size_t hash_value(uint64_t x)
{
x = (x ^ (x >> 30)) * 0xbf58476d1ce4e5b9ull;
x = (x ^ (x >> 27)) * 0x94d049bb133111ebull;
x = x ^ (x >> 31);
return x;
}
struct IntSet
{
std::vector<uint64_t> data;
size_t size;
IntSet(size_t capacity = 0): data(capacity), size(0)
{
assert((capacity & (capacity - 1)) == 0);
}
void insert(uint64_t key)
{
size_t m = data.size() - 1;
size_t h = hash_value(key) & m;
size_t i = 0;
while (data[h] != key)
{
if (data[h] == 0)
{
data[h] = key;
size++;
break;
}
i = i + 1;
h = (h + i) & m;
}
}
static size_t optimalCapacity(size_t count)
{
size_t capacity = 1;
while (count >= capacity / 2)
capacity *= 2;
return capacity;
}
};
size_t distinct_count_goodhash(const uint64_t * values, size_t howmany) {
bool zero = false;
IntSet hash(IntSet::optimalCapacity(howmany));
for (size_t i = 0; i < howmany; ++i)
if (values[i] == 0)
zero = true;
else
hash.insert(values[i]);
return zero + hash.size;
}
size_t distinct_count_stdhash(const uint64_t * values, size_t howmany) {
std::unordered_set<uint64_t> hash(values, values + howmany);
return hash.size();
}
size_t distinct_count_sort(const uint64_t * values, size_t howmany) {
std::vector<uint64_t> array(values, values + howmany);
std::sort(array.begin(), array.end());
return std::unique(array.begin(), array.end()) - array.begin();
}
void demo(size_t N) {
std::cout << "N = " << N <<std::endl;
uint64_t * values = new uint64_t[N];
for(size_t i = 0; i < N; ++i) {
values[i] = ((uint16_t)rand()) | ((uint32_t)rand() << 16) | ((uint64_t)rand()<<32) |((uint64_t)rand()<<48);
}
values[N/2] = values[0];
values[N-1] = values[1];
size_t expected = distinct_count_stdhash(values,N);
std::cout <<" expected = "<<expected << std::endl;
const int repeat = 1;
BEST_TIME(distinct_count_stdhash(values,N), expected, , repeat, N, true);
BEST_TIME(distinct_count_goodhash(values,N), expected, , repeat, N, true);
BEST_TIME(distinct_count_sort(values,N), expected, , repeat, N, true);
delete[] values;
}
int main() {
for(size_t N = 10; N < 100000000; N*=10) demo(N);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment