Skip to content

Instantly share code, notes, and snippets.

@TheRayTracer
Created April 1, 2014 04:56
Show Gist options
  • Save TheRayTracer/9907950 to your computer and use it in GitHub Desktop.
Save TheRayTracer/9907950 to your computer and use it in GitHub Desktop.
A simple implementation of a Bloom Filter using two hash functions. A Bloom Filter is used to test whether an element is a member of a set. False positive matches are possible, but false negatives are not; i.e. a query returns either "possibly in set" or "definitely not in set". This sample shows an example of a false positive.
#include <iostream>
#include <bitset>
#include <vector>
#include <cassert>
namespace std { }
using namespace std;
typedef size_t (*pHashFunc)(const char*);
size_t hash_jenkins(const char* s)
{
size_t hash = 0;
while (*s != '\0')
{
hash += *s++;
hash += (hash << 10);
hash ^= (hash >> 6);
}
hash += (hash << 3);
hash ^= (hash >> 11);
hash += (hash << 15);
return hash;
}
size_t hash_pearson(const char* s)
{
static const unsigned char T[256] = {
98, 6, 85,150, 36, 23,112,164,135,207,169, 5, 26, 64,165,219, // 1
61, 20, 68, 89,130, 63, 52,102, 24,229,132,245, 80,216,195,115, // 2
90,168,156,203,177,120, 2,190,188, 7,100,185,174,243,162, 10, // 3
237, 18,253,225, 8,208,172,244,255,126,101, 79,145,235,228,121, // 4
123,251, 67,250,161, 0,107, 97,241,111,181, 82,249, 33, 69, 55, // 5
59,153, 29, 9,213,167, 84, 93, 30, 46, 94, 75,151,114, 73,222, // 6
197, 96,210, 45, 16,227,248,202, 51,152,252,125, 81,206,215,186, // 7
39,158,178,187,131,136, 1, 49, 50, 17,141, 91, 47,129, 60, 99, // 8
154, 35, 86,171,105, 34, 38,200,147, 58, 77,118,173,246, 76,254, // 9
133,232,196,144,198,124, 53, 4,108, 74,223,234,134,230,157,139, // 10
189,205,199,128,176, 19,211,236,127,192,231, 70,233, 88,146, 44, // 11
183,201, 22, 83, 13,214,116,109,159, 32, 95,226,140,220, 57, 12, // 12
221, 31,209,182,143, 92,149,184,148, 62,113, 65, 37, 27,106,166, // 13
3, 14,204, 72, 21, 41, 56, 66, 28,193, 40,217, 25, 54,179,117, // 14
238, 87,240,155,180,170,242,212,191,163, 78,218,137,194,175,110, // 15
43,119,224, 71,122,142, 42,160,104, 48,247,103, 15, 11,138,239 // 16
};
size_t hash = 0;
while (*s != '\0')
{
size_t index = hash ^ *s++;
hash = T[index];
}
return hash;
}
template<size_t NumberOfBits>
class bloom_filter
{
public:
void add_hash_function(pHashFunc hash);
void clear_hash_functions();
void add(const char* element);
bool contains(const char* element);
private:
vector<pHashFunc> hash_functions;
bitset<NumberOfBits> store;
};
template<size_t NumberOfBits>
void bloom_filter<NumberOfBits>::add_hash_function(pHashFunc hash)
{
hash_functions.push_back(hash);
return;
}
template<size_t NumberOfBits>
void bloom_filter<NumberOfBits>::clear_hash_functions()
{
hash_functions.clear();
return;
}
template<size_t NumberOfBits>
void bloom_filter<NumberOfBits>::add(const char* element)
{
for (size_t i = 0; i < hash_functions.size(); ++i)
{
size_t hash = hash_functions[i](element);
size_t bit = hash % NumberOfBits;
store.set(bit);
}
return;
}
template<size_t NumberOfBits>
bool bloom_filter<NumberOfBits>::contains(const char* element)
{
for (size_t i = 0; i < hash_functions.size(); ++i)
{
size_t hash = hash_functions[i](element);
size_t bit = hash % NumberOfBits;
if (store.test(bit) == false)
{
return false;
}
}
return true; // Possibly in set.
}
int main(int argc, char* argv[])
{
bloom_filter<10000> bloom;
bloom.add_hash_function(hash_jenkins);
bloom.add_hash_function(hash_pearson);
bloom.add("afa");
bloom.add("arm");
bloom.add("cat");
bloom.add("dig");
bloom.add("dog");
assert(bloom.contains("owl") == false); // Not in the list (TN).
assert(bloom.contains("cat") != false); // In the list (TP).
assert(bloom.contains("icv") != false); // Not in the list (FP) - "afa" and "icv" set the same bits using both the Jenkin Hash function, and the Pearson Hash function.
cout << "Bloom filter checks were successful." << endl;
cin.get();
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment