Created
September 2, 2011 17:32
-
-
Save dustin/1189242 to your computer and use it in GitHub Desktop.
Generate a big pile of random data.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#* | |
*.o | |
*~ | |
gen_usernames | |
usernames.txt | |
usernames.txt.tmp |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
********************************************************************* | |
* * | |
* Open Bloom Filter * | |
* * | |
* Author: Arash Partow - 2000 * | |
* URL: http://www.partow.net * | |
* URL: http://www.partow.net/programming/hashfunctions/index.html * | |
* * | |
* Copyright notice: * | |
* Free use of the Open Bloom Filter Library is permitted under the * | |
* guidelines and in accordance with the most current version of the * | |
* Common Public License. * | |
* http://www.opensource.org/licenses/cpl1.0.php * | |
* * | |
********************************************************************* | |
*/ | |
#ifndef INCLUDE_BLOOM_FILTER_HPP | |
#define INCLUDE_BLOOM_FILTER_HPP | |
#include <cstddef> | |
#include <algorithm> | |
#include <cmath> | |
#include <limits> | |
#include <string> | |
#include <vector> | |
static const std::size_t bits_per_char = 0x08; // 8 bits in 1 char(unsigned) | |
static const unsigned char bit_mask[bits_per_char] = { | |
0x01, //00000001 | |
0x02, //00000010 | |
0x04, //00000100 | |
0x08, //00001000 | |
0x10, //00010000 | |
0x20, //00100000 | |
0x40, //01000000 | |
0x80 //10000000 | |
}; | |
class bloom_filter | |
{ | |
protected: | |
typedef unsigned int bloom_type; | |
typedef unsigned char cell_type; | |
public: | |
bloom_filter(const std::size_t& predicted_element_count, | |
const double& false_positive_probability, | |
const std::size_t& random_seed) | |
: bit_table_(0), | |
predicted_element_count_(predicted_element_count), | |
inserted_element_count_(0), | |
random_seed_((random_seed) ? random_seed : 0xA5A5A5A5), | |
desired_false_positive_probability_(false_positive_probability) | |
{ | |
find_optimal_parameters(); | |
generate_unique_salt(); | |
bit_table_ = new cell_type[table_size_ / bits_per_char]; | |
std::fill_n(bit_table_,(table_size_ / bits_per_char),0x00); | |
} | |
bloom_filter(const bloom_filter& filter) | |
{ | |
this->operator=(filter); | |
} | |
bloom_filter& operator = (const bloom_filter& filter) | |
{ | |
salt_count_ = filter.salt_count_; | |
table_size_ = filter.table_size_; | |
predicted_element_count_ = filter.predicted_element_count_; | |
inserted_element_count_ = filter.inserted_element_count_; | |
random_seed_ = filter.random_seed_; | |
desired_false_positive_probability_ = filter.desired_false_positive_probability_; | |
delete[] bit_table_; | |
bit_table_ = new cell_type[table_size_ / bits_per_char]; | |
std::copy(filter.bit_table_,filter.bit_table_ + (table_size_ / bits_per_char),bit_table_); | |
salt_ = filter.salt_; | |
return *this; | |
} | |
virtual ~bloom_filter() | |
{ | |
delete[] bit_table_; | |
} | |
inline bool operator!() const | |
{ | |
return (0 == table_size_); | |
} | |
inline void clear() | |
{ | |
std::fill_n(bit_table_,(table_size_ / bits_per_char),0x00); | |
inserted_element_count_ = 0; | |
} | |
inline void insert(const unsigned char* key_begin, const std::size_t& length) | |
{ | |
std::size_t bit_index = 0; | |
std::size_t bit = 0; | |
for(std::vector<bloom_type>::iterator itr = salt_.begin(); itr != salt_.end(); ++itr) | |
{ | |
compute_indices(hash_ap(key_begin,length,(*itr)),bit_index,bit); | |
bit_table_[bit_index / bits_per_char] |= bit_mask[bit]; | |
} | |
++inserted_element_count_; | |
} | |
template<typename T> | |
inline void insert(const T& t) | |
{ | |
// Note: T must be a C++ POD type. | |
insert(reinterpret_cast<const unsigned char*>(&t),sizeof(T)); | |
} | |
inline void insert(const std::string& key) | |
{ | |
insert(reinterpret_cast<const unsigned char*>(key.c_str()),key.size()); | |
} | |
inline void insert(const char* data, const std::size_t& length) | |
{ | |
insert(reinterpret_cast<const unsigned char*>(data),length); | |
} | |
template<typename InputIterator> | |
inline void insert(const InputIterator begin, const InputIterator end) | |
{ | |
InputIterator itr = begin; | |
while(itr != end) | |
{ | |
insert(*(itr++)); | |
} | |
} | |
inline virtual bool contains(const unsigned char* key_begin, const std::size_t length) const | |
{ | |
std::size_t bit_index = 0; | |
std::size_t bit = 0; | |
for(std::vector<bloom_type>::const_iterator it = salt_.begin(); it != salt_.end(); ++it) | |
{ | |
compute_indices(hash_ap(key_begin,length,(*it)),bit_index,bit); | |
if ((bit_table_[bit_index / bits_per_char] & bit_mask[bit]) != bit_mask[bit]) | |
{ | |
return false; | |
} | |
} | |
return true; | |
} | |
template<typename T> | |
inline bool contains(const T& t) const | |
{ | |
return contains(reinterpret_cast<const unsigned char*>(&t),static_cast<std::size_t>(sizeof(T))); | |
} | |
inline bool contains(const std::string& key) const | |
{ | |
return contains(reinterpret_cast<const unsigned char*>(key.c_str()),key.size()); | |
} | |
inline bool contains(const char* data, const std::size_t& length) const | |
{ | |
return contains(reinterpret_cast<const unsigned char*>(data),length); | |
} | |
template<typename InputIterator> | |
inline InputIterator contains_all(const InputIterator begin, const InputIterator end) const | |
{ | |
InputIterator itr = begin; | |
while(itr != end) | |
{ | |
if (!contains(*itr)) | |
{ | |
return itr; | |
} | |
++itr; | |
} | |
return end; | |
} | |
template<typename InputIterator> | |
inline InputIterator contains_none(const InputIterator begin, const InputIterator end) const | |
{ | |
InputIterator itr = begin; | |
while(itr != end) | |
{ | |
if (contains(*itr)) | |
{ | |
return itr; | |
} | |
++itr; | |
} | |
return end; | |
} | |
inline virtual std::size_t size() const | |
{ | |
return table_size_; | |
} | |
inline std::size_t element_count() const | |
{ | |
return inserted_element_count_; | |
} | |
inline double effective_fpp() const | |
{ | |
/* | |
Note: | |
The effective false positive probability is calculated using the | |
designated table size and hash function count in conjunction with | |
the current number of inserted elements - not the user defined | |
predicated/expected number of inserted elements. | |
*/ | |
return std::pow(1.0 - std::exp(-1.0 * salt_.size() * inserted_element_count_ / size()), 1.0 * salt_.size()); | |
} | |
bloom_filter& operator &= (const bloom_filter& filter) | |
{ | |
/* intersection */ | |
if ( | |
(salt_count_ == filter.salt_count_) && | |
(table_size_ == filter.table_size_) && | |
(random_seed_ == filter.random_seed_) | |
) | |
{ | |
for (std::size_t i = 0; i < (table_size_ / bits_per_char); ++i) | |
{ | |
bit_table_[i] &= filter.bit_table_[i]; | |
} | |
} | |
return *this; | |
} | |
bloom_filter& operator |= (const bloom_filter& filter) | |
{ | |
/* union */ | |
if ( | |
(salt_count_ == filter.salt_count_) && | |
(table_size_ == filter.table_size_) && | |
(random_seed_ == filter.random_seed_) | |
) | |
{ | |
for (std::size_t i = 0; i < (table_size_ / bits_per_char); ++i) | |
{ | |
bit_table_[i] |= filter.bit_table_[i]; | |
} | |
} | |
return *this; | |
} | |
bloom_filter& operator ^= (const bloom_filter& filter) | |
{ | |
/* difference */ | |
if ( | |
(salt_count_ == filter.salt_count_) && | |
(table_size_ == filter.table_size_) && | |
(random_seed_ == filter.random_seed_) | |
) | |
{ | |
for (std::size_t i = 0; i < (table_size_ / bits_per_char); ++i) | |
{ | |
bit_table_[i] ^= filter.bit_table_[i]; | |
} | |
} | |
return *this; | |
} | |
const cell_type* table() const { return bit_table_; } | |
protected: | |
inline virtual void compute_indices(const bloom_type& hash, std::size_t& bit_index, std::size_t& bit) const | |
{ | |
bit_index = hash % table_size_; | |
bit = bit_index % bits_per_char; | |
} | |
void generate_unique_salt() | |
{ | |
/* | |
Note: | |
A distinct hash function need not be implementation-wise | |
distinct. In the current implementation "seeding" a common | |
hash function with different values seems to be adequate. | |
*/ | |
const unsigned int predef_salt_count = 128; | |
static const bloom_type predef_salt[predef_salt_count] = | |
{ | |
0xAAAAAAAA, 0x55555555, 0x33333333, 0xCCCCCCCC, | |
0x66666666, 0x99999999, 0xB5B5B5B5, 0x4B4B4B4B, | |
0xAA55AA55, 0x55335533, 0x33CC33CC, 0xCC66CC66, | |
0x66996699, 0x99B599B5, 0xB54BB54B, 0x4BAA4BAA, | |
0xAA33AA33, 0x55CC55CC, 0x33663366, 0xCC99CC99, | |
0x66B566B5, 0x994B994B, 0xB5AAB5AA, 0xAAAAAA33, | |
0x555555CC, 0x33333366, 0xCCCCCC99, 0x666666B5, | |
0x9999994B, 0xB5B5B5AA, 0xFFFFFFFF, 0xFFFF0000, | |
0xB823D5EB, 0xC1191CDF, 0xF623AEB3, 0xDB58499F, | |
0xC8D42E70, 0xB173F616, 0xA91A5967, 0xDA427D63, | |
0xB1E8A2EA, 0xF6C0D155, 0x4909FEA3, 0xA68CC6A7, | |
0xC395E782, 0xA26057EB, 0x0CD5DA28, 0x467C5492, | |
0xF15E6982, 0x61C6FAD3, 0x9615E352, 0x6E9E355A, | |
0x689B563E, 0x0C9831A8, 0x6753C18B, 0xA622689B, | |
0x8CA63C47, 0x42CC2884, 0x8E89919B, 0x6EDBD7D3, | |
0x15B6796C, 0x1D6FDFE4, 0x63FF9092, 0xE7401432, | |
0xEFFE9412, 0xAEAEDF79, 0x9F245A31, 0x83C136FC, | |
0xC3DA4A8C, 0xA5112C8C, 0x5271F491, 0x9A948DAB, | |
0xCEE59A8D, 0xB5F525AB, 0x59D13217, 0x24E7C331, | |
0x697C2103, 0x84B0A460, 0x86156DA9, 0xAEF2AC68, | |
0x23243DA5, 0x3F649643, 0x5FA495A8, 0x67710DF8, | |
0x9A6C499E, 0xDCFB0227, 0x46A43433, 0x1832B07A, | |
0xC46AFF3C, 0xB9C8FFF0, 0xC9500467, 0x34431BDF, | |
0xB652432B, 0xE367F12B, 0x427F4C1B, 0x224C006E, | |
0x2E7E5A89, 0x96F99AA5, 0x0BEB452A, 0x2FD87C39, | |
0x74B2E1FB, 0x222EFD24, 0xF357F60C, 0x440FCB1E, | |
0x8BBE030F, 0x6704DC29, 0x1144D12F, 0x948B1355, | |
0x6D8FD7E9, 0x1C11A014, 0xADD1592F, 0xFB3C712E, | |
0xFC77642F, 0xF9C4CE8C, 0x31312FB9, 0x08B0DD79, | |
0x318FA6E7, 0xC040D23D, 0xC0589AA7, 0x0CA5C075, | |
0xF874B172, 0x0CF914D5, 0x784D3280, 0x4E8CFEBC, | |
0xC569F575, 0xCDB2A091, 0x2CC016B4, 0x5C5F4421 | |
}; | |
if (salt_count_ <= predef_salt_count) | |
{ | |
std::copy(predef_salt, | |
predef_salt + salt_count_, | |
std::back_inserter(salt_)); | |
for(unsigned int i = 0; i < salt_.size(); ++i) | |
{ | |
/* | |
Note: | |
This is done to integrate the user defined random seed, | |
so as to allow for the generation of unique bloom filter | |
instances. | |
*/ | |
salt_[i] = salt_[i] * salt_[(i + 3) % salt_.size()] + random_seed_; | |
} | |
} | |
else | |
{ | |
std::copy(predef_salt,predef_salt + predef_salt_count,std::back_inserter(salt_)); | |
srand(static_cast<unsigned int>(random_seed_)); | |
while(salt_.size() < salt_count_) | |
{ | |
bloom_type current_salt = static_cast<bloom_type>(rand()) * static_cast<bloom_type>(rand()); | |
if (0 == current_salt) continue; | |
if (salt_.end() == std::find(salt_.begin(), salt_.end(), current_salt)) | |
{ | |
salt_.push_back(current_salt); | |
} | |
} | |
} | |
} | |
void find_optimal_parameters() | |
{ | |
/* | |
Note: | |
The following will attempt to find the number of hash functions | |
and minimum amount of storage bits required to construct a bloom | |
filter consistent with the user defined false positive probability | |
and estimated element insertion count. | |
*/ | |
double min_m = std::numeric_limits<double>::infinity(); | |
double min_k = 0.0; | |
double curr_m = 0.0; | |
for(double k = 0.0; k < 1000.0; ++k) | |
{ | |
if ((curr_m = ((- k * predicted_element_count_) / std::log(1.0 - std::pow(desired_false_positive_probability_, 1.0 / k)))) < min_m) | |
{ | |
min_m = curr_m; | |
min_k = k; | |
} | |
} | |
salt_count_ = static_cast<std::size_t>(min_k); | |
table_size_ = static_cast<std::size_t>(min_m); | |
table_size_ += (((table_size_ % bits_per_char) != 0) ? (bits_per_char - (table_size_ % bits_per_char)) : 0); | |
} | |
bloom_type hash_ap(const unsigned char* begin, std::size_t remaining_length, bloom_type hash) const | |
{ | |
const unsigned char* it = begin; | |
while(remaining_length >= 2) | |
{ | |
hash ^= (hash << 7) ^ (*it++) * (hash >> 3); | |
hash ^= (~((hash << 11) + ((*it++) ^ (hash >> 5)))); | |
remaining_length -= 2; | |
} | |
if (remaining_length) | |
{ | |
hash ^= (hash << 7) ^ (*it) * (hash >> 3); | |
} | |
return hash; | |
} | |
std::vector<bloom_type> salt_; | |
unsigned char* bit_table_; | |
std::size_t salt_count_; | |
std::size_t table_size_; | |
std::size_t predicted_element_count_; | |
std::size_t inserted_element_count_; | |
std::size_t random_seed_; | |
double desired_false_positive_probability_; | |
}; | |
bloom_filter operator & (const bloom_filter& a, const bloom_filter& b) | |
{ | |
bloom_filter result = a; | |
result &= b; | |
return result; | |
} | |
bloom_filter operator | (const bloom_filter& a, const bloom_filter& b) | |
{ | |
bloom_filter result = a; | |
result |= b; | |
return result; | |
} | |
bloom_filter operator ^ (const bloom_filter& a, const bloom_filter& b) | |
{ | |
bloom_filter result = a; | |
result ^= b; | |
return result; | |
} | |
class compressible_bloom_filter : public bloom_filter | |
{ | |
public: | |
compressible_bloom_filter(const std::size_t& predicted_element_count, | |
const double& false_positive_probability, | |
const std::size_t& random_seed) | |
: bloom_filter(predicted_element_count,false_positive_probability,random_seed) | |
{ | |
size_list.push_back(table_size_); | |
} | |
inline virtual std::size_t size() const | |
{ | |
return size_list.back(); | |
} | |
inline bool compress(const double& percentage) | |
{ | |
if ((0.0 >= percentage) || (percentage >= 100.0)) return false; | |
std::size_t original_table_size = size_list.back(); | |
std::size_t new_table_size = static_cast<std::size_t>((size_list.back() * (1.0 - (percentage / 100.0)))); | |
new_table_size -= (((new_table_size % bits_per_char) != 0) ? (new_table_size % bits_per_char) : 0); | |
if ((bits_per_char > new_table_size) || (new_table_size >= original_table_size)) return false; | |
desired_false_positive_probability_ = effective_fpp(); | |
cell_type* tmp = new cell_type[new_table_size / bits_per_char]; | |
std::copy(bit_table_, bit_table_ + (new_table_size / bits_per_char), tmp); | |
cell_type* it = bit_table_ + (new_table_size / bits_per_char); | |
cell_type* end = bit_table_ + (original_table_size / bits_per_char); | |
cell_type* it_tmp = tmp; | |
while(it != end) { *(it_tmp++) |= (*it++); } | |
delete[] bit_table_; | |
bit_table_ = tmp; | |
size_list.push_back(new_table_size); | |
return true; | |
} | |
private: | |
inline virtual void compute_indices(const bloom_type& hash, std::size_t& bit_index, std::size_t& bit) const | |
{ | |
bit_index = hash; | |
for(unsigned int j = 0; j < size_list.size(); bit_index %= size_list[j++]) ; | |
bit = bit_index % bits_per_char; | |
} | |
std::vector<std::size_t> size_list; | |
}; | |
#endif | |
/* | |
Note 1: | |
If it can be guaranteed that bits_per_char will be of the form 2^n then | |
the following optimization can be used: | |
hash_table[bit_index >> n] |= bit_mask[bit_index & (bits_per_char - 1)]; | |
Note 2: | |
For performance reasons where possible when allocating memory it should | |
be aligned (aligned_alloc) according to the architecture being used. | |
*/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <string> | |
#include <iostream> | |
#include <algorithm> | |
#include <cstdlib> | |
#include "bloom_filter.hpp" | |
using namespace std; | |
string words("abcdefghijklmnopqrstuvwxyz" | |
"0123456789"); | |
const int report_every(100000); | |
const int max_permutations(1000); | |
int main(int argc, char **argv) { | |
if (argc < 2) { | |
cerr << "Need to tell me how many you want." << endl; | |
exit(64); // EX_USAGE | |
} | |
srand(718472382); // Now with determinism. | |
int todo(atoi(argv[1])); | |
bloom_filter filter(todo, 0.0001, time(NULL)); | |
int dups(0); | |
int since(0); | |
for (int done = 0; done < todo;) { | |
int length = (rand() % 6) + 6; | |
random_shuffle(words.begin(), words.end()); | |
string word(words.substr(0, length)); | |
int permutations = rand() % max_permutations; | |
for (int j = 0; j < permutations && done < todo; ++j) { | |
if (!next_permutation(word.begin(), word.end())) { | |
break; | |
} | |
if (filter.contains(word)) { | |
++dups; | |
} else { | |
++done; | |
filter.insert(word); | |
cout << word << "\n"; | |
} | |
if (done % report_every == 0 && dups > 0) { | |
cerr << dups << " dups in the last " << since << endl; | |
dups = 0; | |
since = 0; | |
} | |
++since; | |
} | |
} | |
return 0; | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
CFLAGS=--std=c99 -O6 -funroll-loops | |
CXXFLAGS=-O6 | |
TODO=100000000 | |
usernames.txt: gen_usernames | |
./gen_usernames $(TODO) > usernames.txt.tmp | |
mv usernames.txt.tmp usernames.txt | |
gen_usernames: gen_usernames.o | |
$(CXX) -o $@ gen_usernames.o |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment