Skip to content

Instantly share code, notes, and snippets.

@lava
Created April 3, 2020 08:46
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save lava/8c96fa32363f29f4534d51273197222e to your computer and use it in GitHub Desktop.
Save lava/8c96fa32363f29f4534d51273197222e to your computer and use it in GitHub Desktop.
AMQ Data Structures Benchmark
#include <assert.h> // needed by vacuum.h
#include <atomic>
#include <cuckoofilter.h>
#include <xorfilter.h>
#include <iostream>
#include <morton_filter.h>
#include <ostream>
#include <random>
#include <string>
#include <thread>
#include <unordered_set>
#include <vacuum.h>
#include "measuring.h"
#include <bits/stdint-uintn.h>
#include <vast/bloom_filter.hpp>
#include <vast/concept/hashable/xxhash.hpp>
#include <vast/concept/hashable/hash_append.hpp>
// All the filters expect uint64 as default input, so we don't bother
// generating and hashing strings for this benchmark.
std::vector<uint64_t> generate_unique_input(size_t N) {
static thread_local std::mt19937* rng;
if (!rng) {
size_t seed = 43;
rng = new std::mt19937(seed);
}
static std::uniform_int_distribution<uint64_t> dist;
std::unordered_set<uint64_t> seen;
std::vector<uint64_t> result(N);
for (int i = 0; i < N; ++i) {
uint64_t x;
do {
x = dist(*rng);
result[i] = x;
} while (seen.count(x));
seen.insert(x);
}
return result;
}
struct XorFacade;
// The third argument gives the proportion of filter elements in the input,
// i.e. step == 3 => every third element matches => 33% true positive rate
template <typename Filter>
void benchmark_filter(const std::vector<uint64_t>& true_input,
const std::vector<uint64_t>& false_input,
double hit_percentage) {
size_t S = Filter::ARGUMENT_442_MIB;
std::atomic<size_t> lookups{0};
std::atomic<size_t> true_positives{0};
std::atomic<size_t> false_positives{0};
std::atomic<bool> done{false};
Filter filter(S);
if constexpr (std::is_same_v<Filter, XorFacade>) {
filter.insert_many(true_input);
} else {
for (auto x : true_input) {
filter.insert(x);
}
}
std::thread measurement(
tenzir::benchmark::measure_for_n_seconds, 5, &done,
std::map<std::string, std::atomic<size_t>*>{
{"lookups", &lookups},
{"false_positives", &false_positives},
{"true_positives", &true_positives}},
[](const std::map<std::string, size_t>& old_values,
const std::map<std::string, size_t>& new_values) {
if (new_values.empty())
return;
auto lookups_delta = new_values.at("lookups") - old_values.at("lookups");
auto tp_delta
= new_values.at("true_positives") - old_values.at("true_positives");
auto fp_delta
= new_values.at("false_positives") - old_values.at("false_positives");
// We don't include true positives in the error rate calculation, because
// they represent unavoidable work.
double error_rate = 1. * fp_delta / (lookups_delta - tp_delta);
std::cout << "epsilon: " << error_rate << "\n"
<< "false positives: " << fp_delta << "\n"
<< "lookups: " << lookups_delta << "\n";
});
size_t true_proportion = 100. * hit_percentage;
size_t false_proportion = 100 - true_proportion;
size_t false_negatives = 0;
size_t true_idx = 0, false_idx = 0;
while (!done.load(std::memory_order_relaxed)) {
for (int i=0; i<true_proportion; ++i) {
auto x = true_input[true_idx++ % true_input.size()];
if (filter.lookup(x)) {
++true_positives;
} else {
++false_negatives;
}
++lookups;
}
for (int i=0; i<false_proportion; ++i) {
auto x = false_input[false_idx++ % false_input.size()];
if (filter.lookup(x)) {
++false_positives;
}
++lookups;
}
}
if (false_negatives > 0) {
std::cerr << "FALSE NEGATIVES DETECTED: " << false_negatives << "\n";
}
measurement.join();
}
// All three filters have effectively the same interface, except that the
// functions for insertion/lookup have different names. To avoid using macros,
// we define these facade classes and hope the compiler will optimize them away.
using MortonFilter = CompressedCuckoo::Morton3_8;
struct MortonFacade : private MortonFilter {
constexpr static size_t ARGUMENT_442_MIB = 33*10'000'000;
MortonFacade(size_t S) : MortonFilter(S) {
}
using MortonFilter::insert;
bool lookup(uint64_t x) {
return MortonFilter::likely_contains(x);
}
};
using MortonFilter16 = CompressedCuckoo::Morton7_16;
struct Morton716Facade : private MortonFilter16 {
constexpr static size_t ARGUMENT_442_MIB = 20*10'000'000;
Morton716Facade(size_t S) : MortonFilter16(S) {
}
using MortonFilter16::insert;
bool lookup(uint64_t x) {
return MortonFilter16::likely_contains(x);
}
};
using CuckooFilter = cuckoofilter::CuckooFilter<uint64_t, 12>;
struct CuckooFacade : private CuckooFilter {
constexpr static size_t ARGUMENT_442_MIB = 25*10'000'000;
CuckooFacade(size_t S) : CuckooFilter(S){};
void insert(uint64_t x) {
CuckooFilter::Add(x);
}
bool lookup(uint64_t x) {
return CuckooFilter::Contain(x) == cuckoofilter::Ok;
}
};
using CuckooFilter16 = cuckoofilter::CuckooFilter<uint64_t, 16>;
struct CuckooFacade16 : private CuckooFilter16 {
constexpr static size_t ARGUMENT_442_MIB = 25*10'000'000;
CuckooFacade16(size_t S) : CuckooFilter16(S){};
void insert(uint64_t x) {
if (CuckooFilter16::Add(x) != cuckoofilter::Ok)
std::cerr << "ERROR INSERTING\n";
}
bool lookup(uint64_t x) {
return CuckooFilter16::Contain(x) == cuckoofilter::Ok;
}
};
struct XorFacade {
constexpr static size_t ARGUMENT_442_MIB = 21*10'000'000;
XorFacade(size_t S) { xor16_allocate(S, &xf); };
~XorFacade() { xor16_free(&xf); }
// Single-item insert is *really* slow for xor filters
void insert_many(const std::vector<uint64_t>& x) {
xor16_populate(x.data(), x.size(), &xf);
}
bool lookup(uint64_t x) {
return xor16_contain(x, &xf);
}
xor16_t xf;
};
using VacuumFilter_ = VacuumFilter<uint16_t, 16>;
struct VacuumFacade : private VacuumFilter_ {
constexpr static size_t ARGUMENT_442_MIB = 24*10'000'000;
VacuumFacade(size_t S) { VacuumFilter_::init(S, 4, 400); }; // max_item_numbers, slots per bucket, max_kick_steps
using VacuumFilter_::insert;
using VacuumFilter_::lookup;
};
using BloomFilter = vast::bloom_filter<vast::xxhash>;
struct BloomFacade : private BloomFilter {
constexpr static size_t ARGUMENT_442_MIB = 8*46*10'000'000ull;
BloomFacade(size_t S): BloomFilter(S) {}
using BloomFilter::lookup;
void insert(uint64_t x) {
BloomFilter::add(x);
}
};
int main() {
for (int i=0; i<6; ++i) {
int N = 500'000 + i*10'000'000;
std::cout << "# Generating input... " << std::endl;
auto true_input = generate_unique_input(N);
auto false_input = generate_unique_input(4096ull*4096ull);
double hit_percentage = 0.1;
std::cout << "# Measuring Vacuum Filter\n";
benchmark_filter<VacuumFacade>(true_input, false_input, hit_percentage);
std::cout << "# Measuring Cuckoo Filter\n";
benchmark_filter<CuckooFacade>(true_input, false_input, hit_percentage);
std::cout << "# Measuring Morton Filter\n";
benchmark_filter<MortonFacade>(true_input, false_input, hit_percentage);
std::cout << "# Measuring Xor Filter\n";
benchmark_filter<XorFacade>(true_input, false_input, hit_percentage);
std::cout << "# Measuring Bloom Filter\n";
benchmark_filter<BloomFacade>(true_input, false_input, hit_percentage);
std::cout << "# Measuring Cuckoo16 Filter\n";
benchmark_filter<CuckooFacade16>(true_input, false_input, hit_percentage);
std::cout << "# Measuring Morton16 Filter\n";
benchmark_filter<Morton716Facade>(true_input, false_input, hit_percentage);
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment