Skip to content

Instantly share code, notes, and snippets.

@Mytherin
Created October 1, 2020 11:14
Show Gist options
  • Save Mytherin/5c1b18ae62d7694f7c8be847dba3f6ef to your computer and use it in GitHub Desktop.
Save Mytherin/5c1b18ae62d7694f7c8be847dba3f6ef to your computer and use it in GitHub Desktop.
#include <chrono>
#include <iostream>
#include <stdlib.h>
#include <bitset>
#include <vector>
// #include "x86/avx.h"
// #include "x86/avx512f.h"
// #include "x86/avx512h.h"
#include <immintrin.h>
// clang++ -std=c++11 test-nulls.cpp -O3 -mavx512f && ./a.out
// clang++ -std=c++11 test-nulls.cpp -O3 -DDISABLE_AVX512 && ./a.out
/*
<< Null Percentage 1% >>
Selectivity 100%
SelVector 193ms
Bitmask (no SIMD) 117ms
Bitmask (SIMD) (Selectivity 100): 112ms
Selectivity 99%
SelVector 197ms
Bitmask (no SIMD) 116ms
Bitmask (SIMD) (Selectivity 99): 112ms
Selectivity 95%
SelVector 220ms
Bitmask (no SIMD) 116ms
Bitmask (SIMD) (Selectivity 95): 112ms
Selectivity 75%
SelVector 305ms
Bitmask (no SIMD) 116ms
Bitmask (SIMD) (Selectivity 75): 112ms
Selectivity 50%
SelVector 385ms
Bitmask (no SIMD) 116ms
Bitmask (SIMD) (Selectivity 50): 112ms
Selectivity 25%
SelVector 282ms
Bitmask (no SIMD) 116ms
Bitmask (SIMD) (Selectivity 25): 112ms
Selectivity 5%
SelVector 169ms
Bitmask (no SIMD) 116ms
Bitmask (SIMD) (Selectivity 5): 112ms
Selectivity 1%
SelVector 125ms
Bitmask (no SIMD) 113ms
Bitmask (SIMD) (Selectivity 1): 112ms
Selectivity -1%
SelVector 67ms
Bitmask (no SIMD) 98ms
Bitmask (SIMD) (Selectivity -1): 112ms
<< Null Percentage 10% >>
Selectivity 100%
SelVector 186ms
Bitmask (no SIMD) 116ms
Bitmask (SIMD) (Selectivity 100): 112ms
Selectivity 99%
SelVector 196ms
Bitmask (no SIMD) 116ms
Bitmask (SIMD) (Selectivity 99): 112ms
Selectivity 95%
SelVector 225ms
Bitmask (no SIMD) 116ms
Bitmask (SIMD) (Selectivity 95): 112ms
Selectivity 75%
SelVector 348ms
Bitmask (no SIMD) 117ms
Bitmask (SIMD) (Selectivity 75): 112ms
Selectivity 50%
SelVector 469ms
Bitmask (no SIMD) 116ms
Bitmask (SIMD) (Selectivity 50): 112ms
Selectivity 25%
SelVector 326ms
Bitmask (no SIMD) 116ms
Bitmask (SIMD) (Selectivity 25): 112ms
Selectivity 5%
SelVector 180ms
Bitmask (no SIMD) 116ms
Bitmask (SIMD) (Selectivity 5): 112ms
Selectivity 1%
SelVector 128ms
Bitmask (no SIMD) 112ms
Bitmask (SIMD) (Selectivity 1): 112ms
Selectivity -1%
SelVector 65ms
Bitmask (no SIMD) 98ms
Bitmask (SIMD) (Selectivity -1): 112ms
<< Null Percentage 25% >>
Selectivity 100%
SelVector 178ms
Bitmask (no SIMD) 116ms
Bitmask (SIMD) (Selectivity 100): 112ms
Selectivity 99%
SelVector 189ms
Bitmask (no SIMD) 116ms
Bitmask (SIMD) (Selectivity 99): 112ms
Selectivity 95%
SelVector 218ms
Bitmask (no SIMD) 116ms
Bitmask (SIMD) (Selectivity 95): 112ms
Selectivity 75%
SelVector 341ms
Bitmask (no SIMD) 117ms
Bitmask (SIMD) (Selectivity 75): 112ms
Selectivity 50%
SelVector 465ms
Bitmask (no SIMD) 116ms
Bitmask (SIMD) (Selectivity 50): 112ms
Selectivity 25%
SelVector 324ms
Bitmask (no SIMD) 116ms
Bitmask (SIMD) (Selectivity 25): 112ms
Selectivity 5%
SelVector 176ms
Bitmask (no SIMD) 115ms
Bitmask (SIMD) (Selectivity 5): 112ms
Selectivity 1%
SelVector 124ms
Bitmask (no SIMD) 108ms
Bitmask (SIMD) (Selectivity 1): 112ms
Selectivity -1%
SelVector 67ms
Bitmask (no SIMD) 98ms
Bitmask (SIMD) (Selectivity -1): 112ms
<< Null Percentage 50% >>
Selectivity 100%
SelVector 165ms
Bitmask (no SIMD) 116ms
Bitmask (SIMD) (Selectivity 100): 112ms
Selectivity 99%
SelVector 176ms
Bitmask (no SIMD) 116ms
Bitmask (SIMD) (Selectivity 99): 112ms
Selectivity 95%
SelVector 206ms
Bitmask (no SIMD) 116ms
Bitmask (SIMD) (Selectivity 95): 112ms
Selectivity 75%
SelVector 331ms
Bitmask (no SIMD) 116ms
Bitmask (SIMD) (Selectivity 75): 112ms
Selectivity 50%
SelVector 458ms
Bitmask (no SIMD) 116ms
Bitmask (SIMD) (Selectivity 50): 112ms
Selectivity 25%
SelVector 320ms
Bitmask (no SIMD) 116ms
Bitmask (SIMD) (Selectivity 25): 112ms
Selectivity 5%
SelVector 166ms
Bitmask (no SIMD) 108ms
Bitmask (SIMD) (Selectivity 5): 112ms
Selectivity 1%
SelVector 118ms
Bitmask (no SIMD) 103ms
Bitmask (SIMD) (Selectivity 1): 112ms
Selectivity -1%
SelVector 66ms
Bitmask (no SIMD) 98ms
Bitmask (SIMD) (Selectivity -1): 112ms
<< Null Percentage 75% >>
Selectivity 100%
SelVector 151ms
Bitmask (no SIMD) 112ms
Bitmask (SIMD) (Selectivity 100): 112ms
Selectivity 99%
SelVector 161ms
Bitmask (no SIMD) 112ms
Bitmask (SIMD) (Selectivity 99): 112ms
Selectivity 95%
SelVector 194ms
Bitmask (no SIMD) 113ms
Bitmask (SIMD) (Selectivity 95): 112ms
Selectivity 75%
SelVector 320ms
Bitmask (no SIMD) 111ms
Bitmask (SIMD) (Selectivity 75): 112ms
Selectivity 50%
SelVector 450ms
Bitmask (no SIMD) 108ms
Bitmask (SIMD) (Selectivity 50): 112ms
Selectivity 25%
SelVector 313ms
Bitmask (no SIMD) 104ms
Bitmask (SIMD) (Selectivity 25): 112ms
Selectivity 5%
SelVector 155ms
Bitmask (no SIMD) 100ms
Bitmask (SIMD) (Selectivity 5): 112ms
Selectivity 1%
SelVector 109ms
Bitmask (no SIMD) 99ms
Bitmask (SIMD) (Selectivity 1): 112ms
Selectivity -1%
SelVector 65ms
Bitmask (no SIMD) 98ms
Bitmask (SIMD) (Selectivity -1): 112ms
<< Null Percentage 99% >>
Selectivity 100%
SelVector 81ms
Bitmask (no SIMD) 98ms
Bitmask (SIMD) (Selectivity 100): 112ms
Selectivity 99%
SelVector 86ms
Bitmask (no SIMD) 98ms
Bitmask (SIMD) (Selectivity 99): 112ms
Selectivity 95%
SelVector 99ms
Bitmask (no SIMD) 98ms
Bitmask (SIMD) (Selectivity 95): 112ms
Selectivity 75%
SelVector 153ms
Bitmask (no SIMD) 98ms
Bitmask (SIMD) (Selectivity 75): 112ms
Selectivity 50%
SelVector 208ms
Bitmask (no SIMD) 98ms
Bitmask (SIMD) (Selectivity 50): 112ms
Selectivity 25%
SelVector 145ms
Bitmask (no SIMD) 98ms
Bitmask (SIMD) (Selectivity 25): 112ms
Selectivity 5%
SelVector 81ms
Bitmask (no SIMD) 98ms
Bitmask (SIMD) (Selectivity 5): 112ms
Selectivity 1%
SelVector 66ms
Bitmask (no SIMD) 98ms
Bitmask (SIMD) (Selectivity 1): 112ms
Selectivity -1%
SelVector 52ms
Bitmask (no SIMD) 98ms
Bitmask (SIMD) (Selectivity -1): 112ms
*/
using namespace std;
#define VECTOR_SIZE 1024
#define DATA_SIZE 102400000
#define VECTOR_COUNT (DATA_SIZE / VECTOR_SIZE)
int32_t *aligned_alloc(int32_t count) {
auto ptr = new uint8_t[count * sizeof(int32_t) + 63];
return (int32_t*)(((uintptr_t(ptr) + 63) / 64) * 64);
}
struct nullmask_t {
union {
uint64_t mask_u64[VECTOR_SIZE / 64];
uint32_t mask_u32[VECTOR_SIZE / 32];
uint16_t mask_u16[VECTOR_SIZE / 16];
uint8_t mask_u8[VECTOR_SIZE / 8];
} mask_union;
void set_index(size_t i, bool is_null) {
uint64_t mask_index = i / 64;
uint64_t index_in_mask = i - mask_index * 64;
if (is_null) {
mask_union.mask_u64[mask_index] |= uint64_t(1) << index_in_mask;
} else {
mask_union.mask_u64[mask_index] &= ~(uint64_t(1) << index_in_mask);
}
}
bool get_index(size_t i) {
uint64_t mask_index = i / 64;
uint64_t index_in_mask = i - mask_index * 64;
return mask_union.mask_u64[mask_index] & (uint64_t(1) << index_in_mask);
}
static void or_nullmask(const nullmask_t &a, const nullmask_t &b, nullmask_t &result) {
for(size_t i = 0; i < VECTOR_SIZE / 64; i++) {
result.mask_union.mask_u64[i] = a.mask_union.mask_u64[i] | b.mask_union.mask_u64[i];
}
}
static void or_nullmask_inplace(const nullmask_t &a, nullmask_t &result) {
for(size_t i = 0; i < VECTOR_SIZE / 64; i++) {
result.mask_union.mask_u64[i] |= a.mask_union.mask_u64[i];
}
}
static void and_nullmask(const nullmask_t &a, const nullmask_t &b, nullmask_t &result) {
for(size_t i = 0; i < VECTOR_SIZE / 64; i++) {
result.mask_union.mask_u64[i] = a.mask_union.mask_u64[i] & b.mask_union.mask_u64[i];
}
}
static void and_nullmask_inplace(const nullmask_t &a, nullmask_t &result) {
for(size_t i = 0; i < VECTOR_SIZE / 64; i++) {
result.mask_union.mask_u64[i] &= a.mask_union.mask_u64[i];
}
}
void reset() {
for(size_t i = 0; i < VECTOR_SIZE / 64; i++) {
mask_union.mask_u64[i] = 0;
}
}
void set() {
for(size_t i = 0; i < VECTOR_SIZE / 64; i++) {
mask_union.mask_u64[i] = ~0;
}
}
void assign(const nullmask_t &other) {
for(size_t i = 0; i < VECTOR_SIZE / 64; i++) {
mask_union.mask_u64[i] = other.mask_union.mask_u64[i];
}
}
};
struct Vector {
Vector(){}
Vector(int32_t *data, nullmask_t *mask) : data(data), mask(mask) {}
int32_t *data;
nullmask_t *mask;
};
size_t sel_vector_lt(Vector &a, int32_t constant, size_t count, uint16_t *result_sel) {
size_t result_count = 0;
for(uint64_t i = 0; i < count / 64; i++) {
auto mask = a.mask->mask_union.mask_u64[i];
if (mask == 0) {
// nothing valid: skip
continue;
} else if (mask == ~uint64_t(0)) {
// everything valid: no need to check mask
for(size_t j = 0; j < 64; j++) {
size_t index = i * 64 + j;
result_sel[result_count] = index;
result_count += a.data[index] <= constant;
}
} else {
// partially valid: check mask
for(uint64_t j = 0; j < 64; j++) {
uint64_t index = i * 64 + j;
result_sel[result_count] = index;
result_count += a.data[index] <= constant && (mask & (uint64_t(1) << j));
}
}
}
return result_count;
}
void sel_vector_addition(const Vector &a , const Vector &b , size_t count, const uint16_t *sel, Vector &result ) {
nullmask_t::and_nullmask(*a.mask, *b.mask, *result.mask);
for(size_t i = 0; i < count; i++) {
result.data[sel[i]] = a.data[sel[i]] + b.data[sel[i]];
}
}
void sel_vector_summation(const Vector &a , size_t count, const uint16_t *sel, int64_t &sum) {
for(size_t i = 0; i < count; i++) {
size_t index = sel[i];
sum += a.data[index] * a.mask->get_index(index);
}
}
int64_t selection_vector_benchmark(Vector *a, Vector *b, Vector *c, int32_t constant) {
uint16_t sel_vector[VECTOR_SIZE];
int32_t *intermediate_data = aligned_alloc(VECTOR_SIZE);
nullmask_t intermediate_mask;
Vector intermediate(intermediate_data, &intermediate_mask);
int64_t sum = 0;
auto start = chrono::system_clock::now();
for(int i = 0; i < VECTOR_COUNT; i++) {
size_t result_count = sel_vector_lt(c[i], constant, VECTOR_SIZE, sel_vector);
sel_vector_addition(a[i], b[i], result_count, sel_vector, intermediate);
sel_vector_summation(intermediate, result_count, sel_vector, sum);
}
auto end = chrono::system_clock::now();
auto elapsed =
std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
std::cout << "SelVector " << elapsed.count() << "ms\n";
return sum;
}
#ifndef DISABLE_AVX512
void bitmask_lt(const Vector &a , const __m512i constant , size_t count, nullmask_t &sel) {
for(size_t i = 0, target = count / 16; i < target; i++) {
__m512i a_mm512 = _mm512_load_epi32(a.data + i * 16);
sel.mask_union.mask_u16[i] = _mm512_cmple_epi32_mask(a_mm512, constant);
}
nullmask_t::and_nullmask_inplace(*a.mask, sel);
}
void bitmask_addition(const Vector &a , const Vector &b , size_t count, const nullmask_t &sel, Vector &result) {
nullmask_t addition_mask;
nullmask_t::and_nullmask(*a.mask, *b.mask, *result.mask);
nullmask_t::and_nullmask(*result.mask, sel, addition_mask);
for(size_t i = 0, target = count / 16; i < target; i++) {
__m512i a_mm512 = _mm512_load_epi32(a.data + i * 16);
__m512i b_mm512 = _mm512_load_epi32(b.data + i * 16);
auto mask = addition_mask.mask_union.mask_u16[i];
auto result_mm512 = _mm512_maskz_add_epi32(mask, a_mm512, b_mm512);
_mm512_store_epi32(result.data + i * 16, result_mm512);
}
}
void bitmask_summation(const Vector &a , const nullmask_t &sel, size_t count, int64_t &sum) {
nullmask_t sum_mask;
nullmask_t::and_nullmask(*a.mask, sel, sum_mask);
for(size_t i = 0, target = count / 16; i < target; i++) {
__m512i a_mm512 = _mm512_load_epi32(a.data + i * 16);
sum += _mm512_mask_reduce_add_epi32(sum_mask.mask_union.mask_u16[i], a_mm512);
}
}
int64_t bitmask_benchmark(Vector *a, Vector *b, Vector *c, int32_t constant) {
nullmask_t selection_mask;
int32_t *constant_i32 = aligned_alloc(VECTOR_SIZE);
for(int i = 0; i < VECTOR_SIZE; i++) {
constant_i32[i] = constant;
}
__m512i constant_mm512 = _mm512_load_epi32(constant_i32);
int32_t *intermediate_data = aligned_alloc(VECTOR_SIZE);
nullmask_t intermediate_mask;
Vector intermediate(intermediate_data, &intermediate_mask);
int64_t sum = 0;
auto start = chrono::system_clock::now();
for(int i = 0; i < VECTOR_COUNT; i++) {
bitmask_lt(c[i], constant_mm512, VECTOR_SIZE, selection_mask);
bitmask_addition(a[i], b[i], VECTOR_SIZE, selection_mask, intermediate);
bitmask_summation(intermediate, selection_mask, VECTOR_SIZE, sum);
}
auto end = chrono::system_clock::now();
auto elapsed =
std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
std::cout << "Bitmask (SIMD) (Selectivity " << constant << "): " << elapsed.count() << "ms\n";
return sum;
}
#endif
void bitmask_nosimd_lt(const Vector &a , const int32_t constant, nullmask_t &sel) {
for(size_t i = 0, target = VECTOR_SIZE / 64; i < target; i++) {
auto mask = a.mask->mask_union.mask_u64[i];
for(size_t j = 0; j < 64; j++) {
size_t index = i * 64 + j;
mask &= ~(uint64_t(!(a.data[index] <= constant)) << j);
}
sel.mask_union.mask_u64[i] = mask;
}
}
void bitmask_nosimd_addition(const Vector &a , const Vector &b, const nullmask_t &sel, Vector &result) {
nullmask_t::and_nullmask(*a.mask, *b.mask, *result.mask);
for(size_t i = 0; i < VECTOR_SIZE; i++) {
result.data[i] = a.data[i] + b.data[i];
}
}
void bitmask_nosimd_sum(const Vector &a, const nullmask_t &sel, int64_t &result) {
nullmask_t filter_mask;
nullmask_t::and_nullmask(sel, *a.mask, filter_mask);
for(size_t i = 0; i < VECTOR_SIZE / 64; i++) {
auto mask = filter_mask.mask_union.mask_u64[i];
if (mask == 0) {
// nothing valid: skip
continue;
} else if (mask == ~uint64_t(0)) {
// everything valid: no need to check mask
for(size_t j = 0; j < 64; j++) {
size_t index = i * 64 + j;
result += a.data[index];
}
} else {
// partially valid: check mask
for(size_t j = 0; j < 64; j++) {
size_t index = i * 64 + j;
result += a.data[index] * ((mask & (uint64_t(1) << j)) >> j);
}
}
}
}
int64_t bitmask_benchmark_nosimd(Vector *a, Vector *b, Vector *c, int32_t constant) {
nullmask_t selection_mask;
int32_t *intermediate_data = aligned_alloc(VECTOR_SIZE);
nullmask_t intermediate_mask;
Vector intermediate(intermediate_data, &intermediate_mask);
int64_t sum = 0;
auto start = chrono::system_clock::now();
for(int i = 0; i < VECTOR_COUNT; i++) {
bitmask_nosimd_lt(c[i], constant, selection_mask);
bitmask_nosimd_addition(a[i], b[i], selection_mask, intermediate);
bitmask_nosimd_sum(intermediate, selection_mask, sum);
}
auto end = chrono::system_clock::now();
auto elapsed =
std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
std::cout << "Bitmask (no SIMD) " << elapsed.count() << "ms\n";
return sum;
}
int64_t hardcoded(Vector *a, Vector *b, Vector *c, int32_t constant) {
int64_t sum = 0;
auto start = chrono::system_clock::now();
for(int i = 0; i < VECTOR_COUNT; i++) {
nullmask_t result_mask;
nullmask_t::and_nullmask(*a[i].mask, *b[i].mask, result_mask);
nullmask_t::and_nullmask_inplace(*c[i].mask, result_mask);
for(size_t j = 0; j < VECTOR_SIZE; j++) {
if (result_mask.get_index(j)) {
if (c[i].data[j] <= constant) {
sum += a[i].data[j] + b[i].data[j];
}
}
}
}
auto end = chrono::system_clock::now();
auto elapsed =
std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
std::cout << "Hardcoded " << elapsed.count() << "ms\n";
return sum;
}
int main() {
int32_t *a_data = aligned_alloc(DATA_SIZE);
int32_t *b_data = aligned_alloc(DATA_SIZE);
int32_t *c_data = aligned_alloc(DATA_SIZE);
nullmask_t *a_nullmask = new nullmask_t[VECTOR_COUNT];
nullmask_t *b_nullmask = new nullmask_t[VECTOR_COUNT];
nullmask_t *c_nullmask = new nullmask_t[VECTOR_COUNT];
Vector *a = new Vector[VECTOR_COUNT];
Vector *b = new Vector[VECTOR_COUNT];
Vector *c = new Vector[VECTOR_COUNT];
for(size_t i = 0; i < DATA_SIZE; i++) {
a_data[i] = rand() % 101;
b_data[i] = rand() % 101;
c_data[i] = rand() % 101;
}
for(size_t i = 0; i < VECTOR_COUNT; i++) {
a[i].data = a_data + i * VECTOR_SIZE;
b[i].data = b_data + i * VECTOR_SIZE;
c[i].data = c_data + i * VECTOR_SIZE;
a[i].mask = a_nullmask + i;
b[i].mask = b_nullmask + i;
c[i].mask = c_nullmask + i;
a[i].mask->set();
b[i].mask->set();
c[i].mask->set();
}
// SELECT SUM(a+b) FROM tbl WHERE c <= CONSTANT
// CONSTANT determines selectivity (0-100%)
vector<int> null_percentages {1, 10, 25, 50, 75, 99};
vector<int> selectivities { 100, 99, 95, 75, 50, 25, 5, 1, -1};
// vector<int> null_percentages {10};
// vector<int> selectivities { 95};
vector<string> names { "Hardcoded", "Selection Vector", "Bitmask (No SIMD)", "Bitmask"};
for(auto null_percentage : null_percentages) {
for(size_t i = 0; i < VECTOR_COUNT; i++) {
for(size_t j = 0; j < VECTOR_SIZE; j++) {
a[i].mask->set_index(j, (rand() % 101) > null_percentage);
b[i].mask->set_index(j, (rand() % 101) > null_percentage);
c[i].mask->set_index(j, (rand() % 101) > null_percentage);
}
}
std::cout << std::endl << "<< Null Percentage " << null_percentage << "% >>" << std::endl;
for(auto selectivity : selectivities) {
std::cout << std::endl << "Selectivity " << selectivity << "%" << std::endl;
vector<int64_t> results;
int64_t hardcoded_res = hardcoded(a, b, c, selectivity);
int64_t selvector_res = selection_vector_benchmark(a, b, c, selectivity);
int64_t bitmasknosimd_res = bitmask_benchmark_nosimd(a, b, c, selectivity);
results.push_back(hardcoded_res);
results.push_back(selvector_res);
results.push_back(bitmasknosimd_res);
#ifndef DISABLE_AVX512
int64_t bitmask_res = bitmask_benchmark(a, b, c, selectivity);
results.push_back(bitmask_res);
#endif
for(size_t j = 0; j < results.size(); j++) {
for(size_t k = 0; k < results.size(); k++) {
if (results[j] != results[k]) {
std::cout << "Discrepency in results between " << names[j] << " and " << names[k] << std::endl;
for(size_t l = 0; l < results.size(); l++) {
std::cout << names[l] << ": " << results[l] << std::endl;
}
exit(1);
}
}
}
}
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment