Created
September 30, 2020 21:43
-
-
Save Mytherin/b5c0dc2ca20ca6754255bb3abfbfedbc to your computer and use it in GitHub Desktop.
Bitmask vs selection vector, int64
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <chrono> | |
#include <iostream> | |
#include <stdlib.h> | |
#include <bitset> | |
#include <vector> | |
// #include "x86/avx.h" | |
// #include "x86/avx512f.h" | |
// #include "x86/avx512h.h" | |
#include <immintrin.h> | |
// https://github.com/simd-everywhere/simde/releases/tag/v0.6.0 | |
// clang++ -std=c++11 test-64.cpp -O3 -mavx512f && ./a.out | |
// clang++ -std=c++11 test-64.cpp -O3 -DDISABLE_AVX512 && ./a.out | |
using namespace std; | |
#define VECTOR_SIZE 1024 | |
// 1024 / 64 | |
#define MASK_COUNT 16 | |
#define DATA_SIZE 512000000 | |
struct nullmask_t { | |
union { | |
uint64_t mask_u64[VECTOR_SIZE / 64]; | |
uint32_t mask_u32[VECTOR_SIZE / 32]; | |
uint16_t mask_u16[VECTOR_SIZE / 16]; | |
uint8_t mask_u8[VECTOR_SIZE / 8]; | |
} mask_union; | |
}; | |
size_t sel_vector_lt(int64_t * a, int64_t constant, size_t count, uint16_t *result_sel) { | |
size_t result_count = 0; | |
for(size_t i = 0; i < count; i++) { | |
result_sel[result_count] = i; | |
result_count += a[i] <= constant; | |
} | |
return result_count; | |
} | |
void sel_vector_addition(const int64_t * a , const int64_t * b , size_t count, const uint16_t *sel, int64_t * result ) { | |
for(size_t i = 0; i < count; i++) { | |
result[i] = a[sel[i]] + b[sel[i]]; | |
} | |
} | |
void sel_vector_summation(const int64_t * a , size_t count, int64_t &sum) { | |
for(size_t i = 0; i < count; i++) { | |
sum += a[i]; | |
} | |
} | |
void selection_vector_benchmark(int64_t *a, int64_t *b, int64_t *c, int32_t constant) { | |
uint16_t sel_vector[VECTOR_SIZE]; | |
int64_t intermediate[VECTOR_SIZE]; | |
int64_t sum = 0; | |
auto start = chrono::system_clock::now(); | |
for(int i = 0; i < DATA_SIZE; i += VECTOR_SIZE) { | |
size_t result_count = sel_vector_lt(c + i, constant, VECTOR_SIZE, sel_vector); | |
sel_vector_addition(a + i, b + i, result_count, sel_vector, intermediate); | |
sel_vector_summation(intermediate, result_count, sum); | |
} | |
auto end = chrono::system_clock::now(); | |
auto elapsed = | |
std::chrono::duration_cast<std::chrono::milliseconds>(end - start); | |
std::cout << "SelVector (Selectivity " << constant << "): " << elapsed.count() << "ms (Result: " << sum << ")\n"; | |
} | |
int64_t *aligned_alloc(int64_t count) { | |
auto ptr = new uint8_t[count * sizeof(int64_t) + 63]; | |
return (int64_t*)(((uintptr_t(ptr) + 63) / 64) * 64); | |
} | |
#ifndef DISABLE_AVX512 | |
void bitmask_lt(const int64_t *a , const __m512i constant , size_t count, nullmask_t &sel) { | |
for(size_t i = 0, target = count / 8; i < target; i++) { | |
__m512i a_mm512 = _mm512_load_epi64(a + i * 8); | |
sel.mask_union.mask_u8[i] = _mm512_cmple_epi64_mask(a_mm512, constant); | |
} | |
} | |
void bitmask_addition(const int64_t *a , const int64_t *b , size_t count, const nullmask_t &sel, int64_t *result ) { | |
for(size_t i = 0, target = count / 8; i < target; i++) { | |
__m512i a_mm512 = _mm512_load_epi64(a + i * 8); | |
__m512i b_mm512 = _mm512_load_epi64(b + i * 8); | |
auto mask = sel.mask_union.mask_u8[i]; | |
auto result_mm512 = _mm512_maskz_add_epi64(mask, a_mm512, b_mm512); | |
_mm512_store_epi64(result + i * 8, result_mm512); | |
} | |
} | |
void bitmask_summation(const int64_t *a , const nullmask_t &sel, size_t count, int64_t &sum) { | |
for(size_t i = 0, target = count / 8; i < target; i++) { | |
__m512i a_mm512 = _mm512_load_epi64(a + i * 8); | |
sum += _mm512_mask_reduce_add_epi64(sel.mask_union.mask_u8[i], a_mm512); | |
} | |
} | |
void bitmask_benchmark(int64_t *a, int64_t *b, int64_t *c, int64_t constant) { | |
nullmask_t selection_mask; | |
int64_t *constant_i32 = aligned_alloc(VECTOR_SIZE); | |
for(int i = 0; i < VECTOR_SIZE; i++) { | |
constant_i32[i] = constant; | |
} | |
__m512i constant_mm512 = _mm512_load_epi32(constant_i32); | |
int64_t *intermediate = aligned_alloc(VECTOR_SIZE); | |
int64_t sum = 0; | |
auto start = chrono::system_clock::now(); | |
for(int i = 0; i < DATA_SIZE; i += VECTOR_SIZE) { | |
bitmask_lt((c + i), constant_mm512, VECTOR_SIZE, selection_mask); | |
bitmask_addition(a + i, b + i, VECTOR_SIZE, selection_mask, intermediate); | |
bitmask_summation(intermediate, selection_mask, VECTOR_SIZE, sum); | |
} | |
auto end = chrono::system_clock::now(); | |
auto elapsed = | |
std::chrono::duration_cast<std::chrono::milliseconds>(end - start); | |
std::cout << "Bitmask (SIMD) (Selectivity " << constant << "): " << elapsed.count() << "ms (Result: " << sum << ")\n"; | |
} | |
#endif | |
void bitmask_nosimd_lt(const int64_t *a , const int64_t constant , size_t count, nullmask_t &sel) { | |
for(size_t i = 0, target = count / 32; i < target; i++) { | |
sel.mask_union.mask_u32[i] = 0; | |
for(size_t j = 0; j < 32; j++) { | |
sel.mask_union.mask_u32[i] |= (a[i * 32 + j] <= constant) << j; | |
} | |
} | |
} | |
size_t bitmask_nosimd_addition(const int64_t *a , const int64_t *b , size_t count, const nullmask_t &sel, int64_t *result) { | |
size_t result_count = 0; | |
for(size_t i = 0, target = count / 32; i < target; i++) { | |
auto mask = sel.mask_union.mask_u32[i]; | |
if (mask == 0) { | |
// no matches | |
continue; | |
} else if (mask == UINT32_C(0xFFFFFFFF)) { | |
// everything matches | |
for(size_t j = 0; j < 32; j++) { | |
auto index = i * 32 + j; | |
result[result_count++] = a[index] + b[index]; | |
} | |
} else { | |
// partial matches | |
for(size_t j = 0; j < 32; j++) { | |
auto index = i * 32 + j; | |
result[result_count] = a[index] + b[index]; | |
result_count += (mask & (1 << j)) >> j; | |
} | |
} | |
} | |
return result_count; | |
} | |
void bitmask_benchmark_nosimd(int64_t *a, int64_t *b, int64_t *c, int32_t constant) { | |
nullmask_t selection_mask; | |
int64_t *intermediate = aligned_alloc(VECTOR_SIZE); | |
int64_t sum = 0; | |
auto start = chrono::system_clock::now(); | |
for(int i = 0; i < DATA_SIZE; i += VECTOR_SIZE) { | |
bitmask_nosimd_lt((c + i), constant, VECTOR_SIZE, selection_mask); | |
size_t result_count = bitmask_nosimd_addition(a + i, b + i, VECTOR_SIZE, selection_mask, intermediate); | |
sel_vector_summation(intermediate, result_count, sum); | |
} | |
auto end = chrono::system_clock::now(); | |
auto elapsed = | |
std::chrono::duration_cast<std::chrono::milliseconds>(end - start); | |
std::cout << "Bitmask (no SIMD) (Selectivity " << constant << "): " << elapsed.count() << "ms (Result: " << sum << ")\n"; | |
} | |
int main() { | |
int64_t *a = aligned_alloc(DATA_SIZE); | |
int64_t *b = aligned_alloc(DATA_SIZE); | |
int64_t *c = aligned_alloc(DATA_SIZE); | |
for(size_t i = 0; i < DATA_SIZE; i++) { | |
a[i] = rand() % 101; | |
b[i] = rand() % 101; | |
c[i] = rand() % 101; | |
} | |
// SELECT SUM(a+b) FROM tbl WHERE c <= CONSTANT | |
// CONSTANT determines selectivity (0-100%) | |
vector<int> selectivities { 100, 99, 95, 75, 50, 25, 5, 1, -1}; | |
for(auto selectivity : selectivities) { | |
bitmask_benchmark_nosimd(a, b, c, selectivity); | |
} | |
#ifndef DISABLE_AVX512 | |
for(auto selectivity : selectivities) { | |
bitmask_benchmark(a, b, c, selectivity); | |
} | |
#endif | |
for(auto selectivity : selectivities) { | |
selection_vector_benchmark(a, b, c, selectivity); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment