Skip to content

Instantly share code, notes, and snippets.

@Mytherin
Last active October 1, 2020 07:43
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Mytherin/0c494ab3c3aee490e7d1be66a228b87c to your computer and use it in GitHub Desktop.
Save Mytherin/0c494ab3c3aee490e7d1be66a228b87c to your computer and use it in GitHub Desktop.
AVX512 Bitmask vs scalar selection vector
#include <chrono>
#include <iostream>
#include <stdlib.h>
#include <bitset>
#include <vector>
// #include "x86/avx.h"
// #include "x86/avx512f.h"
// #include "x86/avx512h.h"
#include <immintrin.h>
/*
INT32
clang++ -std=c++11 test.cpp -O3 -mavx512f && ./a.out
Bitmask (no SIMD) (Selectivity 100): 649ms (Result: 51201370174)
Bitmask (no SIMD) (Selectivity 99): 673ms (Result: 50694880171)
Bitmask (no SIMD) (Selectivity 95): 662ms (Result: 48667251986)
Bitmask (no SIMD) (Selectivity 75): 648ms (Result: 38528036114)
Bitmask (no SIMD) (Selectivity 50): 646ms (Result: 25854797349)
Bitmask (no SIMD) (Selectivity 25): 647ms (Result: 13180588168)
Bitmask (no SIMD) (Selectivity 5): 650ms (Result: 3041189445)
Bitmask (no SIMD) (Selectivity 1): 647ms (Result: 1013846053)
Bitmask (no SIMD) (Selectivity -1): 315ms (Result: 0)
Bitmask (SIMD) (Selectivity 100): 482ms (Result: 51201370174)
Bitmask (SIMD) (Selectivity 99): 482ms (Result: 50694880171)
Bitmask (SIMD) (Selectivity 95): 482ms (Result: 48667251986)
Bitmask (SIMD) (Selectivity 75): 482ms (Result: 38528036114)
Bitmask (SIMD) (Selectivity 50): 481ms (Result: 25854797349)
Bitmask (SIMD) (Selectivity 25): 482ms (Result: 13180588168)
Bitmask (SIMD) (Selectivity 5): 482ms (Result: 3041189445)
Bitmask (SIMD) (Selectivity 1): 482ms (Result: 1013846053)
Bitmask (SIMD) (Selectivity -1): 482ms (Result: 0)
SelVector (Selectivity 100): 625ms (Result: 51201370174)
SelVector (Selectivity 99): 632ms (Result: 50694880171)
SelVector (Selectivity 95): 631ms (Result: 48667251986)
SelVector (Selectivity 75): 623ms (Result: 38528036114)
SelVector (Selectivity 50): 613ms (Result: 25854797349)
SelVector (Selectivity 25): 601ms (Result: 13180588168)
SelVector (Selectivity 5): 549ms (Result: 3041189445)
SelVector (Selectivity 1): 466ms (Result: 1013846053)
SelVector (Selectivity -1): 283ms (Result: 0)
INT64 (See https://gist.github.com/Mytherin/b5c0dc2ca20ca6754255bb3abfbfedbc)
Bitmask (no SIMD) (Selectivity 100): 1088ms (Result: 51201370174)
Bitmask (no SIMD) (Selectivity 99): 1122ms (Result: 50694880171)
Bitmask (no SIMD) (Selectivity 95): 1158ms (Result: 48667251986)
Bitmask (no SIMD) (Selectivity 75): 1138ms (Result: 38528036114)
Bitmask (no SIMD) (Selectivity 50): 1134ms (Result: 25854797349)
Bitmask (no SIMD) (Selectivity 25): 1137ms (Result: 13180588168)
Bitmask (no SIMD) (Selectivity 5): 1125ms (Result: 3041189445)
Bitmask (no SIMD) (Selectivity 1): 1081ms (Result: 1013846053)
Bitmask (no SIMD) (Selectivity -1): 483ms (Result: 0)
Bitmask (SIMD) (Selectivity 100): 925ms (Result: 51201370174)
Bitmask (SIMD) (Selectivity 99): 926ms (Result: 50694880171)
Bitmask (SIMD) (Selectivity 95): 927ms (Result: 48667251986)
Bitmask (SIMD) (Selectivity 75): 928ms (Result: 38528036114)
Bitmask (SIMD) (Selectivity 50): 928ms (Result: 25854797349)
Bitmask (SIMD) (Selectivity 25): 928ms (Result: 13180588168)
Bitmask (SIMD) (Selectivity 5): 928ms (Result: 3041189445)
Bitmask (SIMD) (Selectivity 1): 928ms (Result: 1013846053)
Bitmask (SIMD) (Selectivity -1): 927ms (Result: 0)
SelVector (Selectivity 100): 925ms (Result: 51201370174)
SelVector (Selectivity 99): 925ms (Result: 50694880171)
SelVector (Selectivity 95): 928ms (Result: 48667251986)
SelVector (Selectivity 75): 922ms (Result: 38528036114)
SelVector (Selectivity 50): 914ms (Result: 25854797349)
SelVector (Selectivity 25): 895ms (Result: 13180588168)
SelVector (Selectivity 5): 733ms (Result: 3041189445)
SelVector (Selectivity 1): 545ms (Result: 1013846053)
SelVector (Selectivity -1): 327ms (Result: 0)
INT32
clang++ -std=c++11 test.cpp -O3 -DDISABLE_AVX512 && ./a.out
Bitmask (no SIMD) (Selectivity 100): 631ms (Result: 51201370174)
Bitmask (no SIMD) (Selectivity 99): 650ms (Result: 50694880171)
Bitmask (no SIMD) (Selectivity 95): 636ms (Result: 48667251986)
Bitmask (no SIMD) (Selectivity 75): 603ms (Result: 38528036114)
Bitmask (no SIMD) (Selectivity 50): 579ms (Result: 25854797349)
Bitmask (no SIMD) (Selectivity 25): 558ms (Result: 13180588168)
Bitmask (no SIMD) (Selectivity 5): 547ms (Result: 3041189445)
Bitmask (no SIMD) (Selectivity 1): 542ms (Result: 1013846053)
Bitmask (no SIMD) (Selectivity -1): 243ms (Result: 0)
SelVector (Selectivity 100): 678ms (Result: 51201370174)
SelVector (Selectivity 99): 679ms (Result: 50694880171)
SelVector (Selectivity 95): 672ms (Result: 48667251986)
SelVector (Selectivity 75): 642ms (Result: 38528036114)
SelVector (Selectivity 50): 609ms (Result: 25854797349)
SelVector (Selectivity 25): 580ms (Result: 13180588168)
SelVector (Selectivity 5): 516ms (Result: 3041189445)
SelVector (Selectivity 1): 414ms (Result: 1013846053)
SelVector (Selectivity -1): 258ms (Result: 0)
INT64
clang++ -std=c++11 test-64.cpp -O3 -DDISABLE_AVX512 && ./a.out
Bitmask (no SIMD) (Selectivity 100): 935ms (Result: 51201370174)
Bitmask (no SIMD) (Selectivity 99): 977ms (Result: 50694880171)
Bitmask (no SIMD) (Selectivity 95): 999ms (Result: 48667251986)
Bitmask (no SIMD) (Selectivity 75): 989ms (Result: 38528036114)
Bitmask (no SIMD) (Selectivity 50): 979ms (Result: 25854797349)
Bitmask (no SIMD) (Selectivity 25): 974ms (Result: 13180588168)
Bitmask (no SIMD) (Selectivity 5): 964ms (Result: 3041189445)
Bitmask (no SIMD) (Selectivity 1): 923ms (Result: 1013846053)
Bitmask (no SIMD) (Selectivity -1): 359ms (Result: 0)
SelVector (Selectivity 100): 968ms (Result: 51201370174)
SelVector (Selectivity 99): 974ms (Result: 50694880171)
SelVector (Selectivity 95): 973ms (Result: 48667251986)
SelVector (Selectivity 75): 960ms (Result: 38528036114)
SelVector (Selectivity 50): 940ms (Result: 25854797349)
SelVector (Selectivity 25): 912ms (Result: 13180588168)
SelVector (Selectivity 5): 747ms (Result: 3041189445)
SelVector (Selectivity 1): 603ms (Result: 1013846053)
SelVector (Selectivity -1): 340ms (Result: 0)
*/
// https://github.com/simd-everywhere/simde/releases/tag/v0.6.0
// clang++ -std=c++11 test.cpp -O3 -mavx512f && ./a.out
// clang++ -std=c++11 test.cpp -O3 -DDISABLE_AVX512 && ./a.out
using namespace std;
#define VECTOR_SIZE 1024
// 1024 / 64
#define MASK_COUNT 16
#define DATA_SIZE 512000000
struct nullmask_t {
union {
uint64_t mask_u64[VECTOR_SIZE / 64];
uint32_t mask_u32[VECTOR_SIZE / 32];
uint16_t mask_u16[VECTOR_SIZE / 16];
uint8_t mask_u8[VECTOR_SIZE / 8];
} mask_union;
};
size_t sel_vector_lt(int32_t * a, int32_t constant, size_t count, uint16_t *result_sel) {
size_t result_count = 0;
for(size_t i = 0; i < count; i++) {
result_sel[result_count] = i;
result_count += a[i] <= constant;
}
return result_count;
}
void sel_vector_addition(const int32_t * a , const int32_t * b , size_t count, const uint16_t *sel, int32_t * result ) {
for(size_t i = 0; i < count; i++) {
result[i] = a[sel[i]] + b[sel[i]];
}
}
void sel_vector_summation(const int32_t * a , size_t count, int64_t &sum) {
for(size_t i = 0; i < count; i++) {
sum += a[i];
}
}
void selection_vector_benchmark(int32_t *a, int32_t *b, int32_t *c, int32_t constant) {
uint16_t sel_vector[VECTOR_SIZE];
int32_t intermediate[VECTOR_SIZE];
int64_t sum = 0;
auto start = chrono::system_clock::now();
for(int i = 0; i < DATA_SIZE; i += VECTOR_SIZE) {
size_t result_count = sel_vector_lt(c + i, constant, VECTOR_SIZE, sel_vector);
sel_vector_addition(a + i, b + i, result_count, sel_vector, intermediate);
sel_vector_summation(intermediate, result_count, sum);
}
auto end = chrono::system_clock::now();
auto elapsed =
std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
std::cout << "SelVector (Selectivity " << constant << "): " << elapsed.count() << "ms (Result: " << sum << ")\n";
}
int32_t *aligned_alloc(int32_t count) {
auto ptr = new uint8_t[count * sizeof(int32_t) + 63];
return (int32_t*)(((uintptr_t(ptr) + 63) / 64) * 64);
}
#ifndef DISABLE_AVX512
void bitmask_lt(const int32_t *a , const __m512i constant , size_t count, nullmask_t &sel) {
for(size_t i = 0, target = count / 16; i < target; i++) {
__m512i a_mm512 = _mm512_load_epi32(a + i * 16);
sel.mask_union.mask_u16[i] = _mm512_cmple_epi32_mask(a_mm512, constant);
}
}
void bitmask_addition(const int32_t *a , const int32_t *b , size_t count, const nullmask_t &sel, int32_t *result ) {
for(size_t i = 0, target = count / 16; i < target; i++) {
__m512i a_mm512 = _mm512_load_epi32(a + i * 16);
__m512i b_mm512 = _mm512_load_epi32(b + i * 16);
auto mask = sel.mask_union.mask_u16[i];
auto result_mm512 = _mm512_maskz_add_epi32(mask, a_mm512, b_mm512);
_mm512_store_epi32(result + i * 16, result_mm512);
}
}
void bitmask_summation(const int32_t *a , const nullmask_t &sel, size_t count, int64_t &sum) {
for(size_t i = 0, target = count / 16; i < target; i++) {
__m512i a_mm512 = _mm512_load_epi32(a + i * 16);
sum += _mm512_mask_reduce_add_epi32(sel.mask_union.mask_u16[i], a_mm512);
}
}
void bitmask_benchmark(int32_t *a, int32_t *b, int32_t *c, int32_t constant) {
nullmask_t selection_mask;
int32_t *constant_i32 = aligned_alloc(VECTOR_SIZE);
for(int i = 0; i < VECTOR_SIZE; i++) {
constant_i32[i] = constant;
}
__m512i constant_mm512 = _mm512_load_epi32(constant_i32);
int32_t *intermediate = aligned_alloc(VECTOR_SIZE);
int64_t sum = 0;
auto start = chrono::system_clock::now();
for(int i = 0; i < DATA_SIZE; i += VECTOR_SIZE) {
bitmask_lt((c + i), constant_mm512, VECTOR_SIZE, selection_mask);
bitmask_addition(a + i, b + i, VECTOR_SIZE, selection_mask, intermediate);
bitmask_summation(intermediate, selection_mask, VECTOR_SIZE, sum);
}
auto end = chrono::system_clock::now();
auto elapsed =
std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
std::cout << "Bitmask (SIMD) (Selectivity " << constant << "): " << elapsed.count() << "ms (Result: " << sum << ")\n";
}
#endif
void bitmask_nosimd_lt(const int32_t *a , const int32_t constant , size_t count, nullmask_t &sel) {
for(size_t i = 0, target = count / 32; i < target; i++) {
sel.mask_union.mask_u32[i] = 0;
for(size_t j = 0; j < 32; j++) {
sel.mask_union.mask_u32[i] |= (a[i * 32 + j] <= constant) << j;
}
}
}
size_t bitmask_nosimd_addition(const int32_t *a , const int32_t *b , size_t count, const nullmask_t &sel, int32_t *result) {
size_t result_count = 0;
for(size_t i = 0, target = count / 32; i < target; i++) {
auto mask = sel.mask_union.mask_u32[i];
if (mask == 0) {
// no matches
continue;
} else if (mask == UINT32_C(0xFFFFFFFF)) {
// everything matches
for(size_t j = 0; j < 32; j++) {
auto index = i * 32 + j;
result[result_count++] = a[index] + b[index];
}
} else {
// partial matches
for(size_t j = 0; j < 32; j++) {
auto index = i * 32 + j;
result[result_count] = a[index] + b[index];
result_count += bool(mask & (1 << j));
}
}
}
return result_count;
}
void bitmask_benchmark_nosimd(int32_t *a, int32_t *b, int32_t *c, int32_t constant) {
nullmask_t selection_mask;
int32_t *intermediate = aligned_alloc(VECTOR_SIZE);
int64_t sum = 0;
auto start = chrono::system_clock::now();
for(int i = 0; i < DATA_SIZE; i += VECTOR_SIZE) {
bitmask_nosimd_lt((c + i), constant, VECTOR_SIZE, selection_mask);
size_t result_count = bitmask_nosimd_addition(a + i, b + i, VECTOR_SIZE, selection_mask, intermediate);
sel_vector_summation(intermediate, result_count, sum);
}
auto end = chrono::system_clock::now();
auto elapsed =
std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
std::cout << "Bitmask (no SIMD) (Selectivity " << constant << "): " << elapsed.count() << "ms (Result: " << sum << ")\n";
}
int main() {
int32_t *a = aligned_alloc(DATA_SIZE);
int32_t *b = aligned_alloc(DATA_SIZE);
int32_t *c = aligned_alloc(DATA_SIZE);
for(size_t i = 0; i < DATA_SIZE; i++) {
a[i] = rand() % 101;
b[i] = rand() % 101;
c[i] = rand() % 101;
}
// SELECT SUM(a+b) FROM tbl WHERE c <= CONSTANT
// CONSTANT determines selectivity (0-100%)
vector<int> selectivities { 100, 99, 95, 75, 50, 25, 5, 1, -1};
for(auto selectivity : selectivities) {
bitmask_benchmark_nosimd(a, b, c, selectivity);
}
#ifndef DISABLE_AVX512
for(auto selectivity : selectivities) {
bitmask_benchmark(a, b, c, selectivity);
}
#endif
for(auto selectivity : selectivities) {
selection_vector_benchmark(a, b, c, selectivity);
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment