hash32_find_14 - simple loop vs simd (avx2, avx2 branchless) - google benchmark
/**
 * For reference, see this Stack Overflow thread:
 * https://stackoverflow.com/questions/62120797/find-element-index-with-avx2-code-optimization
 **/
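/**
 * Build sketch (an assumption, not part of the original gist): with Google
 * Benchmark installed on Linux and GCC or Clang, something like the line
 * below should work; -march=native enables the AVX2/BMI intrinsics used
 * here, and the file name is hypothetical.
 *
 *     g++ -O3 -march=native hash32_find_14_bench.cpp -lbenchmark -lpthread -o hash32_find_14_bench
 **/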
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
#include <sched.h>
#include <immintrin.h>
#include <benchmark/benchmark.h>

#define CPU_CORE_LOGICAL_COUNT 12

typedef int8_t (*hashtable_support_hash_search_fp_t)(uint32_t hash, volatile uint32_t* hashes);
// Pin the calling benchmark thread to a logical core; thread indexes above
// CPU_CORE_LOGICAL_COUNT wrap around instead of piling onto the same core.
void set_thread_affinity(int thread_index) {
    int res;
    cpu_set_t cpuset;
    pthread_t thread;

    CPU_ZERO(&cpuset);
    CPU_SET(thread_index % CPU_CORE_LOGICAL_COUNT, &cpuset);

    thread = pthread_self();
    res = pthread_setaffinity_np(thread, sizeof(cpu_set_t), &cpuset);
    if (res != 0) {
        perror("pthread_setaffinity_np");
    }
}
// Allocate the 16-slot hash array; 64-byte alignment satisfies the aligned
// (stream) loads used by the AVX2 and AVX-512 variants.
uint32_t* init_hashes() {
    uint32_t* hashes = (uint32_t*)aligned_alloc(64, sizeof(uint32_t) * 16);

    for(uint32_t i = 0; i < 16; i++) {
        hashes[i] = i;
    }

    return hashes;
}
#if 0
// Can't test, I relied on godbolt to implement it https://godbolt.org/z/8fgAt7
int8_t hash32_find_14_avx512(uint32_t hash, volatile uint32_t* hashes) {
    __mmask16 compacted_result_mask;
    __m512i cmp_vector, ring_vector;

    cmp_vector = _mm512_set1_epi32(hash);
    ring_vector = _mm512_stream_load_si512((__m512i*)hashes);
    compacted_result_mask = _mm512_cmpeq_epi32_mask(ring_vector, cmp_vector);

    return _tzcnt_u32(compacted_result_mask);
}
#endif
/**
 * Implemented by @chtz
 *
 * https://stackoverflow.com/a/62123631/169278
 *
 * Two overlapping 8-lane loads (offsets 0 and 6) cover all 14 entries; the
 * per-lane compare masks are merged into a single bitmask and _tzcnt_u32
 * returns the lowest matching index, or 32 when there is no match.
 **/
int8_t hash32_find_14_avx2_branchless(uint32_t hash, volatile uint32_t* hashes) {
    uint32_t compacted_result_mask = 0;
    __m256i cmp_vector = _mm256_set1_epi32(hash);

    for(uint8_t base_index = 0; base_index < 12; base_index += 6) {
        __m256i ring_vector = _mm256_loadu_si256((__m256i*)(hashes + base_index));
        __m256i result_mask_vector = _mm256_cmpeq_epi32(ring_vector, cmp_vector);
        compacted_result_mask |= _mm256_movemask_ps(_mm256_castsi256_ps(result_mask_vector)) << base_index;
    }

    return _tzcnt_u32(compacted_result_mask);
}
/**
 * Branchy AVX2 variant: checks slot 0 up front, then scans the 14 entries in
 * two 8-lane aligned (stream) loads; the byte-level movemask is turned into a
 * lane index via __builtin_clz, and matches in the two padding slots (indexes
 * 14 and 15) are mapped back to 32 (not found).
 **/
int8_t hash32_find_14_avx2(uint32_t hash, volatile uint32_t* hashes) {
    uint32_t compacted_result_mask, leading_zeroes;
    __m256i cmp_vector, ring_vector, result_mask_vector;
    int8_t found_index = -1;

    if (hashes[0] == hash) {
        return 0;
    }

    cmp_vector = _mm256_set1_epi32(hash);

    for(uint8_t base_index = 0; base_index < 14; base_index += 8) {
        ring_vector = _mm256_stream_load_si256((__m256i*)(hashes + base_index));
        result_mask_vector = _mm256_cmpeq_epi32(ring_vector, cmp_vector);

        compacted_result_mask = _mm256_movemask_epi8(result_mask_vector);
        if (compacted_result_mask != 0) {
            leading_zeroes = 32 - __builtin_clz(compacted_result_mask);
            found_index = base_index + (leading_zeroes >> 2u) - 1;
            break;
        }
    }

    return found_index > 13 ? 32 : found_index;
}
int8_t hash32_find_14_loop(uint32_t hash, volatile uint32_t* hashes) {
    for(uint8_t index = 0; index < 14; index++) {
        if (hashes[index] == hash) {
            return index;
        }
    }

    return 32;
}
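/**
 * Not part of the original gist: a minimal sanity check, sketched under the
 * assumption that hashes[i] == i as produced by init_hashes(), verifying that
 * the SIMD variants agree with the plain loop on the 14 searchable slots. It
 * is not registered as a benchmark; call it manually if desired.
 **/
#include <assert.h>
void hash32_find_14_sanity_check() {
    uint32_t* hashes = init_hashes();

    for(uint32_t hash = 0; hash < 14; hash++) {
        int8_t expected = hash32_find_14_loop(hash, hashes);
        assert(hash32_find_14_avx2(hash, hashes) == expected);
        assert(hash32_find_14_avx2_branchless(hash, hashes) == expected);
    }

    free(hashes);
}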
#define BENCH_TEMPLATE_HASHTABLE_SUPPORT_HASH_SEARCH_FUNC_WRAPPER(METHOD, SUFFIX, ...) \
    void bench_template_hashtable_support_hash_search_##METHOD##_##SUFFIX(benchmark::State& state) { \
        uint32_t* hashes = NULL; \
        hashtable_support_hash_search_fp_t hashtable_support_hash_search_method = hash32_find_14_##METHOD; \
        \
        set_thread_affinity(state.thread_index); \
        \
        for (auto _ : state) { \
            hashes = init_hashes(); \
            __VA_ARGS__ \
            free(hashes); \
        } \
    } \
    BENCHMARK(bench_template_hashtable_support_hash_search_##METHOD##_##SUFFIX) \
        ->Iterations(10000000) \
        ->Threads(1)->Threads(2)->Threads(4)->Threads(8)->Threads(16)->Threads(32)->Threads(64);
#define BENCH_TEMPLATE_HASHTABLE_SUPPORT_HASH_SEARCH_FULL(METHOD) \
    BENCH_TEMPLATE_HASHTABLE_SUPPORT_HASH_SEARCH_FUNC_WRAPPER(METHOD, full, { \
        /* Keys to look up in the hashes array allocated by init_hashes(); keys 14 */ \
        /* and 15 fall outside the 14 searched slots and hit the not-found path.   */ \
        uint32_t search_keys[] = { 8, 1, 13, 4, 9, 0, 5, 11, 3, 12, 7, 2, 15, 14, 6, 10 }; \
        for(uint8_t i = 0; i < sizeof(search_keys) / sizeof(uint32_t); i++) { \
            benchmark::DoNotOptimize(hashtable_support_hash_search_method(search_keys[i], hashes)); \
        } \
    })
#define BENCH_TEMPLATE_HASHTABLE_SUPPORT_HASH_SEARCH_ALL(METHOD) \
    BENCH_TEMPLATE_HASHTABLE_SUPPORT_HASH_SEARCH_FULL(METHOD);
BENCH_TEMPLATE_HASHTABLE_SUPPORT_HASH_SEARCH_ALL(loop);
BENCH_TEMPLATE_HASHTABLE_SUPPORT_HASH_SEARCH_ALL(avx2);
BENCH_TEMPLATE_HASHTABLE_SUPPORT_HASH_SEARCH_ALL(avx2_branchless);
BENCHMARK_MAIN();
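/**
 * Example invocation (an assumption, using the hypothetical binary name from
 * the build sketch above): Google Benchmark's --benchmark_filter flag can
 * restrict the run to a single variant, e.g.:
 *
 *     ./hash32_find_14_bench --benchmark_filter=avx2_branchless
 **/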