Last active
June 1, 2020 19:39
-
-
Save danielealbano/9fcbc1ff0a42cc9ad61be205366bdb5f to your computer and use it in GitHub Desktop.
hash32_find_14 - simple loop vs simd (avx2, avx2 branchless) - google benchmark
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/** | |
* For reference check this stackoverflow thread | |
* https://stackoverflow.com/questions/62120797/find-element-index-with-avx2-code-optimization | |
**/ | |
#include <stdint.h> | |
#include <immintrin.h> | |
#include <benchmark/benchmark.h> | |
#define CPU_CORE_LOGICAL_COUNT 12 | |
void set_thread_affinity(int thread_index) { | |
int res; | |
cpu_set_t cpuset; | |
pthread_t thread; | |
CPU_ZERO(&cpuset); | |
CPU_SET(thread_index % CPU_CORE_LOGICAL_COUNT, &cpuset); | |
thread = pthread_self(); | |
res = pthread_setaffinity_np(thread, sizeof(cpu_set_t), &cpuset); | |
if (res != 0) { | |
perror("pthread_setaffinity_np"); | |
} | |
} | |
uint32_t* init_hashes() { | |
uint32_t* __attribute__((aligned(16))) hashes = | |
(uint32_t*)aligned_alloc(16, sizeof(uint32_t) * 16); | |
for(uint32_t i = 0; i < 16; i++) { | |
uint32_t v = i; | |
hashes[i] = v; | |
} | |
return hashes; | |
} | |
#if 0 | |
// Can't test, I relied on godbolt to implement it https://godbolt.org/z/8fgAt7 | |
int8_t hash32_find_14_avx512(uint32_t hash, volatile uint32_t* hashes) { | |
__mmask16 compacted_result_mask; | |
__m512i cmp_vector, ring_vector; | |
cmp_vector = _mm512_set1_epi32(hash); | |
ring_vector = _mm512_stream_load_si512((__m512i*)hashes); | |
compacted_result_mask = _mm512_cmpeq_epi32_mask(ring_vector, cmp_vector); | |
return _tzcnt_u32(compacted_result_mask); | |
} | |
#endif | |
/** | |
* Implemented by @chtz | |
* | |
* https://stackoverflow.com/a/62123631/169278 | |
**/ | |
int8_t hash32_find_14_avx2_branchless(uint32_t hash, volatile uint32_t* hashes) { | |
uint32_t compacted_result_mask = 0; | |
__m256i cmp_vector = _mm256_set1_epi32(hash); | |
for(uint8_t base_index = 0; base_index < 12; base_index += 6) { | |
__m256i ring_vector = _mm256_loadu_si256((__m256i*) (hashes + base_index)); | |
__m256i result_mask_vector = _mm256_cmpeq_epi32(ring_vector, cmp_vector); | |
compacted_result_mask |= _mm256_movemask_ps(_mm256_castsi256_ps(result_mask_vector)) << (base_index); | |
} | |
return _tzcnt_u32(compacted_result_mask); | |
} | |
int8_t hash32_find_14_avx2(uint32_t hash, volatile uint32_t* hashes) { | |
uint32_t compacted_result_mask, leading_zeroes; | |
__m256i cmp_vector, ring_vector, result_mask_vector; | |
int8_t found_index = -1; | |
if (hashes[0] == hash) { | |
return 0; | |
} | |
cmp_vector = _mm256_set1_epi32(hash); | |
for(uint8_t base_index = 0; base_index < 14; base_index += 8) { | |
ring_vector = _mm256_stream_load_si256((__m256i*) (hashes + base_index)); | |
result_mask_vector = _mm256_cmpeq_epi32(ring_vector, cmp_vector); | |
compacted_result_mask = _mm256_movemask_epi8(result_mask_vector); | |
if (compacted_result_mask != 0) { | |
leading_zeroes = 32 - __builtin_clz(compacted_result_mask); | |
found_index = base_index + (leading_zeroes >> 2u) - 1; | |
break; | |
} | |
} | |
return found_index > 13 ? 32 : found_index; | |
} | |
int8_t hash32_find_14_loop(uint32_t hash, volatile uint32_t* hashes) { | |
for(uint8_t index = 0; index < 14; index++) { | |
if (hashes[index] == hash) { | |
return index; | |
} | |
} | |
return 32; | |
} | |
#define BENCH_TEMPLATE_HASHTABLE_SUPPORT_HASH_SEARCH_FUNC_WRAPPER(METHOD, SUFFIX, ...) \ | |
void bench_template_hashtable_support_hash_search_##METHOD##_##SUFFIX(benchmark::State& state) { \ | |
uint32_t* hashes = NULL; \ | |
hashtable_support_hash_search_fp_t hashtable_support_hash_search_method = hash32_find_14_##METHOD; \ | |
\ | |
set_thread_affinity(state.thread_index); \ | |
\ | |
for (auto _ : state) { \ | |
hashes = init_hashes(); \ | |
__VA_ARGS__ \ | |
free(hashes); \ | |
} \ | |
} \ | |
BENCHMARK(bench_template_hashtable_support_hash_search_##METHOD##_##SUFFIX) \ | |
->Iterations(10000000) \ | |
->Threads(1)->Threads(2)->Threads(4)->Threads(8)->Threads(16)->Threads(32)->Threads(64); | |
#define BENCH_TEMPLATE_HASHTABLE_SUPPORT_HASH_SEARCH_FULL(METHOD) \ | |
BENCH_TEMPLATE_HASHTABLE_SUPPORT_HASH_SEARCH_FUNC_WRAPPER(METHOD, full, { \ | |
uint32_t hashes[] = { 8, 1, 13, 4, 9, 0, 5, 11, 3, 12, 7, 2, 15, 14, 6, 10 }; \ | |
for(uint8_t i = 0; i < sizeof(hashes) / sizeof(uint32_t); i++) { \ | |
benchmark::DoNotOptimize(hashtable_support_hash_search_method(hashes[i], hashes)); \ | |
} \ | |
}) | |
#define BENCH_TEMPLATE_HASHTABLE_SUPPORT_HASH_SEARCH_ALL(METHOD) \ | |
BENCH_TEMPLATE_HASHTABLE_SUPPORT_HASH_SEARCH_FULL(METHOD); | |
BENCH_TEMPLATE_HASHTABLE_SUPPORT_HASH_SEARCH_ALL(loop); | |
BENCH_TEMPLATE_HASHTABLE_SUPPORT_HASH_SEARCH_ALL(avx2); | |
BENCH_TEMPLATE_HASHTABLE_SUPPORT_HASH_SEARCH_ALL(avx2_branchless); | |
BENCHMARK_MAIN(); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment