danielealbano/bench.cpp

## bench.cpp
/**
 * For reference check this stackoverflow thread
 * https://stackoverflow.com/questions/62120797/find-element-index-with-avx2-code-optimization
 **/

#include <stdint.h>
#include <immintrin.h>
#include <benchmark/benchmark.h>

#define CPU_CORE_LOGICAL_COUNT 12

void set_thread_affinity(int thread_index) {
    int res;
    cpu_set_t cpuset;
    pthread_t thread;

    CPU_ZERO(&cpuset);
    CPU_SET(thread_index % CPU_CORE_LOGICAL_COUNT, &cpuset);

    thread = pthread_self();
    res = pthread_setaffinity_np(thread, sizeof(cpu_set_t), &cpuset);
    if (res != 0) {
        perror("pthread_setaffinity_np");
    }
}

uint32_t* init_hashes() {
    uint32_t* __attribute__((aligned(16))) hashes =
            (uint32_t*)aligned_alloc(16, sizeof(uint32_t) * 16);

    for(uint32_t i = 0; i < 16; i++) {
        uint32_t v = i;
        hashes[i] = v;
    }

    return hashes;
}

#if 0
// Can't test, I relied on godbolt to implement it https://godbolt.org/z/8fgAt7
int8_t hash32_find_14_avx512(uint32_t hash, volatile uint32_t* hashes) {
    __mmask16  compacted_result_mask;
    __m512i cmp_vector, ring_vector;

    cmp_vector = _mm512_set1_epi32(hash);
    ring_vector = _mm512_stream_load_si512((__m512i*)hashes);

    compacted_result_mask = _mm512_cmpeq_epi32_mask(ring_vector, cmp_vector);

    return _tzcnt_u32(compacted_result_mask);
}
#endif

/**
 * Implemented by @chtz
 *
 * https://stackoverflow.com/a/62123631/169278
 **/
int8_t hash32_find_14_avx2_branchless(uint32_t hash, volatile uint32_t* hashes) {
    uint32_t compacted_result_mask = 0;
    __m256i cmp_vector = _mm256_set1_epi32(hash);
    for(uint8_t base_index = 0; base_index < 12; base_index += 6) {
        __m256i ring_vector = _mm256_loadu_si256((__m256i*) (hashes + base_index));

        __m256i result_mask_vector = _mm256_cmpeq_epi32(ring_vector, cmp_vector);
        compacted_result_mask |= _mm256_movemask_ps(_mm256_castsi256_ps(result_mask_vector)) << (base_index);
    }
    return _tzcnt_u32(compacted_result_mask);
}

int8_t hash32_find_14_avx2(uint32_t hash, volatile uint32_t* hashes) {
    uint32_t compacted_result_mask, leading_zeroes;
    __m256i cmp_vector, ring_vector, result_mask_vector;
    int8_t found_index = -1;

    if (hashes[0] == hash) {
        return 0;
    }

    cmp_vector = _mm256_set1_epi32(hash);
    for(uint8_t base_index = 0; base_index < 14; base_index += 8) {
        ring_vector = _mm256_stream_load_si256((__m256i*) (hashes + base_index));

        result_mask_vector = _mm256_cmpeq_epi32(ring_vector, cmp_vector);
        compacted_result_mask = _mm256_movemask_epi8(result_mask_vector);

        if (compacted_result_mask != 0) {
            leading_zeroes = 32 - __builtin_clz(compacted_result_mask);
            found_index = base_index + (leading_zeroes >> 2u) - 1;
            break;
        }
    }

    return found_index > 13 ? 32 : found_index;
}

int8_t hash32_find_14_loop(uint32_t hash, volatile uint32_t* hashes) {
    for(uint8_t index = 0; index < 14; index++) {
        if (hashes[index] == hash) {
            return index;
        }
    }

    return 32;
}

#define BENCH_TEMPLATE_HASHTABLE_SUPPORT_HASH_SEARCH_FUNC_WRAPPER(METHOD, SUFFIX, ...) \
    void bench_template_hashtable_support_hash_search_##METHOD##_##SUFFIX(benchmark::State& state) { \
        uint32_t* hashes = NULL; \
        hashtable_support_hash_search_fp_t hashtable_support_hash_search_method = hash32_find_14_##METHOD; \
        \
        set_thread_affinity(state.thread_index); \
        \
        for (auto _ : state) { \
            hashes = init_hashes(); \
            __VA_ARGS__ \
            free(hashes); \
        } \
    } \
    BENCHMARK(bench_template_hashtable_support_hash_search_##METHOD##_##SUFFIX) \
        ->Iterations(10000000) \
        ->Threads(1)->Threads(2)->Threads(4)->Threads(8)->Threads(16)->Threads(32)->Threads(64);

#define BENCH_TEMPLATE_HASHTABLE_SUPPORT_HASH_SEARCH_FULL(METHOD) \
    BENCH_TEMPLATE_HASHTABLE_SUPPORT_HASH_SEARCH_FUNC_WRAPPER(METHOD, full, { \
        uint32_t hashes[] =  { 8, 1, 13, 4, 9, 0, 5, 11, 3, 12, 7, 2, 15, 14, 6, 10 }; \
        for(uint8_t i = 0; i < sizeof(hashes) / sizeof(uint32_t); i++) { \
            benchmark::DoNotOptimize(hashtable_support_hash_search_method(hashes[i], hashes)); \
        } \
    })

#define BENCH_TEMPLATE_HASHTABLE_SUPPORT_HASH_SEARCH_ALL(METHOD) \
    BENCH_TEMPLATE_HASHTABLE_SUPPORT_HASH_SEARCH_FULL(METHOD);

BENCH_TEMPLATE_HASHTABLE_SUPPORT_HASH_SEARCH_ALL(loop);
BENCH_TEMPLATE_HASHTABLE_SUPPORT_HASH_SEARCH_ALL(avx2);
BENCH_TEMPLATE_HASHTABLE_SUPPORT_HASH_SEARCH_ALL(avx2_branchless);

BENCHMARK_MAIN();
	/**
	* For reference check this stackoverflow thread
	* https://stackoverflow.com/questions/62120797/find-element-index-with-avx2-code-optimization
	**/

	#include <stdint.h>
	#include <immintrin.h>
	#include <benchmark/benchmark.h>

	#define CPU_CORE_LOGICAL_COUNT 12

	void set_thread_affinity(int thread_index) {
	int res;
	cpu_set_t cpuset;
	pthread_t thread;

	CPU_ZERO(&cpuset);
	CPU_SET(thread_index % CPU_CORE_LOGICAL_COUNT, &cpuset);

	thread = pthread_self();
	res = pthread_setaffinity_np(thread, sizeof(cpu_set_t), &cpuset);
	if (res != 0) {
	perror("pthread_setaffinity_np");
	}
	}

	uint32_t* init_hashes() {
	uint32_t* __attribute__((aligned(16))) hashes =
	(uint32_t)aligned_alloc(16, sizeof(uint32_t) 16);

	for(uint32_t i = 0; i < 16; i++) {
	uint32_t v = i;
	hashes[i] = v;
	}

	return hashes;
	}

	#if 0
	// Can't test, I relied on godbolt to implement it https://godbolt.org/z/8fgAt7
	int8_t hash32_find_14_avx512(uint32_t hash, volatile uint32_t* hashes) {
	__mmask16 compacted_result_mask;
	__m512i cmp_vector, ring_vector;

	cmp_vector = _mm512_set1_epi32(hash);
	ring_vector = _mm512_stream_load_si512((__m512i*)hashes);

	compacted_result_mask = _mm512_cmpeq_epi32_mask(ring_vector, cmp_vector);

	return _tzcnt_u32(compacted_result_mask);
	}
	#endif

	/**
	* Implemented by @chtz
	*
	* https://stackoverflow.com/a/62123631/169278
	**/
	int8_t hash32_find_14_avx2_branchless(uint32_t hash, volatile uint32_t* hashes) {
	uint32_t compacted_result_mask = 0;
	__m256i cmp_vector = _mm256_set1_epi32(hash);
	for(uint8_t base_index = 0; base_index < 12; base_index += 6) {
	__m256i ring_vector = _mm256_loadu_si256((__m256i*) (hashes + base_index));

	__m256i result_mask_vector = _mm256_cmpeq_epi32(ring_vector, cmp_vector);
	compacted_result_mask \|= _mm256_movemask_ps(_mm256_castsi256_ps(result_mask_vector)) << (base_index);
	}
	return _tzcnt_u32(compacted_result_mask);
	}

	int8_t hash32_find_14_avx2(uint32_t hash, volatile uint32_t* hashes) {
	uint32_t compacted_result_mask, leading_zeroes;
	__m256i cmp_vector, ring_vector, result_mask_vector;
	int8_t found_index = -1;

	if (hashes[0] == hash) {
	return 0;
	}

	cmp_vector = _mm256_set1_epi32(hash);
	for(uint8_t base_index = 0; base_index < 14; base_index += 8) {
	ring_vector = _mm256_stream_load_si256((__m256i*) (hashes + base_index));

	result_mask_vector = _mm256_cmpeq_epi32(ring_vector, cmp_vector);
	compacted_result_mask = _mm256_movemask_epi8(result_mask_vector);

	if (compacted_result_mask != 0) {
	leading_zeroes = 32 - __builtin_clz(compacted_result_mask);
	found_index = base_index + (leading_zeroes >> 2u) - 1;
	break;
	}
	}

	return found_index > 13 ? 32 : found_index;
	}

	int8_t hash32_find_14_loop(uint32_t hash, volatile uint32_t* hashes) {
	for(uint8_t index = 0; index < 14; index++) {
	if (hashes[index] == hash) {
	return index;
	}
	}

	return 32;
	}

	#define BENCH_TEMPLATE_HASHTABLE_SUPPORT_HASH_SEARCH_FUNC_WRAPPER(METHOD, SUFFIX, ...) \
	void bench_template_hashtable_support_hash_search_##METHOD##_##SUFFIX(benchmark::State& state) { \
	uint32_t* hashes = NULL; \
	hashtable_support_hash_search_fp_t hashtable_support_hash_search_method = hash32_find_14_##METHOD; \
	\
	set_thread_affinity(state.thread_index); \
	\
	for (auto _ : state) { \
	hashes = init_hashes(); \
	__VA_ARGS__ \
	free(hashes); \
	} \
	} \
	BENCHMARK(bench_template_hashtable_support_hash_search_##METHOD##_##SUFFIX) \
	->Iterations(10000000) \
	->Threads(1)->Threads(2)->Threads(4)->Threads(8)->Threads(16)->Threads(32)->Threads(64);

	#define BENCH_TEMPLATE_HASHTABLE_SUPPORT_HASH_SEARCH_FULL(METHOD) \
	BENCH_TEMPLATE_HASHTABLE_SUPPORT_HASH_SEARCH_FUNC_WRAPPER(METHOD, full, { \
	uint32_t hashes[] = { 8, 1, 13, 4, 9, 0, 5, 11, 3, 12, 7, 2, 15, 14, 6, 10 }; \
	for(uint8_t i = 0; i < sizeof(hashes) / sizeof(uint32_t); i++) { \
	benchmark::DoNotOptimize(hashtable_support_hash_search_method(hashes[i], hashes)); \
	} \
	})

	#define BENCH_TEMPLATE_HASHTABLE_SUPPORT_HASH_SEARCH_ALL(METHOD) \
	BENCH_TEMPLATE_HASHTABLE_SUPPORT_HASH_SEARCH_FULL(METHOD);

	BENCH_TEMPLATE_HASHTABLE_SUPPORT_HASH_SEARCH_ALL(loop);
	BENCH_TEMPLATE_HASHTABLE_SUPPORT_HASH_SEARCH_ALL(avx2);
	BENCH_TEMPLATE_HASHTABLE_SUPPORT_HASH_SEARCH_ALL(avx2_branchless);

	BENCHMARK_MAIN();