Create a gist now

Instantly share code, notes, and snippets.

#include <iostream>
#include <x86intrin.h>
#include <boost/timer/timer.hpp>
// Delta swap, applied independently to each 64-bit lane of `a`.
//
// For every bit position i that is set in `mask`, the bit of `a` at position i
// is exchanged with the bit at position i + delta of the same lane.
//
//   a     - value whose bits are permuted
//   mask  - marks the lower position of each pair to exchange
//   delta - distance, in bits, between the two members of a pair
//
// Returns the permuted value; bits not covered by a pair are left untouched.
inline __m128i mm_delta_swap_epi64(__m128i a, __m128i mask, int delta) {
    // Non-zero exactly at the selected positions whose partner bit differs.
    const __m128i shifted_down = _mm_srli_epi64(a, delta);
    const __m128i differing = _mm_and_si128(mask, _mm_xor_si128(shifted_down, a));
    // Flipping both members of a differing pair is the same as swapping them.
    const __m128i flip = _mm_xor_si128(differing, _mm_slli_epi64(differing, delta));
    return _mm_xor_si128(a, flip);
}
// Bit-level "unpack low": interleaves the bits of the low 64 bits of `a` and
// `b`, using a byte unpack followed by three delta swaps.
//
// After the byte unpack every 16-bit lane holds one byte of `a` (low half) and
// one byte of `b` (high half). The delta swaps then shuffle each lane so that
// the bits of `a` end up at even positions and the bits of `b` at odd ones.
inline __m128i mm_unpacklo_epb_unpack_dswap(__m128i a, __m128i b) {
    const __m128i nibble_mask = _mm_set1_epi16(0x00F0);  // bits 4..7 of each 16-bit lane
    const __m128i pair_mask = _mm_set1_epi8(0x0C);       // bits 2..3 of each byte
    const __m128i single_mask = _mm_set1_epi8(0x22);     // bits 1 and 5 of each byte

    __m128i v = _mm_unpacklo_epi8(a, b);             // interleave whole bytes
    v = mm_delta_swap_epi64(v, nibble_mask, 4);      // interleave nibbles
    v = mm_delta_swap_epi64(v, pair_mask, 2);        // interleave bit pairs
    return mm_delta_swap_epi64(v, single_mask, 1);   // interleave single bits
}
// Bit-level "unpack high": interleaves the bits of the high 64 bits of `a` and
// `b`, using a byte unpack followed by three delta swaps.
//
// After the byte unpack every 16-bit lane holds one byte of `a` (low half) and
// one byte of `b` (high half). The delta swaps then shuffle each lane so that
// the bits of `a` end up at even positions and the bits of `b` at odd ones.
inline __m128i mm_unpackhi_epb_unpack_dswap(__m128i a, __m128i b) {
    const __m128i nibble_mask = _mm_set1_epi16(0x00F0);  // bits 4..7 of each 16-bit lane
    const __m128i pair_mask = _mm_set1_epi8(0x0C);       // bits 2..3 of each byte
    const __m128i single_mask = _mm_set1_epi8(0x22);     // bits 1 and 5 of each byte

    __m128i v = _mm_unpackhi_epi8(a, b);             // interleave whole bytes
    v = mm_delta_swap_epi64(v, nibble_mask, 4);      // interleave nibbles
    v = mm_delta_swap_epi64(v, pair_mask, 2);        // interleave bit pairs
    return mm_delta_swap_epi64(v, single_mask, 1);   // interleave single bits
}
// Bit-level "unpack low" built on BMI2 PDEP: interleaves the bits of the low
// 64 bits of `a` and `b`. Bits of `a` land on even positions of the result,
// bits of `b` on odd positions. The high 64 bits of both inputs are ignored.
inline __m128i mm_unpacklo_epb_pdep(__m128i a, __m128i b) {
    const uint64_t even_bits = UINT64_C(0x5555555555555555);
    const uint64_t odd_bits = UINT64_C(0xAAAAAAAAAAAAAAAA);

    const uint64_t a_src = _mm_cvtsi128_si64(a);
    const uint64_t b_src = _mm_cvtsi128_si64(b);

    // Each mask has 32 set bits, so one PDEP consumes the low 32 source bits;
    // the upper 32 source bits are shifted down for the second half.
    const uint64_t lower = _pdep_u64(a_src, even_bits) | _pdep_u64(b_src, odd_bits);
    const uint64_t upper = _pdep_u64(a_src >> 32, even_bits) | _pdep_u64(b_src >> 32, odd_bits);
    return _mm_set_epi64x(upper, lower);
}
// Bit-level "unpack high" built on BMI2 PDEP: interleaves the bits of the high
// 64 bits of `a` and `b`. Bits of `a` land on even positions of the result,
// bits of `b` on odd positions. The low 64 bits of both inputs are ignored.
inline __m128i mm_unpackhi_epb_pdep(__m128i a, __m128i b) {
    const uint64_t even_bits = UINT64_C(0x5555555555555555);
    const uint64_t odd_bits = UINT64_C(0xAAAAAAAAAAAAAAAA);

    const uint64_t a_src = _mm_extract_epi64(a, 1);
    const uint64_t b_src = _mm_extract_epi64(b, 1);

    // Each mask has 32 set bits, so one PDEP consumes the low 32 source bits;
    // the upper 32 source bits are shifted down for the second half.
    const uint64_t lower = _pdep_u64(a_src, even_bits) | _pdep_u64(b_src, odd_bits);
    const uint64_t upper = _pdep_u64(a_src >> 32, even_bits) | _pdep_u64(b_src >> 32, odd_bits);
    return _mm_set_epi64x(upper, lower);
}
// Bit-level "unpack low" built on PCLMULQDQ: interleaves the bits of the low
// 64 bits of `a` and `b`. Bits of `a` land on even positions of the result,
// bits of `b` on odd positions.
inline __m128i mm_unpacklo_epb_pclmulqdq(__m128i a, __m128i b) {
    // Carry-less squaring of a 64-bit value moves its bit i to bit 2*i of the
    // 128-bit product, leaving every odd bit clear. Selector 0x00 picks the
    // low quadword of both operands.
    const __m128i a_spread = _mm_clmulepi64_si128(a, a, 0x00);
    const __m128i b_spread = _mm_clmulepi64_si128(b, b, 0x00);
    // Bit 31 of every 32-bit lane is an odd position and therefore zero, so
    // the per-lane shift moves b onto the odd positions without losing bits.
    const __m128i b_on_odd = _mm_slli_epi32(b_spread, 1);
    return _mm_or_si128(a_spread, b_on_odd);
}
// Bit-level "unpack high" built on PCLMULQDQ: interleaves the bits of the high
// 64 bits of `a` and `b`. Bits of `a` land on even positions of the result,
// bits of `b` on odd positions.
inline __m128i mm_unpackhi_epb_pclmulqdq(__m128i a, __m128i b) {
    // Carry-less squaring of a 64-bit value moves its bit i to bit 2*i of the
    // 128-bit product, leaving every odd bit clear. Selector 0x11 picks the
    // high quadword of both operands.
    const __m128i a_spread = _mm_clmulepi64_si128(a, a, 0x11);
    const __m128i b_spread = _mm_clmulepi64_si128(b, b, 0x11);
    // Bit 31 of every 32-bit lane is an odd position and therefore zero, so
    // the per-lane shift moves b onto the odd positions without losing bits.
    const __m128i b_on_odd = _mm_slli_epi32(b_spread, 1);
    return _mm_or_si128(a_spread, b_on_odd);
}
// Advances four independent 32-bit xorshift generators (shift triple
// 13, 17, 5), one per 32-bit lane of `seeds`, and returns the new states.
// A lane holding zero stays zero.
inline __m128i mm_xorshift32_epi32(__m128i seeds) {
    const __m128i after_left13 = _mm_xor_si128(seeds, _mm_slli_epi32(seeds, 13));
    // Logical (zero-filling) right shift.
    const __m128i after_right17 = _mm_xor_si128(after_left13, _mm_srli_epi32(after_left13, 17));
    return _mm_xor_si128(after_right17, _mm_slli_epi32(after_right17, 5));
}
// Defines `void bench_unpacklo_epb_<name>()`, a benchmark for the
// implementation `mm_unpacklo_epb_<name>`.
//
// The generated function prints "Bench <name>", runs the implementation
// 2^30 times on inputs that are advanced by mm_xorshift32_epi32 after every
// call, XOR-accumulates all results (the printed checksum keeps the work
// observable), and finally prints the checksum and the elapsed wall time.
//
// The timer is stopped right after the loop so that the reported time covers
// only the benchmarked loop and not the console output of the checksum.
#define DEF_BENCH_UNPACKLO_EPB(name) \
void bench_unpacklo_epb_##name() { \
    std::cout << "Bench "#name << std::endl; \
    __m128i input1 = _mm_setr_epi32(1, 2, 3, 4); \
    __m128i input2 = _mm_setr_epi32(5, 6, 7, 8); \
    __m128i result = _mm_setzero_si128(); \
    boost::timer::cpu_timer timer; \
    for (int i = 0; i < (1 << 30); ++i) { \
        result = _mm_xor_si128(result, mm_unpacklo_epb_##name(input1, input2)); \
        input1 = mm_xorshift32_epi32(input1); \
        input2 = mm_xorshift32_epi32(input2); \
    } \
    timer.stop(); /* freeze the clock before doing any console I/O */ \
    std::cout << _mm_extract_epi64(result, 0) << ' ' << _mm_extract_epi64(result, 1) << std::endl; \
    std::cout << timer.format(3, "elapsed: %ws") << std::endl; \
}
// Defines `void bench_unpackhi_epb_<name>()`, a benchmark for the
// implementation `mm_unpackhi_epb_<name>`.
//
// The generated function prints "Bench <name>", runs the implementation
// 2^30 times on inputs that are advanced by mm_xorshift32_epi32 after every
// call, XOR-accumulates all results (the printed checksum keeps the work
// observable), and finally prints the checksum and the elapsed wall time.
//
// The timer is stopped right after the loop so that the reported time covers
// only the benchmarked loop and not the console output of the checksum.
#define DEF_BENCH_UNPACKHI_EPB(name) \
void bench_unpackhi_epb_##name() { \
    std::cout << "Bench "#name << std::endl; \
    __m128i input1 = _mm_setr_epi32(1, 2, 3, 4); \
    __m128i input2 = _mm_setr_epi32(5, 6, 7, 8); \
    __m128i result = _mm_setzero_si128(); \
    boost::timer::cpu_timer timer; \
    for (int i = 0; i < (1 << 30); ++i) { \
        result = _mm_xor_si128(result, mm_unpackhi_epb_##name(input1, input2)); \
        input1 = mm_xorshift32_epi32(input1); \
        input2 = mm_xorshift32_epi32(input2); \
    } \
    timer.stop(); /* freeze the clock before doing any console I/O */ \
    std::cout << _mm_extract_epi64(result, 0) << ' ' << _mm_extract_epi64(result, 1) << std::endl; \
    std::cout << timer.format(3, "elapsed: %ws") << std::endl; \
}
// Instantiate one benchmark function per (implementation, half) pair:
// bench_unpacklo_epb_<impl>() and bench_unpackhi_epb_<impl>().

// Byte unpack followed by delta swaps.
DEF_BENCH_UNPACKLO_EPB(unpack_dswap);
DEF_BENCH_UNPACKHI_EPB(unpack_dswap);

// BMI2 PDEP.
DEF_BENCH_UNPACKLO_EPB(pdep);
DEF_BENCH_UNPACKHI_EPB(pdep);

// Carry-less multiplication (PCLMULQDQ).
DEF_BENCH_UNPACKLO_EPB(pclmulqdq);
DEF_BENCH_UNPACKHI_EPB(pclmulqdq);
// Entry point: runs the three low-half benchmarks, then the three high-half
// benchmarks, printing a heading before each group.
int main() {
    typedef void (*bench_fn)();

    // Benchmarks are listed in the order in which they are run and reported.
    const bench_fn lo_benches[] = {
        bench_unpacklo_epb_unpack_dswap,
        bench_unpacklo_epb_pdep,
        bench_unpacklo_epb_pclmulqdq,
    };
    const bench_fn hi_benches[] = {
        bench_unpackhi_epb_unpack_dswap,
        bench_unpackhi_epb_pdep,
        bench_unpackhi_epb_pclmulqdq,
    };

    std::cout << "Unpack Lo" << std::endl;
    for (bench_fn run : lo_benches) {
        run();
    }

    std::cout << "Unpack Hi" << std::endl;
    for (bench_fn run : hi_benches) {
        run();
    }

    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment