Created
January 3, 2018 15:51
-
-
Save taowen/627390f1db2eb658cabae02b43990193 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <numeric> | |
#include <algorithm> | |
#include <random> | |
#include <vector> | |
#include <iostream> | |
#include <limits> | |
typedef int V; | |
typedef std::vector<V> vi; | |
using SearchFn = long(const vi&, long, V); | |
template <SearchFn* f> | |
static void BM_Search(benchmark::State& state) { | |
using std::cerr; using std::endl; | |
const int vSz = 1000, kSz = 64, dist = state.range(0), seed = 42; | |
vi v(vSz+4, std::numeric_limits<V>::max()); | |
std::iota(v.begin(), v.begin() + vSz, 0); | |
vi v2(v.begin() + dist, v.begin() + vSz); | |
std::shuffle(v2.begin(), v2.end(), std::default_random_engine(seed)); | |
const vi k(v2.begin(), v2.begin() + kSz); | |
for (int i = 0; i < kSz; i++) { if (k[i] != f(v,k[i]-dist,k[i])) exit(1);} | |
while (state.KeepRunning()) { | |
for (int i = 0; i < kSz; i++) { | |
benchmark::DoNotOptimize(f(v, k[i]-dist, k[i])); | |
} | |
} | |
} | |
#define R {{128,930}} | |
#define M 4 | |
template<int roll> | |
long linUnroll(const vi& arr, const long guessIx, const V x) { | |
for (long i = guessIx;; i += roll) | |
for (long j = 0; j < roll; j++) | |
if (arr[i+j] >= x) return i+j; | |
} | |
//BENCHMARK_TEMPLATE(BM_Search, linUnroll<8>)->RangeMultiplier(M)->Ranges(R); | |
//BENCHMARK_TEMPLATE(BM_Search, linUnroll<32>)->RangeMultiplier(M)->Ranges(R); | |
long linSIMD(const vi& arr, const long guessIx, const V x) { | |
using v4 = V __attribute__ ((vector_size (4*4))); | |
using dv2 = int64_t __attribute__ ((vector_size (4*4))); | |
constexpr int roll = 4; | |
constexpr union { | |
int32_t i32[2]; | |
int64_t i64; | |
} skip = {-2,-2}; | |
v4 xVec = {x,x,x,x}; | |
for (int i = guessIx;; i += roll) { | |
v4 arrVec; | |
for (long j = 0; j < 4; j++) arrVec[j] = arr[i+j]; | |
union { | |
v4 i32; | |
dv2 i64; | |
} cmpVec = {arrVec < xVec}; | |
v4 cmpVec2 = {cmpVec.i32[3], cmpVec.i32[2], cmpVec.i32[1],cmpVec.i32[0]}; | |
cmpVec.i32 += cmpVec2; | |
if (cmpVec.i64[0] == skip.i64) continue; | |
return i - cmpVec.i32[0] - cmpVec.i32[1]; | |
} | |
} | |
static void BM_SIMD(benchmark::State& state) { | |
using std::cerr; using std::endl; | |
const int vSz = 1000, kSz = 64, dist = state.range(0), seed = 42; | |
vi v(vSz+4, std::numeric_limits<V>::max()); | |
std::iota(v.begin(), v.begin() + vSz, 0); | |
vi v2(v.begin() + dist, v.begin() + vSz); | |
std::shuffle(v2.begin(), v2.end(), std::default_random_engine(seed)); | |
const vi k(v2.begin(), v2.begin() + kSz); | |
for (int i = 0; i < kSz; i++) { if (k[i] != linSIMD(v,k[i]-dist,k[i])) exit(1);} | |
while (state.KeepRunning()) { | |
for (int i = 0; i < kSz; i++) { | |
benchmark::DoNotOptimize(linSIMD(v, k[i]-dist, k[i])); | |
} | |
} | |
} | |
//BENCHMARK(BM_SIMD)->RangeMultiplier(M)->Ranges(R); | |
long lin2(const vi& arr, const long guessIx, const V x) { | |
using v4 = V __attribute__ ((vector_size (4*4))); | |
using dv2 = int64_t __attribute__ ((vector_size (4*4))); | |
constexpr int roll = 4; | |
constexpr union { | |
int32_t i32[2]; | |
int64_t i64; | |
} skip = {-2,-2}; | |
v4 xVec = {x,x,x,x}; | |
for (int i = guessIx;;) { | |
for (int k=0;k<2;k++,i+=roll) { | |
v4 arrVec; | |
for (long j = 0; j < 4; j++) arrVec[j] = arr[i+j]; | |
union { | |
v4 i32; | |
dv2 i64; | |
} cmpVec = {arrVec < xVec}; | |
v4 cmpVec2 = {cmpVec.i32[3], cmpVec.i32[2], cmpVec.i32[1],cmpVec.i32[0]}; | |
cmpVec.i32 += cmpVec2; | |
if (cmpVec.i64[0] == skip.i64) continue; | |
return i - cmpVec.i32[0] - cmpVec.i32[1]; | |
} | |
} | |
} | |
static void BM2(benchmark::State& state) { | |
using std::cerr; using std::endl; | |
const int vSz = 1000, kSz = 64, dist = state.range(0), seed = 42; | |
vi v(vSz+4, std::numeric_limits<V>::max()); | |
std::iota(v.begin(), v.begin() + vSz, 0); | |
vi v2(v.begin() + dist, v.begin() + vSz); | |
std::shuffle(v2.begin(), v2.end(), std::default_random_engine(seed)); | |
const vi k(v2.begin(), v2.begin() + kSz); | |
for (int i = 0; i < kSz; i++) { if (k[i] != lin2(v,k[i]-dist,k[i])) exit(1);} | |
while (state.KeepRunning()) { | |
for (int i = 0; i < kSz; i++) { | |
benchmark::DoNotOptimize(lin2(v, k[i]-dist, k[i])); | |
} | |
} | |
} | |
//BENCHMARK(BM2)->RangeMultiplier(M)->Ranges(R); | |
int linSIMD2(const vi& arr, const int guessIx, const int x) { | |
auto vecX = _mm_set1_epi32(x - 1); | |
const int *ptr = arr.data(); | |
for (int i = guessIx; true; i += 4) { | |
auto arrVec = _mm_loadu_si128((__m128i*)(ptr + i)); | |
auto cmp = _mm_cmpgt_epi32(arrVec, vecX); | |
int mask = _mm_movemask_ps(_mm_castsi128_ps(cmp)); | |
if (mask) | |
return i + __builtin_ctz(mask); | |
} | |
} | |
static void BM_SIMD2(benchmark::State& state) { | |
using std::cerr; using std::endl; | |
const int vSz = 1000, kSz = 64, dist = state.range(0), seed = 42; | |
vi v(vSz+4, std::numeric_limits<V>::max()); | |
std::iota(v.begin(), v.begin() + vSz, 0); | |
vi v2(v.begin() + dist, v.begin() + vSz); | |
std::shuffle(v2.begin(), v2.end(), std::default_random_engine(seed)); | |
const vi k(v2.begin(), v2.begin() + kSz); | |
for (int i = 0; i < kSz; i++) { if (k[i] != linSIMD2(v,k[i]-dist,k[i])) exit(1);} | |
while (state.KeepRunning()) { | |
for (int i = 0; i < kSz; i++) { | |
benchmark::DoNotOptimize(linSIMD2(v, k[i]-dist, k[i])); | |
} | |
} | |
} | |
//BENCHMARK(BM_SIMD2)->RangeMultiplier(M)->Ranges(R); | |
int linSIMD3(const vi& arr, const int guessIx, const int x) { | |
auto vecX = _mm_set1_epi32(x - 1); | |
const int *ptr = arr.data(); | |
for (int i = guessIx; true; i += 16) { | |
auto av0 = _mm_loadu_si128((__m128i*)(ptr + i)); | |
auto av1 = _mm_loadu_si128((__m128i*)(ptr + i + 4)); | |
auto av2 = _mm_loadu_si128((__m128i*)(ptr + i + 8)); | |
auto av3 = _mm_loadu_si128((__m128i*)(ptr + i + 12)); | |
auto cmp0 = _mm_cmpgt_epi32(av0, vecX); | |
auto cmp1 = _mm_cmpgt_epi32(av1, vecX); | |
auto cmp2 = _mm_cmpgt_epi32(av2, vecX); | |
auto cmp3 = _mm_cmpgt_epi32(av3, vecX); | |
auto cmp = _mm_packs_epi16(_mm_packs_epi32(cmp0, cmp1), _mm_packs_epi32(cmp2, cmp3)); | |
int mask = _mm_movemask_epi8(cmp); | |
if (mask) | |
return i + __builtin_ctz(mask); | |
} | |
} | |
static void BM_SIMD3(benchmark::State& state) { | |
using std::cerr; using std::endl; | |
const int vSz = 1000, kSz = 64, dist = state.range(0), seed = 42; | |
vi v(vSz+4, std::numeric_limits<V>::max()); | |
std::iota(v.begin(), v.begin() + vSz, 0); | |
vi v2(v.begin() + dist, v.begin() + vSz); | |
std::shuffle(v2.begin(), v2.end(), std::default_random_engine(seed)); | |
const vi k(v2.begin(), v2.begin() + kSz); | |
for (int i = 0; i < kSz; i++) { if (k[i] != linSIMD3(v,k[i]-dist,k[i])) exit(1);} | |
while (state.KeepRunning()) { | |
for (int i = 0; i < kSz; i++) { | |
benchmark::DoNotOptimize(linSIMD3(v, k[i]-dist, k[i])); | |
} | |
} | |
} | |
BENCHMARK(BM_SIMD3)->RangeMultiplier(M)->Ranges(R); | |
int linSIMD4(const vi& arr, const int guessIx, const int x) { | |
auto vecX = _mm_set1_epi32(x - 1); | |
const int *ptr = arr.data(); | |
int i = guessIx; | |
// unaligned start | |
int misalignment = (uintptr_t)(ptr + i) & 15; | |
auto arrVec = _mm_loadu_si128((__m128i*)(ptr + i)); | |
auto cmp = _mm_cmpgt_epi32(arrVec, vecX); | |
int mask = _mm_movemask_ps(_mm_castsi128_ps(cmp)); | |
if (mask) | |
return i + __builtin_ctz(mask); | |
// continue with aligned part | |
i += (16 - misalignment) / 4; | |
for (; ; i += 16) { | |
auto av0 = _mm_load_si128((__m128i*)(ptr + i)); | |
auto av1 = _mm_load_si128((__m128i*)(ptr + i + 4)); | |
auto av2 = _mm_load_si128((__m128i*)(ptr + i + 8)); | |
auto av3 = _mm_load_si128((__m128i*)(ptr + i + 12)); | |
auto cmp0 = _mm_cmpgt_epi32(av0, vecX); | |
auto cmp1 = _mm_cmpgt_epi32(av1, vecX); | |
auto cmp2 = _mm_cmpgt_epi32(av2, vecX); | |
auto cmp3 = _mm_cmpgt_epi32(av3, vecX); | |
auto cmp = _mm_packs_epi16(_mm_packs_epi32(cmp0, cmp1), _mm_packs_epi32(cmp2, cmp3)); | |
int mask = _mm_movemask_epi8(cmp); | |
if (mask) | |
return i + __builtin_ctz(mask); | |
} | |
} | |
static void BM_SIMD4(benchmark::State& state) { | |
using std::cerr; using std::endl; | |
const int vSz = 1000, kSz = 64, dist = state.range(0), seed = 42; | |
vi v(vSz+4, std::numeric_limits<V>::max()); | |
std::iota(v.begin(), v.begin() + vSz, 0); | |
vi v2(v.begin() + dist, v.begin() + vSz); | |
std::shuffle(v2.begin(), v2.end(), std::default_random_engine(seed)); | |
const vi k(v2.begin(), v2.begin() + kSz); | |
for (int i = 0; i < kSz; i++) { if (k[i] != linSIMD4(v,k[i]-dist,k[i])) exit(1);} | |
while (state.KeepRunning()) { | |
for (int i = 0; i < kSz; i++) { | |
benchmark::DoNotOptimize(linSIMD4(v, k[i]-dist, k[i])); | |
} | |
} | |
} | |
BENCHMARK(BM_SIMD4)->RangeMultiplier(M)->Ranges(R); | |
int linSIMD5(const vi& arr, const int guessIx, const int x) { | |
auto vecX = _mm_set1_epi32(x); | |
const int *ptr = arr.data(); | |
int i = guessIx; | |
// unaligned start | |
int misalignment = (uintptr_t)(ptr + i) & 15; | |
auto arrVec = _mm_loadu_si128((__m128i*)(ptr + i)); | |
auto cmp = _mm_cmpgt_epi32(vecX, arrVec); | |
int mask = _mm_movemask_ps(_mm_castsi128_ps(cmp)) ^ 0xF; | |
if (mask) | |
return i + __builtin_ctz(mask); | |
// continue with aligned part | |
i += (16 - misalignment) / 4; | |
for (; ; i += 16) { | |
auto av0 = _mm_load_si128((__m128i*)(ptr + i)); | |
auto av1 = _mm_load_si128((__m128i*)(ptr + i + 4)); | |
auto av2 = _mm_load_si128((__m128i*)(ptr + i + 8)); | |
auto av3 = _mm_load_si128((__m128i*)(ptr + i + 12)); | |
auto cmp0 = _mm_cmpgt_epi32(vecX, av0); | |
auto cmp1 = _mm_cmpgt_epi32(vecX, av1); | |
auto cmp2 = _mm_cmpgt_epi32(vecX, av2); | |
auto cmp3 = _mm_cmpgt_epi32(vecX, av3); | |
auto cmp = _mm_packs_epi16(_mm_packs_epi32(cmp0, cmp1), _mm_packs_epi32(cmp2, cmp3)); | |
int mask = _mm_movemask_epi8(cmp); | |
if (mask != 0xFFFF) | |
return i + __builtin_ctz(~mask); | |
} | |
} | |
static void BM_SIMD5(benchmark::State& state) { | |
using std::cerr; using std::endl; | |
const int vSz = 1000, kSz = 64, dist = state.range(0), seed = 42; | |
vi v(vSz+4, std::numeric_limits<V>::max()); | |
std::iota(v.begin(), v.begin() + vSz, 0); | |
vi v2(v.begin() + dist, v.begin() + vSz); | |
std::shuffle(v2.begin(), v2.end(), std::default_random_engine(seed)); | |
const vi k(v2.begin(), v2.begin() + kSz); | |
for (int i = 0; i < kSz; i++) { if (k[i] != linSIMD5(v,k[i]-dist,k[i])) exit(1);} | |
while (state.KeepRunning()) { | |
for (int i = 0; i < kSz; i++) { | |
benchmark::DoNotOptimize(linSIMD5(v, k[i]-dist, k[i])); | |
} | |
} | |
} | |
BENCHMARK(BM_SIMD5)->RangeMultiplier(M)->Ranges(R); | |
/* | |
int linSIMD6(const vi& arr, const int guessIx, const int x) { | |
static const int reorder[32] = { 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27, 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 }; | |
auto vecX = _mm_set1_epi32(x - 1); | |
auto vecXX = _mm256_set1_epi32(x - 1); | |
const int *ptr = arr.data(); | |
int i = guessIx; | |
// unaligned start | |
int misalignment = (uintptr_t)(ptr + i) & 15; | |
auto arrVec = _mm_loadu_si128((__m128i*)(ptr + i)); | |
auto cmp = _mm_cmpgt_epi32(arrVec, vecX); | |
int mask = _mm_movemask_ps(_mm_castsi128_ps(cmp)); | |
if (mask) | |
return i + __builtin_ctz(mask); | |
// continue with 16-aligned part | |
i += (16 - misalignment) / 4; | |
// if 32-unaligned, do an other 16-byte block | |
if ((uintptr_t)(ptr + i) & 31) { | |
auto arrVec = _mm_loadu_si128((__m128i*)(ptr + i)); | |
auto cmp = _mm_cmpgt_epi32(arrVec, vecX); | |
int mask = _mm_movemask_ps(_mm_castsi128_ps(cmp)); | |
if (mask) | |
return i + __builtin_ctz(mask); | |
i += 4; | |
} | |
// 32-aligned main loop | |
for (; ; i += 32) { | |
auto av0 = _mm256_load_si256((__m256i*)(ptr + i)); | |
auto av1 = _mm256_load_si256((__m256i*)(ptr + i + 8)); | |
auto av2 = _mm256_load_si256((__m256i*)(ptr + i + 16)); | |
auto av3 = _mm256_load_si256((__m256i*)(ptr + i + 24)); | |
auto cmp0 = _mm256_cmpgt_epi32(av0, vecXX); | |
auto cmp1 = _mm256_cmpgt_epi32(av1, vecXX); | |
auto cmp2 = _mm256_cmpgt_epi32(av2, vecXX); | |
auto cmp3 = _mm256_cmpgt_epi32(av3, vecXX); | |
auto cmp = _mm256_packs_epi16(_mm256_packs_epi32(cmp0, cmp1), _mm256_packs_epi32(cmp2, cmp3)); | |
int mask = _mm256_movemask_epi8(cmp); | |
if (mask) | |
return i + reorder[__builtin_ctz(mask)]; | |
} | |
} | |
static void BM_SIMD6(benchmark::State& state) { | |
using std::cerr; using std::endl; | |
const int vSz = 1000, kSz = 64, dist = state.range(0), seed = 42; | |
vi v(vSz+4, std::numeric_limits<V>::max()); | |
std::iota(v.begin(), v.begin() + vSz, 0); | |
vi v2(v.begin() + dist, v.begin() + vSz); | |
std::shuffle(v2.begin(), v2.end(), std::default_random_engine(seed)); | |
const vi k(v2.begin(), v2.begin() + kSz); | |
for (int i = 0; i < kSz; i++) { if (k[i] != linSIMD6(v,k[i]-dist,k[i])) exit(1);} | |
while (state.KeepRunning()) { | |
for (int i = 0; i < kSz; i++) { | |
benchmark::DoNotOptimize(linSIMD6(v, k[i]-dist, k[i])); | |
} | |
} | |
} | |
BENCHMARK(BM_SIMD6)->RangeMultiplier(M)->Ranges(R); | |
*/ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment