Skip to content

Instantly share code, notes, and snippets.

@taowen
Created January 3, 2018 15:51
Show Gist options
  • Save taowen/627390f1db2eb658cabae02b43990193 to your computer and use it in GitHub Desktop.
Save taowen/627390f1db2eb658cabae02b43990193 to your computer and use it in GitHub Desktop.
#include <numeric>
#include <algorithm>
#include <random>
#include <vector>
#include <iostream>
#include <limits>
typedef int V;
typedef std::vector<V> vi;
using SearchFn = long(const vi&, long, V);
template <SearchFn* f>
static void BM_Search(benchmark::State& state) {
using std::cerr; using std::endl;
const int vSz = 1000, kSz = 64, dist = state.range(0), seed = 42;
vi v(vSz+4, std::numeric_limits<V>::max());
std::iota(v.begin(), v.begin() + vSz, 0);
vi v2(v.begin() + dist, v.begin() + vSz);
std::shuffle(v2.begin(), v2.end(), std::default_random_engine(seed));
const vi k(v2.begin(), v2.begin() + kSz);
for (int i = 0; i < kSz; i++) { if (k[i] != f(v,k[i]-dist,k[i])) exit(1);}
while (state.KeepRunning()) {
for (int i = 0; i < kSz; i++) {
benchmark::DoNotOptimize(f(v, k[i]-dist, k[i]));
}
}
}
#define R {{128,930}}
#define M 4
template<int roll>
long linUnroll(const vi& arr, const long guessIx, const V x) {
for (long i = guessIx;; i += roll)
for (long j = 0; j < roll; j++)
if (arr[i+j] >= x) return i+j;
}
//BENCHMARK_TEMPLATE(BM_Search, linUnroll<8>)->RangeMultiplier(M)->Ranges(R);
//BENCHMARK_TEMPLATE(BM_Search, linUnroll<32>)->RangeMultiplier(M)->Ranges(R);
long linSIMD(const vi& arr, const long guessIx, const V x) {
using v4 = V __attribute__ ((vector_size (4*4)));
using dv2 = int64_t __attribute__ ((vector_size (4*4)));
constexpr int roll = 4;
constexpr union {
int32_t i32[2];
int64_t i64;
} skip = {-2,-2};
v4 xVec = {x,x,x,x};
for (int i = guessIx;; i += roll) {
v4 arrVec;
for (long j = 0; j < 4; j++) arrVec[j] = arr[i+j];
union {
v4 i32;
dv2 i64;
} cmpVec = {arrVec < xVec};
v4 cmpVec2 = {cmpVec.i32[3], cmpVec.i32[2], cmpVec.i32[1],cmpVec.i32[0]};
cmpVec.i32 += cmpVec2;
if (cmpVec.i64[0] == skip.i64) continue;
return i - cmpVec.i32[0] - cmpVec.i32[1];
}
}
static void BM_SIMD(benchmark::State& state) {
using std::cerr; using std::endl;
const int vSz = 1000, kSz = 64, dist = state.range(0), seed = 42;
vi v(vSz+4, std::numeric_limits<V>::max());
std::iota(v.begin(), v.begin() + vSz, 0);
vi v2(v.begin() + dist, v.begin() + vSz);
std::shuffle(v2.begin(), v2.end(), std::default_random_engine(seed));
const vi k(v2.begin(), v2.begin() + kSz);
for (int i = 0; i < kSz; i++) { if (k[i] != linSIMD(v,k[i]-dist,k[i])) exit(1);}
while (state.KeepRunning()) {
for (int i = 0; i < kSz; i++) {
benchmark::DoNotOptimize(linSIMD(v, k[i]-dist, k[i]));
}
}
}
//BENCHMARK(BM_SIMD)->RangeMultiplier(M)->Ranges(R);
long lin2(const vi& arr, const long guessIx, const V x) {
using v4 = V __attribute__ ((vector_size (4*4)));
using dv2 = int64_t __attribute__ ((vector_size (4*4)));
constexpr int roll = 4;
constexpr union {
int32_t i32[2];
int64_t i64;
} skip = {-2,-2};
v4 xVec = {x,x,x,x};
for (int i = guessIx;;) {
for (int k=0;k<2;k++,i+=roll) {
v4 arrVec;
for (long j = 0; j < 4; j++) arrVec[j] = arr[i+j];
union {
v4 i32;
dv2 i64;
} cmpVec = {arrVec < xVec};
v4 cmpVec2 = {cmpVec.i32[3], cmpVec.i32[2], cmpVec.i32[1],cmpVec.i32[0]};
cmpVec.i32 += cmpVec2;
if (cmpVec.i64[0] == skip.i64) continue;
return i - cmpVec.i32[0] - cmpVec.i32[1];
}
}
}
static void BM2(benchmark::State& state) {
using std::cerr; using std::endl;
const int vSz = 1000, kSz = 64, dist = state.range(0), seed = 42;
vi v(vSz+4, std::numeric_limits<V>::max());
std::iota(v.begin(), v.begin() + vSz, 0);
vi v2(v.begin() + dist, v.begin() + vSz);
std::shuffle(v2.begin(), v2.end(), std::default_random_engine(seed));
const vi k(v2.begin(), v2.begin() + kSz);
for (int i = 0; i < kSz; i++) { if (k[i] != lin2(v,k[i]-dist,k[i])) exit(1);}
while (state.KeepRunning()) {
for (int i = 0; i < kSz; i++) {
benchmark::DoNotOptimize(lin2(v, k[i]-dist, k[i]));
}
}
}
//BENCHMARK(BM2)->RangeMultiplier(M)->Ranges(R);
int linSIMD2(const vi& arr, const int guessIx, const int x) {
auto vecX = _mm_set1_epi32(x - 1);
const int *ptr = arr.data();
for (int i = guessIx; true; i += 4) {
auto arrVec = _mm_loadu_si128((__m128i*)(ptr + i));
auto cmp = _mm_cmpgt_epi32(arrVec, vecX);
int mask = _mm_movemask_ps(_mm_castsi128_ps(cmp));
if (mask)
return i + __builtin_ctz(mask);
}
}
static void BM_SIMD2(benchmark::State& state) {
using std::cerr; using std::endl;
const int vSz = 1000, kSz = 64, dist = state.range(0), seed = 42;
vi v(vSz+4, std::numeric_limits<V>::max());
std::iota(v.begin(), v.begin() + vSz, 0);
vi v2(v.begin() + dist, v.begin() + vSz);
std::shuffle(v2.begin(), v2.end(), std::default_random_engine(seed));
const vi k(v2.begin(), v2.begin() + kSz);
for (int i = 0; i < kSz; i++) { if (k[i] != linSIMD2(v,k[i]-dist,k[i])) exit(1);}
while (state.KeepRunning()) {
for (int i = 0; i < kSz; i++) {
benchmark::DoNotOptimize(linSIMD2(v, k[i]-dist, k[i]));
}
}
}
//BENCHMARK(BM_SIMD2)->RangeMultiplier(M)->Ranges(R);
int linSIMD3(const vi& arr, const int guessIx, const int x) {
auto vecX = _mm_set1_epi32(x - 1);
const int *ptr = arr.data();
for (int i = guessIx; true; i += 16) {
auto av0 = _mm_loadu_si128((__m128i*)(ptr + i));
auto av1 = _mm_loadu_si128((__m128i*)(ptr + i + 4));
auto av2 = _mm_loadu_si128((__m128i*)(ptr + i + 8));
auto av3 = _mm_loadu_si128((__m128i*)(ptr + i + 12));
auto cmp0 = _mm_cmpgt_epi32(av0, vecX);
auto cmp1 = _mm_cmpgt_epi32(av1, vecX);
auto cmp2 = _mm_cmpgt_epi32(av2, vecX);
auto cmp3 = _mm_cmpgt_epi32(av3, vecX);
auto cmp = _mm_packs_epi16(_mm_packs_epi32(cmp0, cmp1), _mm_packs_epi32(cmp2, cmp3));
int mask = _mm_movemask_epi8(cmp);
if (mask)
return i + __builtin_ctz(mask);
}
}
static void BM_SIMD3(benchmark::State& state) {
using std::cerr; using std::endl;
const int vSz = 1000, kSz = 64, dist = state.range(0), seed = 42;
vi v(vSz+4, std::numeric_limits<V>::max());
std::iota(v.begin(), v.begin() + vSz, 0);
vi v2(v.begin() + dist, v.begin() + vSz);
std::shuffle(v2.begin(), v2.end(), std::default_random_engine(seed));
const vi k(v2.begin(), v2.begin() + kSz);
for (int i = 0; i < kSz; i++) { if (k[i] != linSIMD3(v,k[i]-dist,k[i])) exit(1);}
while (state.KeepRunning()) {
for (int i = 0; i < kSz; i++) {
benchmark::DoNotOptimize(linSIMD3(v, k[i]-dist, k[i]));
}
}
}
BENCHMARK(BM_SIMD3)->RangeMultiplier(M)->Ranges(R);
int linSIMD4(const vi& arr, const int guessIx, const int x) {
auto vecX = _mm_set1_epi32(x - 1);
const int *ptr = arr.data();
int i = guessIx;
// unaligned start
int misalignment = (uintptr_t)(ptr + i) & 15;
auto arrVec = _mm_loadu_si128((__m128i*)(ptr + i));
auto cmp = _mm_cmpgt_epi32(arrVec, vecX);
int mask = _mm_movemask_ps(_mm_castsi128_ps(cmp));
if (mask)
return i + __builtin_ctz(mask);
// continue with aligned part
i += (16 - misalignment) / 4;
for (; ; i += 16) {
auto av0 = _mm_load_si128((__m128i*)(ptr + i));
auto av1 = _mm_load_si128((__m128i*)(ptr + i + 4));
auto av2 = _mm_load_si128((__m128i*)(ptr + i + 8));
auto av3 = _mm_load_si128((__m128i*)(ptr + i + 12));
auto cmp0 = _mm_cmpgt_epi32(av0, vecX);
auto cmp1 = _mm_cmpgt_epi32(av1, vecX);
auto cmp2 = _mm_cmpgt_epi32(av2, vecX);
auto cmp3 = _mm_cmpgt_epi32(av3, vecX);
auto cmp = _mm_packs_epi16(_mm_packs_epi32(cmp0, cmp1), _mm_packs_epi32(cmp2, cmp3));
int mask = _mm_movemask_epi8(cmp);
if (mask)
return i + __builtin_ctz(mask);
}
}
static void BM_SIMD4(benchmark::State& state) {
using std::cerr; using std::endl;
const int vSz = 1000, kSz = 64, dist = state.range(0), seed = 42;
vi v(vSz+4, std::numeric_limits<V>::max());
std::iota(v.begin(), v.begin() + vSz, 0);
vi v2(v.begin() + dist, v.begin() + vSz);
std::shuffle(v2.begin(), v2.end(), std::default_random_engine(seed));
const vi k(v2.begin(), v2.begin() + kSz);
for (int i = 0; i < kSz; i++) { if (k[i] != linSIMD4(v,k[i]-dist,k[i])) exit(1);}
while (state.KeepRunning()) {
for (int i = 0; i < kSz; i++) {
benchmark::DoNotOptimize(linSIMD4(v, k[i]-dist, k[i]));
}
}
}
BENCHMARK(BM_SIMD4)->RangeMultiplier(M)->Ranges(R);
int linSIMD5(const vi& arr, const int guessIx, const int x) {
auto vecX = _mm_set1_epi32(x);
const int *ptr = arr.data();
int i = guessIx;
// unaligned start
int misalignment = (uintptr_t)(ptr + i) & 15;
auto arrVec = _mm_loadu_si128((__m128i*)(ptr + i));
auto cmp = _mm_cmpgt_epi32(vecX, arrVec);
int mask = _mm_movemask_ps(_mm_castsi128_ps(cmp)) ^ 0xF;
if (mask)
return i + __builtin_ctz(mask);
// continue with aligned part
i += (16 - misalignment) / 4;
for (; ; i += 16) {
auto av0 = _mm_load_si128((__m128i*)(ptr + i));
auto av1 = _mm_load_si128((__m128i*)(ptr + i + 4));
auto av2 = _mm_load_si128((__m128i*)(ptr + i + 8));
auto av3 = _mm_load_si128((__m128i*)(ptr + i + 12));
auto cmp0 = _mm_cmpgt_epi32(vecX, av0);
auto cmp1 = _mm_cmpgt_epi32(vecX, av1);
auto cmp2 = _mm_cmpgt_epi32(vecX, av2);
auto cmp3 = _mm_cmpgt_epi32(vecX, av3);
auto cmp = _mm_packs_epi16(_mm_packs_epi32(cmp0, cmp1), _mm_packs_epi32(cmp2, cmp3));
int mask = _mm_movemask_epi8(cmp);
if (mask != 0xFFFF)
return i + __builtin_ctz(~mask);
}
}
static void BM_SIMD5(benchmark::State& state) {
using std::cerr; using std::endl;
const int vSz = 1000, kSz = 64, dist = state.range(0), seed = 42;
vi v(vSz+4, std::numeric_limits<V>::max());
std::iota(v.begin(), v.begin() + vSz, 0);
vi v2(v.begin() + dist, v.begin() + vSz);
std::shuffle(v2.begin(), v2.end(), std::default_random_engine(seed));
const vi k(v2.begin(), v2.begin() + kSz);
for (int i = 0; i < kSz; i++) { if (k[i] != linSIMD5(v,k[i]-dist,k[i])) exit(1);}
while (state.KeepRunning()) {
for (int i = 0; i < kSz; i++) {
benchmark::DoNotOptimize(linSIMD5(v, k[i]-dist, k[i]));
}
}
}
BENCHMARK(BM_SIMD5)->RangeMultiplier(M)->Ranges(R);
/*
int linSIMD6(const vi& arr, const int guessIx, const int x) {
static const int reorder[32] = { 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27, 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 };
auto vecX = _mm_set1_epi32(x - 1);
auto vecXX = _mm256_set1_epi32(x - 1);
const int *ptr = arr.data();
int i = guessIx;
// unaligned start
int misalignment = (uintptr_t)(ptr + i) & 15;
auto arrVec = _mm_loadu_si128((__m128i*)(ptr + i));
auto cmp = _mm_cmpgt_epi32(arrVec, vecX);
int mask = _mm_movemask_ps(_mm_castsi128_ps(cmp));
if (mask)
return i + __builtin_ctz(mask);
// continue with 16-aligned part
i += (16 - misalignment) / 4;
// if 32-unaligned, do an other 16-byte block
if ((uintptr_t)(ptr + i) & 31) {
auto arrVec = _mm_loadu_si128((__m128i*)(ptr + i));
auto cmp = _mm_cmpgt_epi32(arrVec, vecX);
int mask = _mm_movemask_ps(_mm_castsi128_ps(cmp));
if (mask)
return i + __builtin_ctz(mask);
i += 4;
}
// 32-aligned main loop
for (; ; i += 32) {
auto av0 = _mm256_load_si256((__m256i*)(ptr + i));
auto av1 = _mm256_load_si256((__m256i*)(ptr + i + 8));
auto av2 = _mm256_load_si256((__m256i*)(ptr + i + 16));
auto av3 = _mm256_load_si256((__m256i*)(ptr + i + 24));
auto cmp0 = _mm256_cmpgt_epi32(av0, vecXX);
auto cmp1 = _mm256_cmpgt_epi32(av1, vecXX);
auto cmp2 = _mm256_cmpgt_epi32(av2, vecXX);
auto cmp3 = _mm256_cmpgt_epi32(av3, vecXX);
auto cmp = _mm256_packs_epi16(_mm256_packs_epi32(cmp0, cmp1), _mm256_packs_epi32(cmp2, cmp3));
int mask = _mm256_movemask_epi8(cmp);
if (mask)
return i + reorder[__builtin_ctz(mask)];
}
}
static void BM_SIMD6(benchmark::State& state) {
using std::cerr; using std::endl;
const int vSz = 1000, kSz = 64, dist = state.range(0), seed = 42;
vi v(vSz+4, std::numeric_limits<V>::max());
std::iota(v.begin(), v.begin() + vSz, 0);
vi v2(v.begin() + dist, v.begin() + vSz);
std::shuffle(v2.begin(), v2.end(), std::default_random_engine(seed));
const vi k(v2.begin(), v2.begin() + kSz);
for (int i = 0; i < kSz; i++) { if (k[i] != linSIMD6(v,k[i]-dist,k[i])) exit(1);}
while (state.KeepRunning()) {
for (int i = 0; i < kSz; i++) {
benchmark::DoNotOptimize(linSIMD6(v, k[i]-dist, k[i]));
}
}
}
BENCHMARK(BM_SIMD6)->RangeMultiplier(M)->Ranges(R);
*/
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment