Skip to content

Instantly share code, notes, and snippets.

@comex
Last active February 27, 2017 21:18
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save comex/3694b05f1eb6347b8a5d2d3b4dc93e0a to your computer and use it in GitHub Desktop.
Save comex/3694b05f1eb6347b8a5d2d3b4dc93e0a to your computer and use it in GitHub Desktop.
'dummy' is just two rdtscps in a row plus some shifting instructions generated to interpret the value.
clang-800.0.42.1:
smolset_lookup_simd: cycles=26.260086
smolset_lookup_naive_linear: cycles=34.332955
smolset_lookup_linear_unrollable: cycles=33.906475
smolset_lookup_linear_manualunroll: cycles=32.245281
smolset_lookup_binary_manualunroll: cycles=47.629102
smolset_lookup_naive_binary: cycles=41.811521
smolset_lookup_dummy: cycles=23.348474
gcc 6.3.0
smolset_lookup_simd: cycles=29.192634
smolset_lookup_naive_linear: cycles=32.452794
smolset_lookup_linear_unrollable: cycles=37.402911
smolset_lookup_linear_manualunroll: cycles=35.749781
smolset_lookup_binary_manualunroll: cycles=44.206753
smolset_lookup_naive_binary: cycles=43.939348
smolset_lookup_dummy: cycles=23.582586
(there's a lot of variability but they're usually about equal between the compilers)
#include <immintrin.h>
#include <x86intrin.h>
#include <stdint.h>
#include <stdbool.h>
#include <string.h>
#include <stdio.h>
#include <assert.h>
#define STATIC static
#define black_box(x) ({ \
__auto_type _x = (x); \
asm ("" : "+r"(_x)); \
_x; \
})
struct smolset {
union {
__m256i simd;
uint32_t array[8];
} keys;
uint32_t count;
} __attribute__((aligned(32)));
STATIC int smolset_lookup_simd(const struct smolset *ss, uint32_t key) {
__m256i cmp = _mm256_set1_epi32((int) key); // each element is key
__m256i eqs = _mm256_cmpeq_epi32(ss->keys.simd, cmp); // each element is -1 if equal or 0
uint32_t bits = (uint32_t) _mm256_movemask_ps((__m256) eqs); // low 8 bits are 1 if equal or 0
uint32_t count = ss->count;
unsigned trailing_zeros = _tzcnt_u32(bits);
return trailing_zeros >= count ? -1 : trailing_zeros;
}
STATIC int smolset_lookup_naive_linear(const struct smolset *ss, uint32_t key) {
for (unsigned i = 0; i < ss->count; i++)
if (ss->keys.array[i] == key)
return i;
return -1;
}
STATIC int smolset_lookup_linear_unrollable(const struct smolset *ss, uint32_t key) {
for (unsigned i = 0; i < 8; i++) {
if (ss->keys.array[i] == key && i < ss->count)
return i;
}
return -1;
}
#define X_0to7 X(0) X(1) X(2) X(3) X(4) X(5) X(6) X(7)
STATIC int smolset_lookup_linear_manualunroll(const struct smolset *ss, uint32_t key) {
#define X(i) bool eq##i = ss->keys.array[i] == key;
X_0to7
#undef X
#define X(i) if (eq##i && i < ss->count) return i;
X_0to7
#undef X
return -1;
}
STATIC int smolset_lookup_naive_binary(const struct smolset *ss, uint32_t key) {
unsigned lo = 0, hi = ss->count;
while (lo < hi) {
unsigned mid = (lo + hi) / 2;
uint32_t this_key = ss->keys.array[mid];
//printf("lo=%u hi=%u mid=%u key=0x%x this_key=0x%x\n", lo, hi, mid, key, this_key);
if (this_key == key)
return mid;
else if (this_key < key)
lo = mid + 1;
else
hi = mid;
}
return -1;
}
#define Y_0to2 Y() Y() Y()
STATIC int smolset_lookup_binary_manualunroll(const struct smolset *ss, uint32_t key) {
unsigned count = ss->count, lo = 0, hi = count, mid;
uint32_t this_key;
#define Y() \
mid = (lo + hi) / 2; \
this_key = ss->keys.array[mid]; \
/*printf("lo=%d hi=%d mid=%d this_key=%x key=%x\n", lo, hi, mid, this_key, key);*/ \
if (this_key < key) \
lo = (mid + 1) & 7; \
else if (this_key > key) \
hi = mid;
Y() Y() Y() Y()
if (this_key == key && mid < count)
return mid;
else
return -1;
}
STATIC int smolset_lookup_dummy(const struct smolset *ss, uint32_t key) {
return -1;
}
// These are not currently optimized, at all
static void smolset_init(struct smolset *ss) {
memset(ss, 0, sizeof(*ss));
}
static void smolset_add(struct smolset *ss, uint32_t val) {
if (ss->count == 8)
abort();
int i;
for (i = 0; i < ss->count && ss->keys.array[i] < val; i++)
;
memmove(&ss->keys.array[i+1], &ss->keys.array[i], (ss->count - i) * sizeof(uint32_t));
ss->keys.array[i] = val;
ss->count++;
}
static void smolset_print(const struct smolset *ss) {
printf("[%u] ", ss->count);
for (uint32_t i = 0; i < ss->count; i++)
printf("0x%x, ", ss->keys.array[i]);
printf("\n");
}
#define TEST_ARRAY_COUNT 32
#define TEST_COUNT 1048576
#define TIME_START_AT_TEST 100000
static_assert(TIME_START_AT_TEST < TEST_COUNT, "lol");
static uint64_t lolseed;
static void lolsrand(uint64_t val) {
lolseed = val;
}
static uint32_t lolrand() {
uint64_t prev = lolseed;
lolseed = prev * 2862933555777941757 + 3037000493;
return (uint32_t) prev;
}
__attribute__((always_inline))
static inline uint64_t test(bool check, int (*func)(const struct smolset *, uint32_t)) {
lolsrand(0xdeadbeef);
struct smolset test_sets[TEST_ARRAY_COUNT];
for (int i = 0; i < TEST_ARRAY_COUNT; i++) {
smolset_init(&test_sets[i]);
int count = lolrand() % 9;
for (int j = 0; j < count; j++)
smolset_add(&test_sets[i], lolrand());
}
uint32_t checksum = 0;
uint64_t total_cycles = 0;
for (int i = 0; i < TEST_COUNT; i++) {
unsigned which_set = lolrand() % 8;
// either a present or absent value
uint32_t val;
const struct smolset *ts = &test_sets[which_set];
switch (lolrand() % 3) {
case 0: val = lolrand(); break;
case 1: val = 0; break;
case 2: val = ts->keys.array[ts->count ? (lolrand() % ts->count) : 0]; break;
}
which_set = black_box(which_set);
val = black_box(val);
unsigned aux;
uint64_t start = __rdtscp(&aux);
asm volatile("");
int ret = func(&test_sets[which_set], val);
asm volatile("");
uint64_t cycles = __rdtscp(&aux) - start;
if (0) {
int check = smolset_lookup_naive_linear(&test_sets[which_set], val);
if (ret != check) {
smolset_print(&test_sets[which_set]);
printf("val=%x ret=%d check=%d\n", val, ret, check);
abort();
}
}
checksum += (uint32_t) ret;
if (i >= TIME_START_AT_TEST)
total_cycles += cycles;
}
if (check && checksum != 0xfffa78b0) {
printf("bad checksum\n");
abort();
}
return total_cycles;
}
#define CYCLES 10
#define EACH_FUNC(x) \
x(true, smolset_lookup_simd) \
x(true, smolset_lookup_naive_linear) \
x(true, smolset_lookup_linear_unrollable) \
x(true, smolset_lookup_linear_manualunroll) \
x(true, smolset_lookup_binary_manualunroll) \
x(true, smolset_lookup_naive_binary) \
x(false, smolset_lookup_dummy)
int main() {
uint64_t results[10] = {0};
for (int i = 0; i < CYCLES; i++) {
int j = 0;
#define X(check, name) results[j++] += test(check, name);
EACH_FUNC(X)
#undef X
}
{
int j = 0;
#define X(check, name) printf("%40s: cycles=%f\n", #name, (double) results[j++] / (CYCLES * TEST_COUNT));
EACH_FUNC(X)
#undef X
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment