Last active
February 27, 2017 21:18
-
-
Save comex/3694b05f1eb6347b8a5d2d3b4dc93e0a to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
'dummy' is just two rdtscps in a row plus some shifting instructions generated to interpret the value. | |
clang-800.0.42.1: | |
smolset_lookup_simd: cycles=26.260086 | |
smolset_lookup_naive_linear: cycles=34.332955 | |
smolset_lookup_linear_unrollable: cycles=33.906475 | |
smolset_lookup_linear_manualunroll: cycles=32.245281 | |
smolset_lookup_binary_manualunroll: cycles=47.629102 | |
smolset_lookup_naive_binary: cycles=41.811521 | |
smolset_lookup_dummy: cycles=23.348474 | |
gcc 6.3.0 | |
smolset_lookup_simd: cycles=29.192634 | |
smolset_lookup_naive_linear: cycles=32.452794 | |
smolset_lookup_linear_unrollable: cycles=37.402911 | |
smolset_lookup_linear_manualunroll: cycles=35.749781 | |
smolset_lookup_binary_manualunroll: cycles=44.206753 | |
smolset_lookup_naive_binary: cycles=43.939348 | |
smolset_lookup_dummy: cycles=23.582586 | |
(there's a lot of variability but they're usually about equal between the compilers) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <immintrin.h> | |
#include <x86intrin.h> | |
#include <stdint.h> | |
#include <stdbool.h> | |
#include <string.h> | |
#include <stdio.h> | |
#include <assert.h> | |
#define STATIC static | |
#define black_box(x) ({ \ | |
__auto_type _x = (x); \ | |
asm ("" : "+r"(_x)); \ | |
_x; \ | |
}) | |
struct smolset { | |
union { | |
__m256i simd; | |
uint32_t array[8]; | |
} keys; | |
uint32_t count; | |
} __attribute__((aligned(32))); | |
STATIC int smolset_lookup_simd(const struct smolset *ss, uint32_t key) { | |
__m256i cmp = _mm256_set1_epi32((int) key); // each element is key | |
__m256i eqs = _mm256_cmpeq_epi32(ss->keys.simd, cmp); // each element is -1 if equal or 0 | |
uint32_t bits = (uint32_t) _mm256_movemask_ps((__m256) eqs); // low 8 bits are 1 if equal or 0 | |
uint32_t count = ss->count; | |
unsigned trailing_zeros = _tzcnt_u32(bits); | |
return trailing_zeros >= count ? -1 : trailing_zeros; | |
} | |
STATIC int smolset_lookup_naive_linear(const struct smolset *ss, uint32_t key) { | |
for (unsigned i = 0; i < ss->count; i++) | |
if (ss->keys.array[i] == key) | |
return i; | |
return -1; | |
} | |
STATIC int smolset_lookup_linear_unrollable(const struct smolset *ss, uint32_t key) { | |
for (unsigned i = 0; i < 8; i++) { | |
if (ss->keys.array[i] == key && i < ss->count) | |
return i; | |
} | |
return -1; | |
} | |
#define X_0to7 X(0) X(1) X(2) X(3) X(4) X(5) X(6) X(7) | |
STATIC int smolset_lookup_linear_manualunroll(const struct smolset *ss, uint32_t key) { | |
#define X(i) bool eq##i = ss->keys.array[i] == key; | |
X_0to7 | |
#undef X | |
#define X(i) if (eq##i && i < ss->count) return i; | |
X_0to7 | |
#undef X | |
return -1; | |
} | |
STATIC int smolset_lookup_naive_binary(const struct smolset *ss, uint32_t key) { | |
unsigned lo = 0, hi = ss->count; | |
while (lo < hi) { | |
unsigned mid = (lo + hi) / 2; | |
uint32_t this_key = ss->keys.array[mid]; | |
//printf("lo=%u hi=%u mid=%u key=0x%x this_key=0x%x\n", lo, hi, mid, key, this_key); | |
if (this_key == key) | |
return mid; | |
else if (this_key < key) | |
lo = mid + 1; | |
else | |
hi = mid; | |
} | |
return -1; | |
} | |
#define Y_0to2 Y() Y() Y() | |
STATIC int smolset_lookup_binary_manualunroll(const struct smolset *ss, uint32_t key) { | |
unsigned count = ss->count, lo = 0, hi = count, mid; | |
uint32_t this_key; | |
#define Y() \ | |
mid = (lo + hi) / 2; \ | |
this_key = ss->keys.array[mid]; \ | |
/*printf("lo=%d hi=%d mid=%d this_key=%x key=%x\n", lo, hi, mid, this_key, key);*/ \ | |
if (this_key < key) \ | |
lo = (mid + 1) & 7; \ | |
else if (this_key > key) \ | |
hi = mid; | |
Y() Y() Y() Y() | |
if (this_key == key && mid < count) | |
return mid; | |
else | |
return -1; | |
} | |
STATIC int smolset_lookup_dummy(const struct smolset *ss, uint32_t key) { | |
return -1; | |
} | |
// These are not currently optimized, at all | |
static void smolset_init(struct smolset *ss) { | |
memset(ss, 0, sizeof(*ss)); | |
} | |
static void smolset_add(struct smolset *ss, uint32_t val) { | |
if (ss->count == 8) | |
abort(); | |
int i; | |
for (i = 0; i < ss->count && ss->keys.array[i] < val; i++) | |
; | |
memmove(&ss->keys.array[i+1], &ss->keys.array[i], (ss->count - i) * sizeof(uint32_t)); | |
ss->keys.array[i] = val; | |
ss->count++; | |
} | |
static void smolset_print(const struct smolset *ss) { | |
printf("[%u] ", ss->count); | |
for (uint32_t i = 0; i < ss->count; i++) | |
printf("0x%x, ", ss->keys.array[i]); | |
printf("\n"); | |
} | |
#define TEST_ARRAY_COUNT 32 | |
#define TEST_COUNT 1048576 | |
#define TIME_START_AT_TEST 100000 | |
static_assert(TIME_START_AT_TEST < TEST_COUNT, "lol"); | |
static uint64_t lolseed; | |
static void lolsrand(uint64_t val) { | |
lolseed = val; | |
} | |
static uint32_t lolrand() { | |
uint64_t prev = lolseed; | |
lolseed = prev * 2862933555777941757 + 3037000493; | |
return (uint32_t) prev; | |
} | |
__attribute__((always_inline)) | |
static inline uint64_t test(bool check, int (*func)(const struct smolset *, uint32_t)) { | |
lolsrand(0xdeadbeef); | |
struct smolset test_sets[TEST_ARRAY_COUNT]; | |
for (int i = 0; i < TEST_ARRAY_COUNT; i++) { | |
smolset_init(&test_sets[i]); | |
int count = lolrand() % 9; | |
for (int j = 0; j < count; j++) | |
smolset_add(&test_sets[i], lolrand()); | |
} | |
uint32_t checksum = 0; | |
uint64_t total_cycles = 0; | |
for (int i = 0; i < TEST_COUNT; i++) { | |
unsigned which_set = lolrand() % 8; | |
// either a present or absent value | |
uint32_t val; | |
const struct smolset *ts = &test_sets[which_set]; | |
switch (lolrand() % 3) { | |
case 0: val = lolrand(); break; | |
case 1: val = 0; break; | |
case 2: val = ts->keys.array[ts->count ? (lolrand() % ts->count) : 0]; break; | |
} | |
which_set = black_box(which_set); | |
val = black_box(val); | |
unsigned aux; | |
uint64_t start = __rdtscp(&aux); | |
asm volatile(""); | |
int ret = func(&test_sets[which_set], val); | |
asm volatile(""); | |
uint64_t cycles = __rdtscp(&aux) - start; | |
if (0) { | |
int check = smolset_lookup_naive_linear(&test_sets[which_set], val); | |
if (ret != check) { | |
smolset_print(&test_sets[which_set]); | |
printf("val=%x ret=%d check=%d\n", val, ret, check); | |
abort(); | |
} | |
} | |
checksum += (uint32_t) ret; | |
if (i >= TIME_START_AT_TEST) | |
total_cycles += cycles; | |
} | |
if (check && checksum != 0xfffa78b0) { | |
printf("bad checksum\n"); | |
abort(); | |
} | |
return total_cycles; | |
} | |
#define CYCLES 10 | |
#define EACH_FUNC(x) \ | |
x(true, smolset_lookup_simd) \ | |
x(true, smolset_lookup_naive_linear) \ | |
x(true, smolset_lookup_linear_unrollable) \ | |
x(true, smolset_lookup_linear_manualunroll) \ | |
x(true, smolset_lookup_binary_manualunroll) \ | |
x(true, smolset_lookup_naive_binary) \ | |
x(false, smolset_lookup_dummy) | |
int main() { | |
uint64_t results[10] = {0}; | |
for (int i = 0; i < CYCLES; i++) { | |
int j = 0; | |
#define X(check, name) results[j++] += test(check, name); | |
EACH_FUNC(X) | |
#undef X | |
} | |
{ | |
int j = 0; | |
#define X(check, name) printf("%40s: cycles=%f\n", #name, (double) results[j++] / (CYCLES * TEST_COUNT)); | |
EACH_FUNC(X) | |
#undef X | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment