Skip to content

Instantly share code, notes, and snippets.

@dead-claudia
Last active July 29, 2022 19:03
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dead-claudia/2023261ce67aafedcaecf1fc025365d8 to your computer and use it in GitHub Desktop.
Save dead-claudia/2023261ce67aafedcaecf1fc025365d8 to your computer and use it in GitHub Desktop.
Utility to compare various ways of counting zero and (really) non-zero bytes, including exhaustive tests
// Note: there are some Clang-isms here. To compile with GCC, define `__builtin_readcyclecounter`
// to read from something like x86's `rdtsc` and return a 64-bit integer with the cycle count. It's
// always manipulated relative to a prior call, so it doesn't matter if it wraps around once in the
// process.
#define RUN_ONLY_TESTS 1
#define RUN_ONLY_BENCHMARK 2
// Set to RUN_ONLY_TESTS to run only tests, RUN_ONLY_BENCHMARK to run only the benchmark, or unset
// to run both.
// #define RUN_VARIANT RUN_ONLY_BENCHMARK
// #define RUN_VARIANT RUN_ONLY_TESTS
#define BENCHMARK_ROUNDS 1
#define _GNU_SOURCE
#include "sched.h"
#include "stdint.h"
#include "stdio.h"
#include "stdlib.h"
#include "time.h"
#include "unistd.h"
// Very branchy, but follows the definition very closely.
static uint32_t get_expected(uint32_t x) {
if ((x & 0xFFFFFFFF) == 0) return 0;
if ((x & 0xFFFFFF00) == 0) return 1;
if ((x & 0xFFFF00FF) == 0) return 1;
if ((x & 0xFF00FFFF) == 0) return 1;
if ((x & 0x00FFFFFF) == 0) return 1;
if ((x & 0xFFFF0000) == 0) return 2;
if ((x & 0xFF0000FF) == 0) return 2;
if ((x & 0x0000FFFF) == 0) return 2;
if ((x & 0x00FFFF00) == 0) return 2;
if ((x & 0xFF00FF00) == 0) return 2;
if ((x & 0x00FF00FF) == 0) return 2;
if ((x & 0x000000FF) == 0) return 3;
if ((x & 0x0000FF00) == 0) return 3;
if ((x & 0x00FF0000) == 0) return 3;
if ((x & 0xFF000000) == 0) return 3;
return 4;
}
uint32_t naive(uint32_t x) {
x |= x >> 4;
x |= x >> 2;
x |= x >> 1;
x &= 0x01010101;
x += x >> 16;
x += x >> 8;
return x & 255;
}
uint32_t naive2(uint32_t x) {
return ((x >> 24) != 0) + ((x >> 16 & 0xFF) != 0) + ((x >> 8 & 0xFF) != 0) + ((x & 0xFF) != 0);
}
uint32_t opt1(uint32_t x) {
x |= x >> 4;
x |= x >> 2;
x |= x >> 1;
return ((x & 0x01010101) * 0x01010101) >> 24;
}
uint32_t opt2(uint32_t x) {
x |= (x & 0x7F7F7F7F) + 0x7F7F7F7F;
x >>= 7;
return ((x & 0x01010101) * 0x01010101) >> 24;
}
uint32_t opt3(uint32_t x) {
x |= (x & 0x7F7F7F7F) + 0x7F7F7F7F;
return __builtin_popcount(x & 0x80808080);
}
static uint32_t mod255(uint32_t x) {
return (x + x / 255) & 255;
}
#define countmore(x, n) \
(((((x) & ~0UL / 255 * 127) + ~0UL / 255 * (127 - (n)) | (x)) & ~0UL / 255 * 128) / 128 % 255)
#define countmore_with_optimized_mod_255(x, n) \
mod255(((((x) & ~0UL / 255 * 127) + ~0UL / 255 * (127 - (n)) | (x)) & ~0UL / 255 * 128) / 128)
uint32_t seander(uint32_t x) {
return countmore(x, 0);
}
uint32_t seander_opt(uint32_t x) {
return countmore_with_optimized_mod_255(x, 0);
}
uint32_t zero_easy_naive(uint32_t x) {
return 4 - naive(x);
}
uint32_t zero_naive2(uint32_t x) {
return ((x >> 24) == 0) + ((x >> 16 & 0xFF) == 0) + ((x >> 8 & 0xFF) == 0) + ((x & 0xFF) == 0);
}
uint32_t zero_easy_opt1(uint32_t x) {
return 4 - opt1(x);
}
uint32_t zero_easy_opt2(uint32_t x) {
return 4 - opt2(x);
}
uint32_t zero_easy_opt3(uint32_t x) {
return 4 - opt3(x);
}
#define countless(x, n) \
(((~0UL / 255 * (127 + (n)) - ((x) & ~0UL / 255 * 127)) & ~(x) & ~0UL / 255 * 128) / 128 % 255)
#define countless_with_optimized_mod_255(x, n) \
mod255(((~0UL / 255 * (127 + (n)) - ((x) & ~0UL / 255 * 127)) & ~(x) & ~0UL / 255 * 128) / 128)
uint32_t zero_seander(uint32_t x) {
return countless(x, 1);
}
uint32_t zero_seander_opt(uint32_t x) {
return countless_with_optimized_mod_255(x, 1);
}
static const int MAX_FAILS = 20;
static int fails = 0;
static void fail(char *name, uint32_t i, uint32_t found, uint32_t expect) {
fprintf(
stderr,
"%s(%02x %02x %02x %02x) = %u, expected %u\n",
name,
i >> 24 & 0xFF,
i >> 16 & 0xFF,
i >> 8 & 0xFF,
i >> 0 & 0xFF,
found,
expect);
if (++fails == MAX_FAILS) exit(1);
}
#if !defined(RUN_VARIANT) || RUN_VARIANT == RUN_ONLY_TESTS
static void test_results() {
{
uint32_t i = 0;
do {
uint32_t expect = get_expected(i);
uint32_t found = naive(i);
if (found != expect) fail("naive", i, found, expect);
if ((i & 0x00FFFFFF) == 0) fprintf(stderr, "naive: tested %02x 00 00 00\n", i >> 24);
} while (++i != 0);
}
fprintf(stderr, "naive tested\n");
{
uint32_t i = 0;
do {
uint32_t expect = get_expected(i);
uint32_t found = naive2(i);
if (found != expect) fail("naive2", i, found, expect);
if ((i & 0x00FFFFFF) == 0) fprintf(stderr, "naive2: tested %02x 00 00 00\n", i >> 24);
} while (++i != 0);
}
fprintf(stderr, "naive2 tested\n");
{
uint32_t i = 0;
do {
uint32_t expect = get_expected(i);
uint32_t found = opt1(i);
if (found != expect) fail("opt1", i, found, expect);
if ((i & 0x00FFFFFF) == 0) fprintf(stderr, "opt1: tested %02x 00 00 00\n", i >> 24);
} while (++i != 0);
}
fprintf(stderr, "opt1 tested\n");
{
uint32_t i = 0;
do {
uint32_t expect = get_expected(i);
uint32_t found = opt2(i);
if (found != expect) fail("opt2", i, found, expect);
if ((i & 0x00FFFFFF) == 0) fprintf(stderr, "opt2: tested %02x 00 00 00\n", i >> 24);
} while (++i != 0);
}
fprintf(stderr, "opt2 tested\n");
{
uint32_t i = 0;
do {
uint32_t expect = get_expected(i);
uint32_t found = opt3(i);
if (found != expect) fail("opt3", i, found, expect);
if ((i & 0x00FFFFFF) == 0) fprintf(stderr, "opt3: tested %02x 00 00 00\n", i >> 24);
} while (++i != 0);
}
fprintf(stderr, "opt3 tested\n");
{
uint32_t i = 0;
do {
uint32_t expect = get_expected(i);
uint32_t found = seander(i);
if (found != expect) fail("seander", i, found, expect);
if ((i & 0x00FFFFFF) == 0) fprintf(stderr, "seander: tested %02x 00 00 00\n", i >> 24);
} while (++i != 0);
}
fprintf(stderr, "seander countmore tested\n");
{
uint32_t i = 0;
do {
uint32_t expect = 4 - get_expected(i);
uint32_t found = zero_seander(i);
if (found != expect) fail("zero_seander", i, found, expect);
if ((i & 0x00FFFFFF) == 0) {
fprintf(stderr, "zero_seander: tested %02x 00 00 00\n", i >> 24);
}
} while (++i != 0);
}
fprintf(stderr, "seander countless tested\n");
{
uint32_t i = 0;
do {
uint32_t expect = 4 - get_expected(i);
uint32_t found = zero_seander_opt(i);
if (found != expect) fail("zero_seander_opt", i, found, expect);
if ((i & 0x00FFFFFF) == 0) {
fprintf(stderr, "zero_seander_opt: tested %02x 00 00 00\n", i >> 24);
}
} while (++i != 0);
}
fprintf(stderr, "seander countless with optimized modulo tested\n");
{
uint32_t i = 0;
do {
uint32_t expect = 4 - get_expected(i);
uint32_t found = zero_naive2(i);
if (found != expect) fail("zero_naive2", i, found, expect);
if ((i & 0x00FFFFFF) == 0) {
fprintf(stderr, "zero_naive2: tested %02x 00 00 00\n", i >> 24);
}
} while (++i != 0);
}
fprintf(stderr, "zero_naive2 tested\n");
}
#endif // !defined(RUN_VARIANT) || RUN_VARIANT == RUN_ONLY_TESTS
#if !defined(RUN_VARIANT) || RUN_VARIANT != RUN_ONLY_TESTS
#define TO_STRING_INNER(X) #X
#define TO_STRING(X) TO_STRING_INNER(X)
volatile uint32_t threshold_value = 10;
static double get_cycles_per_op(uint64_t cycles) {
return (double)cycles / (double)((UINT64_C(1) << 32) * BENCHMARK_ROUNDS);
}
static uint32_t control_noop(uint32_t _x) {
asm volatile("");
return 0;
}
static uint32_t control_and(uint32_t x) {
return x & 7;
}
static void test_perf() {
#if defined(RUN_VARIANT) && RUN_VARIANT == RUN_ONLY_BENCHMARK
#define perf_test(F) \
fprintf(stderr, "testing " TO_STRING(F) "\n"); \
for (uint32_t i = 0; i < 1 << 16; i++) { \
uint32_t expect = get_expected(i); \
uint32_t found = F(i); \
if (found != expect) fail(TO_STRING(F), i, found, expect); \
}
#define perf_test_zero(F) \
fprintf(stderr, "testing " TO_STRING(F) "\n"); \
for (uint32_t i = 0; i < 1 << 16; i++) { \
uint32_t expect = 4 - get_expected(i); \
uint32_t found = F(i); \
if (found != expect) fail(TO_STRING(F), i, found, expect); \
}
perf_test(naive);
perf_test(naive2);
perf_test(opt1);
perf_test(opt2);
perf_test(opt3);
perf_test(seander);
perf_test(seander_opt);
perf_test_zero(zero_easy_naive);
perf_test_zero(zero_naive2);
perf_test_zero(zero_easy_opt1);
perf_test_zero(zero_easy_opt2);
perf_test_zero(zero_easy_opt3);
perf_test_zero(zero_seander);
perf_test_zero(zero_seander_opt);
fprintf(stderr, "functions tested\n");
#endif // defined(RUN_VARIANT) && RUN_VARIANT == RUN_ONLY_BENCHMARK
// Don't let it move between CPUs.
cpu_set_t cpu_set;
CPU_ZERO(&cpu_set);
CPU_SET(sched_getcpu(), &cpu_set);
sched_setaffinity(getpid(), sizeof(cpu_set), &cpu_set);
uint64_t span, start, end;
uint64_t threshold = threshold_value;
#define perf_run(F) \
for (int i = 0; i < 1 << 24; i++) { \
if (__builtin_expect(F(i) > threshold, 0)) { \
fprintf(stderr, "fail " TO_STRING(F)); \
abort(); \
} \
} \
span = 0; \
for (int j = 0; j < BENCHMARK_ROUNDS; j++) { \
start = __builtin_readcyclecounter(); \
uint32_t i = 0; \
do { \
if (__builtin_expect(F(i) > threshold, 0)) { \
fprintf(stderr, "fail " TO_STRING(F)); \
abort(); \
} \
} while (++i != 0); \
end = __builtin_readcyclecounter(); \
span += end - start; \
}
perf_run(control_noop);
fprintf(stderr, " control noop: %.4f cycles/op\n", get_cycles_per_op(span));
perf_run(control_and);
fprintf(stderr, " control and: %.4f cycles/op\n", get_cycles_per_op(span));
perf_run(naive);
fprintf(stderr, " naive: %.4f cycles/op\n", get_cycles_per_op(span));
perf_run(naive2);
fprintf(stderr, " naive2: %.4f cycles/op\n", get_cycles_per_op(span));
perf_run(opt1);
fprintf(stderr, " opt1: %.4f cycles/op\n", get_cycles_per_op(span));
perf_run(opt2);
fprintf(stderr, " opt2: %.4f cycles/op\n", get_cycles_per_op(span));
perf_run(opt3);
fprintf(stderr, " opt3: %.4f cycles/op\n", get_cycles_per_op(span));
perf_run(seander);
fprintf(stderr, " seander: %.4f cycles/op\n", get_cycles_per_op(span));
perf_run(seander_opt);
fprintf(stderr, " seander_opt: %.4f cycles/op\n", get_cycles_per_op(span));
perf_run(zero_easy_naive);
fprintf(stderr, " zero_easy_naive: %.4f cycles/op\n", get_cycles_per_op(span));
perf_run(zero_naive2);
fprintf(stderr, " zero_naive2: %.4f cycles/op\n", get_cycles_per_op(span));
perf_run(zero_easy_opt1);
fprintf(stderr, " zero_easy_opt1: %.4f cycles/op\n", get_cycles_per_op(span));
perf_run(zero_easy_opt2);
fprintf(stderr, " zero_easy_opt2: %.4f cycles/op\n", get_cycles_per_op(span));
perf_run(zero_easy_opt3);
fprintf(stderr, " zero_easy_opt3: %.4f cycles/op\n", get_cycles_per_op(span));
perf_run(zero_seander);
fprintf(stderr, " zero_seander: %.4f cycles/op\n", get_cycles_per_op(span));
perf_run(zero_seander_opt);
fprintf(stderr, "zero_seander_opt: %.4f cycles/op\n", get_cycles_per_op(span));
}
#endif // !defined(RUN_VARIANT) || RUN_VARIANT != RUN_ONLY_TESTS
int main() {
#if !defined(RUN_VARIANT) || RUN_VARIANT == RUN_ONLY_TESTS
test_results();
#endif // !defined(RUN_VARIANT) || RUN_VARIANT == RUN_ONLY_TESTS
#if !defined(RUN_VARIANT) || RUN_VARIANT != RUN_ONLY_TESTS
test_perf();
#endif // !defined(RUN_VARIANT) || RUN_VARIANT != RUN_ONLY_TESTS
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment