Skip to content

Instantly share code, notes, and snippets.

@bstaletic
Created June 4, 2021 16:32
Show Gist options
  • Save bstaletic/4674d95f91971c8b7444b1976753205c to your computer and use it in GitHub Desktop.
Save bstaletic/4674d95f91971c8b7444b1976753205c to your computer and use it in GitHub Desktop.
digit counting benchmark

Besides what /u/adnukator said:

  1. Instead of your nested loop, tell the benchmark framework what arguments you need.
  2. Lookup tables perform better on monotonically increasing numbers. I think a better comparison would be a randomized array of values from which to pick arguments.

Here's what the benchmark looks like:

#include <array>
#include <algorithm>
#include <benchmark/benchmark.h>
#include <cstdint>
#include <math.h>
#include <limits.h>
#include <numeric>
#include <random>
std::array<int, (INT_MAX>>11)> arr;
int int_log2(uint32_t x) { return 31 - __builtin_clz(x|1); }
constexpr int mutliplier = 10;
constexpr int begin = 0;
constexpr int end = INT_MAX;


int slow_digit_count(uint32_t x) {
	return x > 0 ? (int) log10 ((double) x) + 1 : 1;
}

static void BaselineCMath(benchmark::State& state) {
	// Code inside this loop is measured repeatedly
	auto arg = arr[state.range(0)%arr.size()];
	for (auto _ : state) {
		benchmark::DoNotOptimize(slow_digit_count(arg));
	}
}
// Register the function as a benchmark
// BENCHMARK(BaselineCMath)->RangeMultiplier(mutliplier)->Range(begin, end);

int slow_table_lookup(uint32_t x)
{
	static uint32_t table[] = {9, 99, 999, 9999, 99999,
		999999, 9999999, 99999999, 999999999};
	int y = (9 * int_log2(x)) >> 5;
	y += x > table[y];
	return y + 1;
}

static void SlowTableLookup(benchmark::State& state) {
	// Code inside this loop is measured repeatedly
	auto arg = arr[state.range(0)%arr.size()];
	for (auto _ : state) {
		benchmark::DoNotOptimize(slow_table_lookup(arg));
	}
}
// Register the function as a benchmark
BENCHMARK(SlowTableLookup) ->RangeMultiplier(mutliplier)->Range(begin, end);

int fast_digit_count(uint32_t x) {
	static uint64_t table[] = {
		4294967296,  8589934582,  8589934582,  8589934582,  12884901788,
		12884901788, 12884901788, 17179868184, 17179868184, 17179868184,
		21474826480, 21474826480, 21474826480, 21474826480, 25769703776,
		25769703776, 25769703776, 30063771072, 30063771072, 30063771072,
		34349738368, 34349738368, 34349738368, 34349738368, 38554705664,
		38554705664, 38554705664, 41949672960, 41949672960, 41949672960,
		42949672960, 42949672960};
	return (x + table[int_log2(x)]) >> 32;
}

static void FastTableLookup(benchmark::State& state) {
	auto arg = arr[state.range(0)%arr.size()];
	for (auto _ : state) {
		benchmark::DoNotOptimize(fast_digit_count(arg));
	}
}
// Register the function as a benchmark
BENCHMARK(FastTableLookup)->RangeMultiplier(mutliplier)->Range(begin, end);


int straight_lut(uint32_t x) {
	return (x < 10 ? 1 :
			(x < 100 ? 2 :
			 (x < 1000 ? 3 :
			  (x < 10000 ? 4 :
			   (x < 100000 ? 5 :
			    (x < 1000000 ? 6 :
			     (x < 10000000 ? 7 :
			      (x < 100000000 ? 8 :
			       (x < 1000000000 ? 9 :
				10)))))))));
}

static void StraightLookupTable(benchmark::State& state) {
	// Code inside this loop is measured repeatedly
	auto arg = arr[state.range(0)%arr.size()];
	for (auto _ : state) {
		benchmark::DoNotOptimize(straight_lut(arg));
	}
}
// Register the function as a benchmark
BENCHMARK(StraightLookupTable)->RangeMultiplier(mutliplier)->Range(begin, end);
int main(int argc, char **argv) {
	std::iota(arr.begin(), arr.end(), 0);
	std::shuffle(arr.begin(), arr.end(), std::random_device{});
	::benchmark ::Initialize(&argc, argv);
	if (::benchmark ::ReportUnrecognizedArguments(argc, argv))
		return 1;
	::benchmark ::RunSpecifiedBenchmarks();
}

You can tune the benchmark parameters with the global variables at the top.

Compiling that with g++ -O3 -march=sandybridge bench.cpp -lbenchmark produces the following table:

-------------------------------------------------------------------------
Benchmark                               Time             CPU   Iterations
-------------------------------------------------------------------------
SlowTableLookup/0                   0.603 ns        0.603 ns   1000000000
SlowTableLookup/1                   0.597 ns        0.597 ns   1000000000
SlowTableLookup/10                  0.597 ns        0.597 ns   1000000000
SlowTableLookup/100                 0.597 ns        0.596 ns   1000000000
SlowTableLookup/1000                0.598 ns        0.598 ns   1000000000
SlowTableLookup/10000               0.605 ns        0.605 ns   1000000000
SlowTableLookup/100000              0.605 ns        0.605 ns   1000000000
SlowTableLookup/1000000             0.597 ns        0.597 ns   1000000000
SlowTableLookup/10000000            0.600 ns        0.600 ns   1000000000
SlowTableLookup/100000000           0.599 ns        0.599 ns   1000000000
SlowTableLookup/1000000000          0.596 ns        0.595 ns   1000000000
SlowTableLookup/2147483647          0.602 ns        0.601 ns   1000000000
FastTableLookup/0                   0.297 ns        0.297 ns   1000000000
FastTableLookup/1                   0.303 ns        0.303 ns   1000000000
FastTableLookup/10                  0.297 ns        0.297 ns   1000000000
FastTableLookup/100                 0.298 ns        0.298 ns   1000000000
FastTableLookup/1000                0.299 ns        0.299 ns   1000000000
FastTableLookup/10000               0.298 ns        0.297 ns   1000000000
FastTableLookup/100000              0.298 ns        0.298 ns   1000000000
FastTableLookup/1000000             0.300 ns        0.300 ns   1000000000
FastTableLookup/10000000            0.298 ns        0.298 ns   1000000000
FastTableLookup/100000000           0.296 ns        0.296 ns   1000000000
FastTableLookup/1000000000          0.303 ns        0.302 ns   1000000000
FastTableLookup/2147483647          0.301 ns        0.301 ns   1000000000
StraightLookupTable/0                1.50 ns         1.50 ns    470392906
StraightLookupTable/1                1.50 ns         1.50 ns    473873121
StraightLookupTable/10               1.49 ns         1.49 ns    469648210
StraightLookupTable/100              1.49 ns         1.49 ns    465237086
StraightLookupTable/1000             1.49 ns         1.49 ns    465127451
StraightLookupTable/10000            1.49 ns         1.49 ns    473431062
StraightLookupTable/100000           1.49 ns         1.49 ns    473693746
StraightLookupTable/1000000          1.49 ns         1.49 ns    471659926
StraightLookupTable/10000000         1.51 ns         1.51 ns    465723524
StraightLookupTable/100000000        1.49 ns         1.49 ns    473503358
StraightLookupTable/1000000000       1.49 ns         1.49 ns    470011579
StraightLookupTable/2147483647       1.48 ns         1.48 ns    463515021

Observations:

  1. No quick-bench link, because of the custom main().
  2. The StriaghtLookupTable's time somewhat depends on the size of the array. With only 100'000 elements in the array, it has taken the StraightLookupTable ~1.2ns instead of 1.5ns. An array of only 100 elements meant the StraightLookupTable was approaching FastTableLookup. The other two remained pretty consistent.

So let's get rid of the array and just use an RNG to generate an integer of input on demand.

#include <array>
#include <benchmark/benchmark.h>
#include <cstdint>
#include <math.h>
#include <limits.h>
#include <random>

typedef std::linear_congruential_engine<
uint64_t, 6364136223846793005U, 1442695040888963407U, 0U>
knuth_lcg;  /* Knuth's preferred 64-bit LCG */
std::random_device rdev;
uint64_t       seed = (uint64_t(rdev()) << 32) | rdev();
knuth_lcg rng{seed};

int int_log2(uint32_t x) { return 31 - __builtin_clz(x|1); }

int slow_digit_count(uint32_t x) {
	return x > 0 ? (int) log10 ((double) x) + 1 : 1;
}

static void BaselineCMath(benchmark::State& state) {
	// Code inside this loop is measured repeatedly
	for (auto _ : state) {
		for(int i = 0; i<1000; i++)benchmark::DoNotOptimize(slow_digit_count(rng()));
	}
}
// Register the function as a benchmark
// BENCHMARK(BaselineCMath);

int slow_table_lookup(uint32_t x)
{
	static uint32_t table[] = {9, 99, 999, 9999, 99999,
		999999, 9999999, 99999999, 999999999};
	int y = (9 * int_log2(x)) >> 5;
	y += x > table[y];
	return y + 1;
}

static void SlowTableLookup(benchmark::State& state) {
	// Code inside this loop is measured repeatedly
	for (auto _ : state) {
		for(int i = 0; i<1000; i++)benchmark::DoNotOptimize(slow_table_lookup(rng()));
	}
}
// Register the function as a benchmark
BENCHMARK(SlowTableLookup) ;

int fast_digit_count(uint32_t x) {
	static uint64_t table[] = {
		4294967296,  8589934582,  8589934582,  8589934582,  12884901788,
		12884901788, 12884901788, 17179868184, 17179868184, 17179868184,
		21474826480, 21474826480, 21474826480, 21474826480, 25769703776,
		25769703776, 25769703776, 30063771072, 30063771072, 30063771072,
		34349738368, 34349738368, 34349738368, 34349738368, 38554705664,
		38554705664, 38554705664, 41949672960, 41949672960, 41949672960,
		42949672960, 42949672960};
	return (x + table[int_log2(x)]) >> 32;
}

static void FastTableLookup(benchmark::State& state) {
	for (auto _ : state) {
		for(int i = 0; i<1000; i++)benchmark::DoNotOptimize(fast_digit_count(rng()));
	}
}
// Register the function as a benchmark
BENCHMARK(FastTableLookup);


int straight_lut(uint32_t x) {
	return (x < 10 ? 1 :
		(x < 100 ? 2 :
		(x < 1000 ? 3 :
		(x < 10000 ? 4 :
		(x < 100000 ? 5 :
		(x < 1000000 ? 6 :
		(x < 10000000 ? 7 :
		(x < 100000000 ? 8 :
		(x < 1000000000 ? 9 :
		10)))))))));
}

static void StraightLookupTable(benchmark::State& state) {
	// Code inside this loop is measured repeatedly
	for (auto _ : state) {
		for(int i = 0; i<1000; i++)benchmark::DoNotOptimize(straight_lut(rng()));
	}
}
// Register the function as a benchmark
BENCHMARK(StraightLookupTable);
int main(int argc, char **argv) {
	::benchmark ::Initialize(&argc, argv);
	if (::benchmark ::ReportUnrecognizedArguments(argc, argv))
		return 1;
	::benchmark ::RunSpecifiedBenchmarks();
}

--------------------------------------------------------------
Benchmark                    Time             CPU   Iterations
--------------------------------------------------------------
SlowTableLookup           3158 ns         3157 ns       221601
FastTableLookup           2772 ns         2771 ns       253092
StraightLookupTable       3164 ns         3162 ns       218485

That's what I get after removing the array. I believe the random input is a fair game. Also, if you're optimizing this kind of thing, I think it's a safe bet that you have something else as well fighting for your L0 cache. In the end, don't trust a benchmark you didn't fake yourself.

Note: The second version should be possible to run in quick-bench.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment