Besides what /u/adnukator said:
- Instead of your nested loop, tell the benchmark framework what arguments you need.
- Lookup tables perform better on monotonically increasing numbers. I think a better comparison would be a randomized array of values from which to pick arguments.
Here's what the benchmark looks like:
#include <array>
#include <algorithm>
#include <benchmark/benchmark.h>
#include <cstdint>
#include <math.h>
#include <limits.h>
#include <numeric>
#include <random>
std::array<int, (INT_MAX>>11)> arr;
int int_log2(uint32_t x) { return 31 - __builtin_clz(x|1); }
constexpr int mutliplier = 10;
constexpr int begin = 0;
constexpr int end = INT_MAX;
int slow_digit_count(uint32_t x) {
return x > 0 ? (int) log10 ((double) x) + 1 : 1;
}
static void BaselineCMath(benchmark::State& state) {
// Code inside this loop is measured repeatedly
auto arg = arr[state.range(0)%arr.size()];
for (auto _ : state) {
benchmark::DoNotOptimize(slow_digit_count(arg));
}
}
// Register the function as a benchmark
// BENCHMARK(BaselineCMath)->RangeMultiplier(mutliplier)->Range(begin, end);
int slow_table_lookup(uint32_t x)
{
static uint32_t table[] = {9, 99, 999, 9999, 99999,
999999, 9999999, 99999999, 999999999};
int y = (9 * int_log2(x)) >> 5;
y += x > table[y];
return y + 1;
}
static void SlowTableLookup(benchmark::State& state) {
// Code inside this loop is measured repeatedly
auto arg = arr[state.range(0)%arr.size()];
for (auto _ : state) {
benchmark::DoNotOptimize(slow_table_lookup(arg));
}
}
// Register the function as a benchmark
BENCHMARK(SlowTableLookup) ->RangeMultiplier(mutliplier)->Range(begin, end);
int fast_digit_count(uint32_t x) {
static uint64_t table[] = {
4294967296, 8589934582, 8589934582, 8589934582, 12884901788,
12884901788, 12884901788, 17179868184, 17179868184, 17179868184,
21474826480, 21474826480, 21474826480, 21474826480, 25769703776,
25769703776, 25769703776, 30063771072, 30063771072, 30063771072,
34349738368, 34349738368, 34349738368, 34349738368, 38554705664,
38554705664, 38554705664, 41949672960, 41949672960, 41949672960,
42949672960, 42949672960};
return (x + table[int_log2(x)]) >> 32;
}
static void FastTableLookup(benchmark::State& state) {
auto arg = arr[state.range(0)%arr.size()];
for (auto _ : state) {
benchmark::DoNotOptimize(fast_digit_count(arg));
}
}
// Register the function as a benchmark
BENCHMARK(FastTableLookup)->RangeMultiplier(mutliplier)->Range(begin, end);
int straight_lut(uint32_t x) {
return (x < 10 ? 1 :
(x < 100 ? 2 :
(x < 1000 ? 3 :
(x < 10000 ? 4 :
(x < 100000 ? 5 :
(x < 1000000 ? 6 :
(x < 10000000 ? 7 :
(x < 100000000 ? 8 :
(x < 1000000000 ? 9 :
10)))))))));
}
static void StraightLookupTable(benchmark::State& state) {
// Code inside this loop is measured repeatedly
auto arg = arr[state.range(0)%arr.size()];
for (auto _ : state) {
benchmark::DoNotOptimize(straight_lut(arg));
}
}
// Register the function as a benchmark
BENCHMARK(StraightLookupTable)->RangeMultiplier(mutliplier)->Range(begin, end);
int main(int argc, char **argv) {
std::iota(arr.begin(), arr.end(), 0);
std::shuffle(arr.begin(), arr.end(), std::random_device{});
::benchmark ::Initialize(&argc, argv);
if (::benchmark ::ReportUnrecognizedArguments(argc, argv))
return 1;
::benchmark ::RunSpecifiedBenchmarks();
}
You can tune the benchmark parameters with the global variables at the top.
Compiling that with g++ -O3 -march=sandybridge bench.cpp -lbenchmark
produces the following table:
-------------------------------------------------------------------------
Benchmark Time CPU Iterations
-------------------------------------------------------------------------
SlowTableLookup/0 0.603 ns 0.603 ns 1000000000
SlowTableLookup/1 0.597 ns 0.597 ns 1000000000
SlowTableLookup/10 0.597 ns 0.597 ns 1000000000
SlowTableLookup/100 0.597 ns 0.596 ns 1000000000
SlowTableLookup/1000 0.598 ns 0.598 ns 1000000000
SlowTableLookup/10000 0.605 ns 0.605 ns 1000000000
SlowTableLookup/100000 0.605 ns 0.605 ns 1000000000
SlowTableLookup/1000000 0.597 ns 0.597 ns 1000000000
SlowTableLookup/10000000 0.600 ns 0.600 ns 1000000000
SlowTableLookup/100000000 0.599 ns 0.599 ns 1000000000
SlowTableLookup/1000000000 0.596 ns 0.595 ns 1000000000
SlowTableLookup/2147483647 0.602 ns 0.601 ns 1000000000
FastTableLookup/0 0.297 ns 0.297 ns 1000000000
FastTableLookup/1 0.303 ns 0.303 ns 1000000000
FastTableLookup/10 0.297 ns 0.297 ns 1000000000
FastTableLookup/100 0.298 ns 0.298 ns 1000000000
FastTableLookup/1000 0.299 ns 0.299 ns 1000000000
FastTableLookup/10000 0.298 ns 0.297 ns 1000000000
FastTableLookup/100000 0.298 ns 0.298 ns 1000000000
FastTableLookup/1000000 0.300 ns 0.300 ns 1000000000
FastTableLookup/10000000 0.298 ns 0.298 ns 1000000000
FastTableLookup/100000000 0.296 ns 0.296 ns 1000000000
FastTableLookup/1000000000 0.303 ns 0.302 ns 1000000000
FastTableLookup/2147483647 0.301 ns 0.301 ns 1000000000
StraightLookupTable/0 1.50 ns 1.50 ns 470392906
StraightLookupTable/1 1.50 ns 1.50 ns 473873121
StraightLookupTable/10 1.49 ns 1.49 ns 469648210
StraightLookupTable/100 1.49 ns 1.49 ns 465237086
StraightLookupTable/1000 1.49 ns 1.49 ns 465127451
StraightLookupTable/10000 1.49 ns 1.49 ns 473431062
StraightLookupTable/100000 1.49 ns 1.49 ns 473693746
StraightLookupTable/1000000 1.49 ns 1.49 ns 471659926
StraightLookupTable/10000000 1.51 ns 1.51 ns 465723524
StraightLookupTable/100000000 1.49 ns 1.49 ns 473503358
StraightLookupTable/1000000000 1.49 ns 1.49 ns 470011579
StraightLookupTable/2147483647 1.48 ns 1.48 ns 463515021
Observations:
- No quick-bench link, because of the custom
main()
. - The
StriaghtLookupTable
's time somewhat depends on the size of the array. With only 100'000 elements in the array, it has taken the StraightLookupTable ~1.2ns instead of 1.5ns. An array of only 100 elements meant theStraightLookupTable
was approachingFastTableLookup
. The other two remained pretty consistent.
So let's get rid of the array and just use an RNG to generate an integer of input on demand.
#include <array>
#include <benchmark/benchmark.h>
#include <cstdint>
#include <math.h>
#include <limits.h>
#include <random>
typedef std::linear_congruential_engine<
uint64_t, 6364136223846793005U, 1442695040888963407U, 0U>
knuth_lcg; /* Knuth's preferred 64-bit LCG */
std::random_device rdev;
uint64_t seed = (uint64_t(rdev()) << 32) | rdev();
knuth_lcg rng{seed};
int int_log2(uint32_t x) { return 31 - __builtin_clz(x|1); }
int slow_digit_count(uint32_t x) {
return x > 0 ? (int) log10 ((double) x) + 1 : 1;
}
static void BaselineCMath(benchmark::State& state) {
// Code inside this loop is measured repeatedly
for (auto _ : state) {
for(int i = 0; i<1000; i++)benchmark::DoNotOptimize(slow_digit_count(rng()));
}
}
// Register the function as a benchmark
// BENCHMARK(BaselineCMath);
int slow_table_lookup(uint32_t x)
{
static uint32_t table[] = {9, 99, 999, 9999, 99999,
999999, 9999999, 99999999, 999999999};
int y = (9 * int_log2(x)) >> 5;
y += x > table[y];
return y + 1;
}
static void SlowTableLookup(benchmark::State& state) {
// Code inside this loop is measured repeatedly
for (auto _ : state) {
for(int i = 0; i<1000; i++)benchmark::DoNotOptimize(slow_table_lookup(rng()));
}
}
// Register the function as a benchmark
BENCHMARK(SlowTableLookup) ;
int fast_digit_count(uint32_t x) {
static uint64_t table[] = {
4294967296, 8589934582, 8589934582, 8589934582, 12884901788,
12884901788, 12884901788, 17179868184, 17179868184, 17179868184,
21474826480, 21474826480, 21474826480, 21474826480, 25769703776,
25769703776, 25769703776, 30063771072, 30063771072, 30063771072,
34349738368, 34349738368, 34349738368, 34349738368, 38554705664,
38554705664, 38554705664, 41949672960, 41949672960, 41949672960,
42949672960, 42949672960};
return (x + table[int_log2(x)]) >> 32;
}
static void FastTableLookup(benchmark::State& state) {
for (auto _ : state) {
for(int i = 0; i<1000; i++)benchmark::DoNotOptimize(fast_digit_count(rng()));
}
}
// Register the function as a benchmark
BENCHMARK(FastTableLookup);
int straight_lut(uint32_t x) {
return (x < 10 ? 1 :
(x < 100 ? 2 :
(x < 1000 ? 3 :
(x < 10000 ? 4 :
(x < 100000 ? 5 :
(x < 1000000 ? 6 :
(x < 10000000 ? 7 :
(x < 100000000 ? 8 :
(x < 1000000000 ? 9 :
10)))))))));
}
static void StraightLookupTable(benchmark::State& state) {
// Code inside this loop is measured repeatedly
for (auto _ : state) {
for(int i = 0; i<1000; i++)benchmark::DoNotOptimize(straight_lut(rng()));
}
}
// Register the function as a benchmark
BENCHMARK(StraightLookupTable);
int main(int argc, char **argv) {
::benchmark ::Initialize(&argc, argv);
if (::benchmark ::ReportUnrecognizedArguments(argc, argv))
return 1;
::benchmark ::RunSpecifiedBenchmarks();
}
--------------------------------------------------------------
Benchmark Time CPU Iterations
--------------------------------------------------------------
SlowTableLookup 3158 ns 3157 ns 221601
FastTableLookup 2772 ns 2771 ns 253092
StraightLookupTable 3164 ns 3162 ns 218485
That's what I get after removing the array. I believe the random input is a fair game. Also, if you're optimizing this kind of thing, I think it's a safe bet that you have something else as well fighting for your L0 cache. In the end, don't trust a benchmark you didn't fake yourself.
Note: The second version should be possible to run in quick-bench.