Cache-friendly programming best practices, with google/benchmark results
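The snippets below are several benchmark translation units concatenated into one gist; each repeated #include block marks a file boundary. Building them is assumed to require C++17 (for <execution> and alignas) and linking against google/benchmark.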
#include <benchmark/benchmark.h>
#include <iostream>
#include <random>
#include <algorithm>
#include <vector>
#include <chrono>
#include <iterator>
#include <execution>
#include <cstdlib> // rand
static void count_if_random(benchmark::State& state)
{
  std::vector<int> v(65536);
  // unpredictable input: every element is 1 or -1 at random
  std::generate(std::begin(v), std::end(v), [] {
    return (rand() % 2) ? 1 : -1;
  });
  for (auto _ : state) {
    auto count = std::count_if(
      std::execution::seq,
      std::begin(v), std::end(v),
      [](auto x) {
        return x > 0;
      });
    benchmark::DoNotOptimize(count);
    benchmark::ClobberMemory();
  }
}
BENCHMARK(count_if_random)->Unit(benchmark::kMicrosecond);
static void count_if_sequenced(benchmark::State& state)
{
  std::vector<int> v(65536);
  // perfectly predictable input: elements alternate between 1 and -1
  std::generate(std::begin(v), std::end(v), [n = 0]() mutable {
    return (++n % 2) ? 1 : -1;
  });
  for (auto _ : state) {
    auto count = std::count_if(
      std::execution::seq,
      std::begin(v), std::end(v),
      [](auto x) {
        return x > 0;
      });
    benchmark::DoNotOptimize(count);
    benchmark::ClobberMemory();
  }
}
BENCHMARK(count_if_sequenced)->Unit(benchmark::kMicrosecond);
static void count_if_sorted(benchmark::State& state)
{
  std::vector<int> v(65536);
  std::generate(std::begin(v), std::end(v), [] {
    return (rand() % 2) ? 1 : -1;
  });
  // sorting groups all negatives before all positives, so the x > 0 branch
  // flips only once and the branch predictor almost never misses
  std::sort(v.begin(), v.end());
  for (auto _ : state) {
    auto count = std::count_if(
      std::execution::seq,
      std::begin(v), std::end(v),
      [](auto x) {
        return x > 0;
      });
    benchmark::DoNotOptimize(count);
    benchmark::ClobberMemory();
  }
}
BENCHMARK(count_if_sorted)->Unit(benchmark::kMicrosecond);
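// --- added sketch (not part of the original gist) ---------------------------
// A branchless way to get the same count: summing the 0/1 result of the
// comparison removes the data-dependent branch entirely, so random and sorted
// input should time about the same. The name count_if_branchless is ours.
static void count_if_branchless(benchmark::State& state)
{
  std::vector<int> v(65536);
  std::generate(std::begin(v), std::end(v), [] {
    return (rand() % 2) ? 1 : -1;
  });
  for (auto _ : state) {
    long count = 0;
    for (int x : v)
      count += (x > 0); // the comparison yields 0 or 1; no branch needed
    benchmark::DoNotOptimize(count);
    benchmark::ClobberMemory();
  }
}
BENCHMARK(count_if_branchless)->Unit(benchmark::kMicrosecond);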
namespace branch_prediction
{
  struct price
  {
    virtual ~price() {}
    virtual float getPrice() const noexcept { return 1.0f; }
  };
  struct cheap : public price
  {
    float getPrice() const noexcept override { return 2.0f; }
  };
  struct expensive : public price
  {
    float getPrice() const noexcept override { return 3.14159f; }
  };
} // namespace branch_prediction
static void virtual_call_sequenced(benchmark::State& state)
{
  using namespace branch_prediction;
  std::vector<price*> pricelist;
  // note: each fill_n copies one pointer 10000 times (a single allocation per
  // type); what matters here is that same-type entries sit back to back, so
  // the indirect call target changes only twice over the whole list
  std::fill_n(std::back_inserter(pricelist), 10000, new price);
  std::fill_n(std::back_inserter(pricelist), 10000, new cheap);
  std::fill_n(std::back_inserter(pricelist), 10000, new expensive);
  for (auto _ : state) {
    float sum = 0;
    for (auto* p : pricelist) {
      benchmark::DoNotOptimize(sum += p->getPrice());
    }
    benchmark::ClobberMemory();
  }
}
BENCHMARK(virtual_call_sequenced)->Unit(benchmark::kMicrosecond);
static void virtual_call_shuffled(benchmark::State& state)
{
  using namespace branch_prediction;
  std::vector<price*> pricelist;
  std::random_device rd;
  std::mt19937 g{ rd() };
  std::fill_n(std::back_inserter(pricelist), 10000, new price);
  std::fill_n(std::back_inserter(pricelist), 10000, new cheap);
  std::fill_n(std::back_inserter(pricelist), 10000, new expensive);
  // shuffling randomizes the dynamic-type sequence, so the indirect-branch
  // predictor can no longer guess the next call target
  std::shuffle(pricelist.begin(), pricelist.end(), g);
  for (auto _ : state) {
    float sum = 0;
    for (auto* p : pricelist) {
      benchmark::DoNotOptimize(sum += p->getPrice());
    }
    benchmark::ClobberMemory();
  }
}
BENCHMARK(virtual_call_shuffled)->Unit(benchmark::kMicrosecond);
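// --- added sketch (not part of the original gist) ---------------------------
// If shuffled storage is a given, re-grouping the pointers by dynamic type
// before the hot loop should restore the sequenced timing. std::type_info::
// before gives a consistent ordering over types; the benchmark name is ours.
#include <typeinfo>
static void virtual_call_regrouped(benchmark::State& state)
{
  using namespace branch_prediction;
  std::vector<price*> pricelist;
  std::random_device rd;
  std::mt19937 g{ rd() };
  std::fill_n(std::back_inserter(pricelist), 10000, new price);
  std::fill_n(std::back_inserter(pricelist), 10000, new cheap);
  std::fill_n(std::back_inserter(pricelist), 10000, new expensive);
  std::shuffle(pricelist.begin(), pricelist.end(), g);
  // undo the shuffle's damage: group same-type objects next to each other
  std::sort(pricelist.begin(), pricelist.end(),
            [](const price* lhs, const price* rhs) {
              return typeid(*lhs).before(typeid(*rhs));
            });
  for (auto _ : state) {
    float sum = 0;
    for (auto* p : pricelist) {
      benchmark::DoNotOptimize(sum += p->getPrice());
    }
    benchmark::ClobberMemory();
  }
}
BENCHMARK(virtual_call_regrouped)->Unit(benchmark::kMicrosecond);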
#include <limits>
#include <benchmark/benchmark.h>
#include <type_traits>
#include <chrono>
#include <random>
#include <map>
#include <string>
#include <sstream>
#include <iostream>
#include <iomanip>
#include <execution>
#include <algorithm> // std::generate
#include <cassert>   // assert
#include <cmath>     // sqrt
#include <cstdint>   // uint8_t
#include <cstdlib>   // rand, srand
#include <ctime>     // time
namespace cpu_cache_misses {
  enum class traversal_order {
    row_major,
    column_major
  };
  template <unsigned int size>
  struct random_map
  {
    unsigned int map[size] = { 0 };
    random_map() {
      srand(static_cast<unsigned int>(time(NULL)));
      // rand() is not guaranteed to be thread safe, so fill sequentially
      // rather than with std::execution::par
      std::generate(std::begin(map), std::end(map), [] { return std::rand() % size; });
    }
    unsigned int operator()() const {
      return map[++indx % size];
    }
  private:
    mutable unsigned long long indx = 0;
  };
  const static random_map<8 << 10> random_numbers;
  template <class T>
  class matrix {
  public:
    using type_t = typename std::decay<T>::type;
    using msize_t = unsigned long long;
    explicit matrix(msize_t matrix_size)
    {
      reset(matrix_size);
    }
    ~matrix()
    {
      if (data_ != nullptr) release();
    }
    void reset(msize_t matrix_size)
    {
      if (data_ != nullptr) release();
      matrix_dim = static_cast<msize_t>(sqrt(matrix_size));
      // round the dimension up to an even number
      const auto dim_size = matrix_dim % 2 == 0 ? matrix_dim : matrix_dim + 1;
      row_size = dim_size;
      column_size = dim_size;
      // check that the matrix is square
      assert(row_size == column_size);
      data_ = new type_t*[row_size];
      for (msize_t i = 0; i < row_size; i++)
      {
        data_[i] = new type_t[column_size];
        // set random initial data
        for (msize_t j = 0; j < column_size; j++)
        {
          data_[i][j] = static_cast<type_t>(random_numbers());
        }
      }
    }
    msize_t size() const {
      return matrix_dim * matrix_dim;
    }
    msize_t rows() const {
      return row_size;
    }
    msize_t columns() const {
      return column_size;
    }
    type_t* operator[](msize_t index) {
      return data_[index];
    }
    type_t* operator[](msize_t index) const {
      return data_[index];
    }
  protected:
    void release() {
      // delete the allocated rows, then the row-pointer array itself
      for (msize_t i = 0; i < row_size; i++)
      {
        delete[] data_[i];
      }
      delete[] data_;
      data_ = nullptr;
    }
  private:
    type_t** data_ = nullptr;
    msize_t row_size{ 0 };
    msize_t column_size{ 0 };
    msize_t matrix_dim{ 0 };
  };
} // namespace cpu_cache_misses
using namespace cpu_cache_misses;
static void sum_row_major(benchmark::State& state)
{
  using type = matrix<uint8_t>::msize_t;
  const type matrix_size = (1024 * 1024) * state.range(0);
  static matrix<uint8_t> mat(matrix_size);
  if (matrix_size != mat.size()) {
    mat.reset(matrix_size);
  }
  for (auto _ : state) {
    type sum = 0;
    // row-major traversal walks memory contiguously: every byte of a fetched
    // cache line is used before the line is evicted
    for (type r = 0; r < mat.rows(); ++r) {
      for (type c = 0; c < mat.columns(); ++c) {
        benchmark::DoNotOptimize(sum += mat[r][c]);
      }
    }
    benchmark::ClobberMemory();
  }
}
BENCHMARK(sum_row_major)->DenseRange(1, 128, 2)->Unit(benchmark::kMillisecond);
BENCHMARK(sum_row_major)->DenseRange(128, 1024, 128)->Unit(benchmark::kMillisecond);
static void sum_column_major(benchmark::State& state)
{
  using type = matrix<uint8_t>::msize_t;
  const type matrix_size = (1024 * 1024) * state.range(0);
  static matrix<uint8_t> mat(matrix_size);
  if (matrix_size != mat.size()) {
    mat.reset(matrix_size);
  }
  for (auto _ : state) {
    type sum = 0;
    // column-major traversal jumps a whole row ahead on every access, so each
    // load touches a different cache line, and lines are evicted before their
    // remaining bytes are used
    for (type c = 0; c < mat.columns(); ++c) {
      for (type r = 0; r < mat.rows(); ++r) {
        benchmark::DoNotOptimize(sum += mat[r][c]);
      }
    }
    benchmark::ClobberMemory();
  }
}
BENCHMARK(sum_column_major)->DenseRange(1, 128, 2)->Unit(benchmark::kMillisecond);
BENCHMARK(sum_column_major)->DenseRange(128, 1024, 128)->Unit(benchmark::kMillisecond);
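// --- added sketch (not part of the original gist) ---------------------------
// The matrix above stores each row as a separate heap allocation, which costs
// a pointer chase per row and lets rows land far apart in memory. A single
// contiguous buffer, sketched here under the same row-major convention,
// removes both issues; flat_matrix is our name, not part of the gist.
#include <vector>
#include <cstddef>
template <class T>
class flat_matrix {
public:
  explicit flat_matrix(std::size_t dim) : dim_(dim), data_(dim * dim) {}
  // element (r, c) lives at offset r * dim_ + c: rows are adjacent in memory
  T& at(std::size_t r, std::size_t c) { return data_[r * dim_ + c]; }
  const T& at(std::size_t r, std::size_t c) const { return data_[r * dim_ + c]; }
  std::size_t dim() const { return dim_; }
private:
  std::size_t dim_;
  std::vector<T> data_; // one allocation, row-major
};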
#include <benchmark/benchmark.h>
#include <atomic>
#include <thread>
#include <iostream>
#include <random>
#include <algorithm>
#include <vector>
#include <chrono>
template <typename input_t>
void work(input_t& a)
{
  for (int i = 0; i < 10000000; ++i)
    a++;
}
// same total work, but the hot loop increments a thread_local counter and the
// shared atomic is touched only once per thread, at the very end
void work_with_sentinel(std::atomic<int>& a)
{
  thread_local unsigned int sentinel = 0;
  for (int i = 0; i < 10000000; ++i)
    ++sentinel;
  a += sentinel;
}
namespace sharing_single_atomic
{
  inline int test_1()
  {
    std::atomic<int> a{ 0 };
    std::thread t1([&] { work(a); });
    t1.join();
    return a;
  }
  inline int test_2()
  {
    std::atomic<int> a{ 0 };
    std::thread t1([&] { work(a); });
    std::thread t2([&] { work(a); });
    t1.join(); t2.join();
    return a;
  }
  inline int test_3()
  {
    std::atomic<int> a{ 0 };
    std::thread t1([&] { work(a); });
    std::thread t2([&] { work(a); });
    std::thread t3([&] { work(a); });
    t1.join(); t2.join(); t3.join();
    return a;
  }
  inline int test_4()
  {
    std::atomic<int> a{ 0 };
    std::thread t1([&] { work(a); });
    std::thread t2([&] { work(a); });
    std::thread t3([&] { work(a); });
    std::thread t4([&] { work(a); });
    t1.join(); t2.join(); t3.join(); t4.join();
    return a;
  }
  inline int test_1_sentinel()
  {
    std::atomic<int> a{ 0 };
    std::thread t1([&] { work_with_sentinel(a); });
    t1.join();
    return a;
  }
  inline int test_2_sentinel()
  {
    std::atomic<int> a{ 0 };
    std::thread t1([&] { work_with_sentinel(a); });
    std::thread t2([&] { work_with_sentinel(a); });
    t1.join(); t2.join();
    return a;
  }
  inline int test_3_sentinel()
  {
    std::atomic<int> a{ 0 };
    std::thread t1([&] { work_with_sentinel(a); });
    std::thread t2([&] { work_with_sentinel(a); });
    std::thread t3([&] { work_with_sentinel(a); });
    t1.join(); t2.join(); t3.join();
    return a;
  }
  inline int test_4_sentinel()
  {
    std::atomic<int> a{ 0 };
    std::thread t1([&] { work_with_sentinel(a); });
    std::thread t2([&] { work_with_sentinel(a); });
    std::thread t3([&] { work_with_sentinel(a); });
    std::thread t4([&] { work_with_sentinel(a); });
    t1.join(); t2.join(); t3.join(); t4.join();
    return a;
  }
} // namespace sharing_single_atomic
static void test_sharing_single_atomic(benchmark::State& state, int test_num)
{
  using namespace sharing_single_atomic;
  int (*fun)() = nullptr;
  switch (test_num)
  {
  case 1: fun = test_1; break;
  case 2: fun = test_2; break;
  case 3: fun = test_3; break;
  case 4: fun = test_4; break;
  case 5: fun = test_1_sentinel; break;
  case 6: fun = test_2_sentinel; break;
  case 7: fun = test_3_sentinel; break;
  case 8: fun = test_4_sentinel; break;
  default:
    std::cerr << "undefined test number given!" << std::endl;
    std::terminate();
  }
  for (auto _ : state) {
    benchmark::DoNotOptimize(fun());
  }
}
BENCHMARK_CAPTURE(test_sharing_single_atomic, one_threaded, 1)->UseRealTime()->Unit(benchmark::kMillisecond);
BENCHMARK_CAPTURE(test_sharing_single_atomic, two_threaded, 2)->UseRealTime()->Unit(benchmark::kMillisecond);
BENCHMARK_CAPTURE(test_sharing_single_atomic, three_threaded, 3)->UseRealTime()->Unit(benchmark::kMillisecond);
BENCHMARK_CAPTURE(test_sharing_single_atomic, four_threaded, 4)->UseRealTime()->Unit(benchmark::kMillisecond);
BENCHMARK_CAPTURE(test_sharing_single_atomic, one_threaded_with_sentinels, 5)->UseRealTime()->Unit(benchmark::kMillisecond);
BENCHMARK_CAPTURE(test_sharing_single_atomic, two_threaded_with_sentinels, 6)->UseRealTime()->Unit(benchmark::kMillisecond);
BENCHMARK_CAPTURE(test_sharing_single_atomic, three_threaded_with_sentinels, 7)->UseRealTime()->Unit(benchmark::kMillisecond);
BENCHMARK_CAPTURE(test_sharing_single_atomic, four_threaded_with_sentinels, 8)->UseRealTime()->Unit(benchmark::kMillisecond);
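// All threads in the plain variants contend on one atomic, so its cache line
// ping-pongs between cores and wall time is expected to grow with the thread
// count; the sentinel variants touch the shared atomic once per thread and
// should stay nearly flat.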
namespace sharing_atomics_in_one_cache_line
{
  // each thread gets its own atomic, but the atomics are adjacent on the
  // stack (see the addresses noted below), so they share one cache line:
  // false sharing makes this nearly as slow as true sharing
  inline int test_2()
  {
    std::atomic<int> a{ 0 };
    std::atomic<int> b{ 0 };
    std::thread t1([&] { work(a); });
    std::thread t2([&] { work(b); });
    t1.join(); t2.join();
    return a + b;
  }
  inline int test_3()
  {
    std::atomic<int> a{ 0 };
    std::atomic<int> b{ 0 };
    std::atomic<int> c{ 0 };
    std::thread t1([&] { work(a); });
    std::thread t2([&] { work(b); });
    std::thread t3([&] { work(c); });
    t1.join(); t2.join(); t3.join();
    return a + b + c;
  }
  inline int test_4()
  {
    std::atomic<int> a{ 0 }; // &a: 0x...b2f7c0
    std::atomic<int> b{ 0 }; // &b: 0x...b2f7c4
    std::atomic<int> c{ 0 }; // &c: 0x...b2f7c8
    std::atomic<int> d{ 0 }; // &d: 0x...b2f7cc
    std::thread t1([&] { work(a); });
    std::thread t2([&] { work(b); });
    std::thread t3([&] { work(c); });
    std::thread t4([&] { work(d); });
    t1.join(); t2.join(); t3.join(); t4.join();
    return a + b + c + d;
  }
} // namespace sharing_atomics_in_one_cache_line
static void test_sharing_atomics_in_one_cache_line(benchmark::State& state, int test_num)
{
  using namespace sharing_atomics_in_one_cache_line;
  int (*fun)() = nullptr;
  switch (test_num)
  {
  case 2: fun = test_2; break;
  case 3: fun = test_3; break;
  case 4: fun = test_4; break;
  default:
    std::cerr << "undefined test number given!" << std::endl;
    std::terminate();
  }
  for (auto _ : state) {
    benchmark::DoNotOptimize(fun());
  }
}
BENCHMARK_CAPTURE(test_sharing_atomics_in_one_cache_line, two_threaded, 2)->UseRealTime()->Unit(benchmark::kMillisecond);
BENCHMARK_CAPTURE(test_sharing_atomics_in_one_cache_line, three_threaded, 3)->UseRealTime()->Unit(benchmark::kMillisecond);
BENCHMARK_CAPTURE(test_sharing_atomics_in_one_cache_line, four_threaded, 4)->UseRealTime()->Unit(benchmark::kMillisecond);
namespace false_sharing_resolved
{
  // aligned type with the same size as a cache line
  struct alignas(64) aligned_type
  {
    std::atomic<int> val;
  };
  static_assert(sizeof(aligned_type) == 64, "structure alignment must be same as cache line");
  inline int test_1()
  {
    aligned_type a; a.val = 0; // &a: 0x...4ff240
    std::thread t1([&a] { work(a.val); });
    t1.join();
    return a.val;
  }
  inline int test_2()
  {
    aligned_type a; a.val = 0; // &a: 0x...4ff240
    aligned_type b; b.val = 0; // &b: 0x...4ff280
    std::thread t1([&a] { work(a.val); });
    std::thread t2([&b] { work(b.val); });
    t1.join(); t2.join();
    return a.val + b.val;
  }
  inline int test_3()
  {
    aligned_type a; a.val = 0; // &a: 0x...4ff240
    aligned_type b; b.val = 0; // &b: 0x...4ff280
    aligned_type c; c.val = 0; // &c: 0x...4ff2c0
    std::thread t1([&a] { work(a.val); });
    std::thread t2([&b] { work(b.val); });
    std::thread t3([&c] { work(c.val); });
    t1.join(); t2.join(); t3.join();
    return a.val + b.val + c.val;
  }
  inline int test_4()
  {
    aligned_type a; a.val = 0; // &a: 0x...4ff240
    aligned_type b; b.val = 0; // &b: 0x...4ff280
    aligned_type c; c.val = 0; // &c: 0x...4ff2c0
    aligned_type d; d.val = 0; // &d: 0x...4ff300
    std::thread t1([&a] { work(a.val); });
    std::thread t2([&b] { work(b.val); });
    std::thread t3([&c] { work(c.val); });
    std::thread t4([&d] { work(d.val); });
    t1.join(); t2.join(); t3.join(); t4.join();
    return a.val + b.val + c.val + d.val;
  }
} // namespace false_sharing_resolved
static void test_false_sharing_resolved(benchmark::State& state, int test_num)
{
  using namespace false_sharing_resolved;
  int (*fun)() = nullptr;
  switch (test_num)
  {
  case 1: fun = test_1; break;
  case 2: fun = test_2; break;
  case 3: fun = test_3; break;
  case 4: fun = test_4; break;
  default:
    std::cerr << "undefined test number given!" << std::endl;
    std::terminate();
  }
  for (auto _ : state) {
    benchmark::DoNotOptimize(fun());
  }
}
BENCHMARK_CAPTURE(test_false_sharing_resolved, one_threaded, 1)->UseRealTime()->Unit(benchmark::kMillisecond);
BENCHMARK_CAPTURE(test_false_sharing_resolved, two_threaded, 2)->UseRealTime()->Unit(benchmark::kMillisecond);
BENCHMARK_CAPTURE(test_false_sharing_resolved, three_threaded, 3)->UseRealTime()->Unit(benchmark::kMillisecond);
BENCHMARK_CAPTURE(test_false_sharing_resolved, four_threaded, 4)->UseRealTime()->Unit(benchmark::kMillisecond);
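// --- added sketch (not part of the original gist) ---------------------------
// Hard-coding 64 matches most x86 parts, but C++17 exposes a portable constant,
// std::hardware_destructive_interference_size (from <new>), behind the
// __cpp_lib_hardware_interference_size feature-test macro; a variant of
// aligned_type could be written against it like this.
#include <new>
#ifdef __cpp_lib_hardware_interference_size
struct alignas(std::hardware_destructive_interference_size) padded_atomic
{
  std::atomic<int> val;
};
#endif
// The gist registers benchmarks but shows no main; it presumably links against
// google/benchmark's benchmark_main. If not, exactly one translation unit
// would need:
// BENCHMARK_MAIN();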