Cache-friendly programming best practices, with google/benchmark results
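The snippets below are several benchmark translation units concatenated into one gist; each repeated #include block marks a file boundary. Building them is assumed to require C++17 (for <execution> and alignas) and linking against google/benchmark.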
#include <benchmark/benchmark.h>
#include <iostream>
#include <random>
#include <algorithm>
#include <vector>
#include <chrono>
#include <iterator>
#include <execution>
#include <cstdlib> // rand
static void count_if_random(benchmark::State& state)
{
  std::vector<int> v(65536);
  // unpredictable input: every element is 1 or -1 at random
  std::generate(std::begin(v), std::end(v), [] {
    return (rand() % 2) ? 1 : -1;
  });
  for (auto _ : state) {
    auto count = std::count_if(
      std::execution::seq,
      std::begin(v), std::end(v),
      [](auto x) {
        return x > 0;
      });
    benchmark::DoNotOptimize(count);
    benchmark::ClobberMemory();
  }
}
BENCHMARK(count_if_random)->Unit(benchmark::kMicrosecond);
static void count_if_sequenced(benchmark::State& state)
{
  std::vector<int> v(65536);
  // perfectly predictable input: elements alternate between 1 and -1
  std::generate(std::begin(v), std::end(v), [n = 0]() mutable {
    return (++n % 2) ? 1 : -1;
  });
  for (auto _ : state) {
    auto count = std::count_if(
      std::execution::seq,
      std::begin(v), std::end(v),
      [](auto x) {
        return x > 0;
      });
    benchmark::DoNotOptimize(count);
    benchmark::ClobberMemory();
  }
}
BENCHMARK(count_if_sequenced)->Unit(benchmark::kMicrosecond);
static void count_if_sorted(benchmark::State& state)
{
  std::vector<int> v(65536);
  std::generate(std::begin(v), std::end(v), [] {
    return (rand() % 2) ? 1 : -1;
  });
  // sorting groups all negatives before all positives, so the x > 0 branch
  // flips only once and the branch predictor almost never misses
  std::sort(v.begin(), v.end());
  for (auto _ : state) {
    auto count = std::count_if(
      std::execution::seq,
      std::begin(v), std::end(v),
      [](auto x) {
        return x > 0;
      });
    benchmark::DoNotOptimize(count);
    benchmark::ClobberMemory();
  }
}
BENCHMARK(count_if_sorted)->Unit(benchmark::kMicrosecond);
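// --- added sketch (not part of the original gist) ---------------------------
// A branchless way to get the same count: summing the 0/1 result of the
// comparison removes the data-dependent branch entirely, so random and sorted
// input should time about the same. The name count_if_branchless is ours.
static void count_if_branchless(benchmark::State& state)
{
  std::vector<int> v(65536);
  std::generate(std::begin(v), std::end(v), [] {
    return (rand() % 2) ? 1 : -1;
  });
  for (auto _ : state) {
    long count = 0;
    for (int x : v)
      count += (x > 0); // the comparison yields 0 or 1; no branch needed
    benchmark::DoNotOptimize(count);
    benchmark::ClobberMemory();
  }
}
BENCHMARK(count_if_branchless)->Unit(benchmark::kMicrosecond);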
namespace branch_prediction
{
  struct price
  {
    virtual ~price() {}
    virtual float getPrice() const noexcept { return 1.0f; }
  };
  struct cheap : public price
  {
    float getPrice() const noexcept override { return 2.0f; }
  };
  struct expensive : public price
  {
    float getPrice() const noexcept override { return 3.14159f; }
  };
} // namespace branch_prediction
static void virtual_call_sequenced(benchmark::State& state)
{
  using namespace branch_prediction;
  std::vector<price*> pricelist;
  // note: each fill_n copies one pointer 10000 times (a single allocation per
  // type); what matters here is that same-type entries sit back to back, so
  // the indirect call target changes only twice over the whole list
  std::fill_n(std::back_inserter(pricelist), 10000, new price);
  std::fill_n(std::back_inserter(pricelist), 10000, new cheap);
  std::fill_n(std::back_inserter(pricelist), 10000, new expensive);
  for (auto _ : state) {
    float sum = 0;
    for (auto* p : pricelist) {
      benchmark::DoNotOptimize(sum += p->getPrice());
    }
    benchmark::ClobberMemory();
  }
}
BENCHMARK(virtual_call_sequenced)->Unit(benchmark::kMicrosecond);
static void virtual_call_shuffled(benchmark::State& state)
{
  using namespace branch_prediction;
  std::vector<price*> pricelist;
  std::random_device rd;
  std::mt19937 g{ rd() };
  std::fill_n(std::back_inserter(pricelist), 10000, new price);
  std::fill_n(std::back_inserter(pricelist), 10000, new cheap);
  std::fill_n(std::back_inserter(pricelist), 10000, new expensive);
  // shuffling randomizes the dynamic-type sequence, so the indirect-branch
  // predictor can no longer guess the next call target
  std::shuffle(pricelist.begin(), pricelist.end(), g);
  for (auto _ : state) {
    float sum = 0;
    for (auto* p : pricelist) {
      benchmark::DoNotOptimize(sum += p->getPrice());
    }
    benchmark::ClobberMemory();
  }
}
BENCHMARK(virtual_call_shuffled)->Unit(benchmark::kMicrosecond);
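// --- added sketch (not part of the original gist) ---------------------------
// If shuffled storage is a given, re-grouping the pointers by dynamic type
// before the hot loop should restore the sequenced timing. std::type_info::
// before gives a consistent ordering over types; the benchmark name is ours.
#include <typeinfo>
static void virtual_call_regrouped(benchmark::State& state)
{
  using namespace branch_prediction;
  std::vector<price*> pricelist;
  std::random_device rd;
  std::mt19937 g{ rd() };
  std::fill_n(std::back_inserter(pricelist), 10000, new price);
  std::fill_n(std::back_inserter(pricelist), 10000, new cheap);
  std::fill_n(std::back_inserter(pricelist), 10000, new expensive);
  std::shuffle(pricelist.begin(), pricelist.end(), g);
  // undo the shuffle's damage: group same-type objects next to each other
  std::sort(pricelist.begin(), pricelist.end(),
            [](const price* lhs, const price* rhs) {
              return typeid(*lhs).before(typeid(*rhs));
            });
  for (auto _ : state) {
    float sum = 0;
    for (auto* p : pricelist) {
      benchmark::DoNotOptimize(sum += p->getPrice());
    }
    benchmark::ClobberMemory();
  }
}
BENCHMARK(virtual_call_regrouped)->Unit(benchmark::kMicrosecond);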
#include <limits>
#include <benchmark/benchmark.h>
#include <type_traits>
#include <chrono>
#include <random>
#include <map>
#include <string>
#include <sstream>
#include <iostream>
#include <iomanip>
#include <execution>
#include <algorithm> // std::generate
#include <cassert>   // assert
#include <cmath>     // sqrt
#include <cstdint>   // uint8_t
#include <cstdlib>   // rand, srand
#include <ctime>     // time
namespace cpu_cache_misses {
  enum class traversal_order {
    row_major,
    column_major
  };
  template <unsigned int size>
  struct random_map
  {
    unsigned int map[size] = { 0 };
    random_map() {
      srand(static_cast<unsigned int>(time(NULL)));
      // rand() is not guaranteed to be thread safe, so fill sequentially
      // rather than with std::execution::par
      std::generate(std::begin(map), std::end(map), [] { return std::rand() % size; });
    }
    unsigned int operator()() const {
      return map[++indx % size];
    }
  private:
    mutable unsigned long long indx = 0;
  };
  const static random_map<8 << 10> random_numbers;
  template <class T>
  class matrix {
  public:
    using type_t = typename std::decay<T>::type;
    using msize_t = unsigned long long;
    explicit matrix(msize_t matrix_size)
    {
      reset(matrix_size);
    }
    ~matrix()
    {
      if (data_ != nullptr) release();
    }
    void reset(msize_t matrix_size)
    {
      if (data_ != nullptr) release();
      matrix_dim = static_cast<msize_t>(sqrt(matrix_size));
      // round the dimension up to an even number
      const auto dim_size = matrix_dim % 2 == 0 ? matrix_dim : matrix_dim + 1;
      row_size = dim_size;
      column_size = dim_size;
      // check that the matrix is square
      assert(row_size == column_size);
      data_ = new type_t*[row_size];
      for (msize_t i = 0; i < row_size; i++)
      {
        data_[i] = new type_t[column_size];
        // set random initial data
        for (msize_t j = 0; j < column_size; j++)
        {
          data_[i][j] = static_cast<type_t>(random_numbers());
        }
      }
    }
    msize_t size() const {
      return matrix_dim * matrix_dim;
    }
    msize_t rows() const {
      return row_size;
    }
    msize_t columns() const {
      return column_size;
    }
    type_t* operator[](msize_t index) {
      return data_[index];
    }
    type_t* operator[](msize_t index) const {
      return data_[index];
    }
  protected:
    void release() {
      // delete the allocated rows, then the row-pointer array itself
      for (msize_t i = 0; i < row_size; i++)
      {
        delete[] data_[i];
      }
      delete[] data_;
      data_ = nullptr;
    }
  private:
    type_t** data_ = nullptr;
    msize_t row_size{ 0 };
    msize_t column_size{ 0 };
    msize_t matrix_dim{ 0 };
  };
} // namespace cpu_cache_misses
using namespace cpu_cache_misses;
static void sum_row_major(benchmark::State& state)
{
  using type = matrix<uint8_t>::msize_t;
  const type matrix_size = (1024 * 1024) * state.range(0);
  static matrix<uint8_t> mat(matrix_size);
  if (matrix_size != mat.size()) {
    mat.reset(matrix_size);
  }
  for (auto _ : state) {
    type sum = 0;
    // row-major traversal walks memory contiguously: every byte of a fetched
    // cache line is used before the line is evicted
    for (type r = 0; r < mat.rows(); ++r) {
      for (type c = 0; c < mat.columns(); ++c) {
        benchmark::DoNotOptimize(sum += mat[r][c]);
      }
    }
    benchmark::ClobberMemory();
  }
}
BENCHMARK(sum_row_major)->DenseRange(1, 128, 2)->Unit(benchmark::kMillisecond);
BENCHMARK(sum_row_major)->DenseRange(128, 1024, 128)->Unit(benchmark::kMillisecond);
static void sum_column_major(benchmark::State& state)
{
  using type = matrix<uint8_t>::msize_t;
  const type matrix_size = (1024 * 1024) * state.range(0);
  static matrix<uint8_t> mat(matrix_size);
  if (matrix_size != mat.size()) {
    mat.reset(matrix_size);
  }
  for (auto _ : state) {
    type sum = 0;
    // column-major traversal jumps a whole row ahead on every access, so each
    // load touches a different cache line, and lines are evicted before their
    // remaining bytes are used
    for (type c = 0; c < mat.columns(); ++c) {
      for (type r = 0; r < mat.rows(); ++r) {
        benchmark::DoNotOptimize(sum += mat[r][c]);
      }
    }
    benchmark::ClobberMemory();
  }
}
BENCHMARK(sum_column_major)->DenseRange(1, 128, 2)->Unit(benchmark::kMillisecond);
BENCHMARK(sum_column_major)->DenseRange(128, 1024, 128)->Unit(benchmark::kMillisecond);
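// --- added sketch (not part of the original gist) ---------------------------
// The matrix above stores each row as a separate heap allocation, which costs
// a pointer chase per row and lets rows land far apart in memory. A single
// contiguous buffer, sketched here under the same row-major convention,
// removes both issues; flat_matrix is our name, not part of the gist.
#include <vector>
#include <cstddef>
template <class T>
class flat_matrix {
public:
  explicit flat_matrix(std::size_t dim) : dim_(dim), data_(dim * dim) {}
  // element (r, c) lives at offset r * dim_ + c: rows are adjacent in memory
  T& at(std::size_t r, std::size_t c) { return data_[r * dim_ + c]; }
  const T& at(std::size_t r, std::size_t c) const { return data_[r * dim_ + c]; }
  std::size_t dim() const { return dim_; }
private:
  std::size_t dim_;
  std::vector<T> data_; // one allocation, row-major
};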
#include <benchmark/benchmark.h>
#include <atomic>
#include <thread>
#include <iostream>
#include <random>
#include <algorithm>
#include <vector>
#include <chrono>
template <typename input_t>
void work(input_t& a)
{
  for (int i = 0; i < 10000000; ++i)
    a++;
}
// same total work, but the hot loop increments a thread_local counter and the
// shared atomic is touched only once per thread, at the very end
void work_with_sentinel(std::atomic<int>& a)
{
  thread_local unsigned int sentinel = 0;
  for (int i = 0; i < 10000000; ++i)
    ++sentinel;
  a += sentinel;
}
namespace sharing_single_atomic
{
  inline int test_1()
  {
    std::atomic<int> a{ 0 };
    std::thread t1([&] { work(a); });
    t1.join();
    return a;
  }
  inline int test_2()
  {
    std::atomic<int> a{ 0 };
    std::thread t1([&] { work(a); });
    std::thread t2([&] { work(a); });
    t1.join(); t2.join();
    return a;
  }
  inline int test_3()
  {
    std::atomic<int> a{ 0 };
    std::thread t1([&] { work(a); });
    std::thread t2([&] { work(a); });
    std::thread t3([&] { work(a); });
    t1.join(); t2.join(); t3.join();
    return a;
  }
  inline int test_4()
  {
    std::atomic<int> a{ 0 };
    std::thread t1([&] { work(a); });
    std::thread t2([&] { work(a); });
    std::thread t3([&] { work(a); });
    std::thread t4([&] { work(a); });
    t1.join(); t2.join(); t3.join(); t4.join();
    return a;
  }
  inline int test_1_sentinel()
  {
    std::atomic<int> a{ 0 };
    std::thread t1([&] { work_with_sentinel(a); });
    t1.join();
    return a;
  }
  inline int test_2_sentinel()
  {
    std::atomic<int> a{ 0 };
    std::thread t1([&] { work_with_sentinel(a); });
    std::thread t2([&] { work_with_sentinel(a); });
    t1.join(); t2.join();
    return a;
  }
  inline int test_3_sentinel()
  {
    std::atomic<int> a{ 0 };
    std::thread t1([&] { work_with_sentinel(a); });
    std::thread t2([&] { work_with_sentinel(a); });
    std::thread t3([&] { work_with_sentinel(a); });
    t1.join(); t2.join(); t3.join();
    return a;
  }
  inline int test_4_sentinel()
  {
    std::atomic<int> a{ 0 };
    std::thread t1([&] { work_with_sentinel(a); });
    std::thread t2([&] { work_with_sentinel(a); });
    std::thread t3([&] { work_with_sentinel(a); });
    std::thread t4([&] { work_with_sentinel(a); });
    t1.join(); t2.join(); t3.join(); t4.join();
    return a;
  }
} // namespace sharing_single_atomic
static void test_sharing_single_atomic(benchmark::State& state, int test_num)
{
  using namespace sharing_single_atomic;
  int (*fun)() = nullptr;
  switch (test_num)
  {
  case 1: fun = test_1; break;
  case 2: fun = test_2; break;
  case 3: fun = test_3; break;
  case 4: fun = test_4; break;
  case 5: fun = test_1_sentinel; break;
  case 6: fun = test_2_sentinel; break;
  case 7: fun = test_3_sentinel; break;
  case 8: fun = test_4_sentinel; break;
  default:
    std::cerr << "undefined test number given!" << std::endl;
    std::terminate();
  }
  for (auto _ : state) {
    benchmark::DoNotOptimize(fun());
  }
}
BENCHMARK_CAPTURE(test_sharing_single_atomic, one_threaded, 1)->UseRealTime()->Unit(benchmark::kMillisecond);
BENCHMARK_CAPTURE(test_sharing_single_atomic, two_threaded, 2)->UseRealTime()->Unit(benchmark::kMillisecond);
BENCHMARK_CAPTURE(test_sharing_single_atomic, three_threaded, 3)->UseRealTime()->Unit(benchmark::kMillisecond);
BENCHMARK_CAPTURE(test_sharing_single_atomic, four_threaded, 4)->UseRealTime()->Unit(benchmark::kMillisecond);
BENCHMARK_CAPTURE(test_sharing_single_atomic, one_threaded_with_sentinels, 5)->UseRealTime()->Unit(benchmark::kMillisecond);
BENCHMARK_CAPTURE(test_sharing_single_atomic, two_threaded_with_sentinels, 6)->UseRealTime()->Unit(benchmark::kMillisecond);
BENCHMARK_CAPTURE(test_sharing_single_atomic, three_threaded_with_sentinels, 7)->UseRealTime()->Unit(benchmark::kMillisecond);
BENCHMARK_CAPTURE(test_sharing_single_atomic, four_threaded_with_sentinels, 8)->UseRealTime()->Unit(benchmark::kMillisecond);
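// All threads in the plain variants contend on one atomic, so its cache line
// ping-pongs between cores and wall time is expected to grow with the thread
// count; the sentinel variants touch the shared atomic once per thread and
// should stay nearly flat.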
namespace sharing_atomics_in_one_cache_line
{
  // each thread gets its own atomic, but the atomics are adjacent on the
  // stack (see the addresses noted below), so they share one cache line:
  // false sharing makes this nearly as slow as true sharing
  inline int test_2()
  {
    std::atomic<int> a{ 0 };
    std::atomic<int> b{ 0 };
    std::thread t1([&] { work(a); });
    std::thread t2([&] { work(b); });
    t1.join(); t2.join();
    return a + b;
  }
  inline int test_3()
  {
    std::atomic<int> a{ 0 };
    std::atomic<int> b{ 0 };
    std::atomic<int> c{ 0 };
    std::thread t1([&] { work(a); });
    std::thread t2([&] { work(b); });
    std::thread t3([&] { work(c); });
    t1.join(); t2.join(); t3.join();
    return a + b + c;
  }
  inline int test_4()
  {
    std::atomic<int> a{ 0 }; // &a: 0x...b2f7c0
    std::atomic<int> b{ 0 }; // &b: 0x...b2f7c4
    std::atomic<int> c{ 0 }; // &c: 0x...b2f7c8
    std::atomic<int> d{ 0 }; // &d: 0x...b2f7cc
    std::thread t1([&] { work(a); });
    std::thread t2([&] { work(b); });
    std::thread t3([&] { work(c); });
    std::thread t4([&] { work(d); });
    t1.join(); t2.join(); t3.join(); t4.join();
    return a + b + c + d;
  }
} // namespace sharing_atomics_in_one_cache_line
static void test_sharing_atomics_in_one_cache_line(benchmark::State& state, int test_num)
{
  using namespace sharing_atomics_in_one_cache_line;
  int (*fun)() = nullptr;
  switch (test_num)
  {
  case 2: fun = test_2; break;
  case 3: fun = test_3; break;
  case 4: fun = test_4; break;
  default:
    std::cerr << "undefined test number given!" << std::endl;
    std::terminate();
  }
  for (auto _ : state) {
    benchmark::DoNotOptimize(fun());
  }
}
BENCHMARK_CAPTURE(test_sharing_atomics_in_one_cache_line, two_threaded, 2)->UseRealTime()->Unit(benchmark::kMillisecond);
BENCHMARK_CAPTURE(test_sharing_atomics_in_one_cache_line, three_threaded, 3)->UseRealTime()->Unit(benchmark::kMillisecond);
BENCHMARK_CAPTURE(test_sharing_atomics_in_one_cache_line, four_threaded, 4)->UseRealTime()->Unit(benchmark::kMillisecond);
namespace false_sharing_resolved
{
  // aligned type with the same size as a cache line
  struct alignas(64) aligned_type
  {
    std::atomic<int> val;
  };
  static_assert(sizeof(aligned_type) == 64, "structure alignment must be same as cache line");
  inline int test_1()
  {
    aligned_type a; a.val = 0; // &a: 0x...4ff240
    std::thread t1([&a] { work(a.val); });
    t1.join();
    return a.val;
  }
  inline int test_2()
  {
    aligned_type a; a.val = 0; // &a: 0x...4ff240
    aligned_type b; b.val = 0; // &b: 0x...4ff280
    std::thread t1([&a] { work(a.val); });
    std::thread t2([&b] { work(b.val); });
    t1.join(); t2.join();
    return a.val + b.val;
  }
  inline int test_3()
  {
    aligned_type a; a.val = 0; // &a: 0x...4ff240
    aligned_type b; b.val = 0; // &b: 0x...4ff280
    aligned_type c; c.val = 0; // &c: 0x...4ff2c0
    std::thread t1([&a] { work(a.val); });
    std::thread t2([&b] { work(b.val); });
    std::thread t3([&c] { work(c.val); });
    t1.join(); t2.join(); t3.join();
    return a.val + b.val + c.val;
  }
  inline int test_4()
  {
    aligned_type a; a.val = 0; // &a: 0x...4ff240
    aligned_type b; b.val = 0; // &b: 0x...4ff280
    aligned_type c; c.val = 0; // &c: 0x...4ff2c0
    aligned_type d; d.val = 0; // &d: 0x...4ff300
    std::thread t1([&a] { work(a.val); });
    std::thread t2([&b] { work(b.val); });
    std::thread t3([&c] { work(c.val); });
    std::thread t4([&d] { work(d.val); });
    t1.join(); t2.join(); t3.join(); t4.join();
    return a.val + b.val + c.val + d.val;
  }
} // namespace false_sharing_resolved
static void test_false_sharing_resolved(benchmark::State& state, int test_num)
{
  using namespace false_sharing_resolved;
  int (*fun)() = nullptr;
  switch (test_num)
  {
  case 1: fun = test_1; break;
  case 2: fun = test_2; break;
  case 3: fun = test_3; break;
  case 4: fun = test_4; break;
  default:
    std::cerr << "undefined test number given!" << std::endl;
    std::terminate();
  }
  for (auto _ : state) {
    benchmark::DoNotOptimize(fun());
  }
}
BENCHMARK_CAPTURE(test_false_sharing_resolved, one_threaded, 1)->UseRealTime()->Unit(benchmark::kMillisecond);
BENCHMARK_CAPTURE(test_false_sharing_resolved, two_threaded, 2)->UseRealTime()->Unit(benchmark::kMillisecond);
BENCHMARK_CAPTURE(test_false_sharing_resolved, three_threaded, 3)->UseRealTime()->Unit(benchmark::kMillisecond);
BENCHMARK_CAPTURE(test_false_sharing_resolved, four_threaded, 4)->UseRealTime()->Unit(benchmark::kMillisecond);
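// --- added sketch (not part of the original gist) ---------------------------
// Hard-coding 64 matches most x86 parts, but C++17 exposes a portable constant,
// std::hardware_destructive_interference_size (from <new>), behind the
// __cpp_lib_hardware_interference_size feature-test macro; a variant of
// aligned_type could be written against it like this.
#include <new>
#ifdef __cpp_lib_hardware_interference_size
struct alignas(std::hardware_destructive_interference_size) padded_atomic
{
  std::atomic<int> val;
};
#endif
// The gist registers benchmarks but shows no main; it presumably links against
// google/benchmark's benchmark_main. If not, exactly one translation unit
// would need:
// BENCHMARK_MAIN();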