Skip to content

Instantly share code, notes, and snippets.

@LucHermitte
Last active August 31, 2020 16:18
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save LucHermitte/3934a9d436cbf513866f8aa2eb329fb8 to your computer and use it in GitHub Desktop.
Save LucHermitte/3934a9d436cbf513866f8aa2eb329fb8 to your computer and use it in GitHub Desktop.
// Code inspired by Chandler Carruth's presentation at CppCon17:
// "Going Nowhere Faster"
// Written from scratch by Luc Hermitte, 2018-2020
// Licence: Boost Software Licence
#ifndef RNG_H
#define RNG_H
#include <random>
#include <cassert>
#include <limits>
template <typename T>
class RNG
{
public:
explicit RNG(T min_, T max_, int count_)
: m_device()
, m_engine(m_device())
, m_distribution(min_, max_)
, m_count(count_)
{}
explicit RNG(int count)
: RNG(std::numeric_limits<T>::min(), std::numeric_limits<T>::max(), count)
{}
struct iterator {
typedef std::forward_iterator_tag iterator_category;
typedef std::ptrdiff_t difference_type;
typedef T value_type;
typedef T* pointer;
typedef T& reference;
explicit iterator(RNG const& rng_, int count_) : rng(rng_), count(count_){}
T operator*() const { return rng.generate(); }
friend bool operator==(iterator const& lhs, iterator const& rhs) {
assert(&lhs.rng == &rhs.rng);
return lhs.count == rhs.count;
}
friend bool operator!=(iterator const& lhs, iterator const& rhs) {
return !(lhs == rhs);
}
iterator& operator++() { --count; return *this; }
iterator operator++(int) { iterator tmp; ++(*this); return tmp; }
private:
RNG const& rng;
int count;
};
iterator begin() const { return iterator{*this, m_count}; }
iterator end () const { return iterator{*this, 0}; }
T generate() const { return m_distribution(m_engine); }
private:
std::random_device mutable m_device;
std::default_random_engine mutable m_engine;
std::uniform_int_distribution<T> mutable m_distribution;
int m_count;
};
#endif // RNG_H
// Luc Hermitte, 2020
// Licence: BSL
// Compile with:
// PKG_CONFIG_PATH="/usr/local/4clang/lib/pkgconfig:${PKG_CONFIG_PATH}" LDFLAGS="$(pkg-config --libs-only-L benchmark)" CXX='clang++' CXXFLAGS="-O3 -DNDEBUG -march=native -stdlib=libc++ -std=c++2a $(pkg-config --cflags-only-I benchmark) -DLEVEL1_DCACHE_LINESIZE=$(getconf LEVEL1_DCACHE_LINESIZE)" make type-iter
#include "RNG.hpp"
#include <benchmark/benchmark.h>
#include <boost/align/aligned_allocator.hpp>
#include <vector>
#include <span>
#if defined(__arm__)
constexpr bool isArm = true;
#else
constexpr bool isArm = false;
#endif
constexpr std::size_t hardware_destructive_interference_size = isArm ? 64 : 128;
// static_assert(hardware_destructive_interference_size >= max_align_v, "math?");
#if defined(LEVEL1_DCACHE_LINESIZE)
constexpr std::size_t hardware_constructive_interference_size = LEVEL1_DCACHE_LINESIZE;
#else
constexpr std::size_t hardware_constructive_interference_size = 64;
#endif
template <typename T, std::size_t alignment = hardware_constructive_interference_size>
using aligned_vector
= std::vector<T, boost::alignment::aligned_allocator<T, alignment>>;
template <typename IndexType>
int sum_with_size_in_loop(std::span<int const> s)
{
int r = 0;
for (IndexType i = 0; i != s.size(); ++i)
r += s[i];
return r;
}
template <typename IndexType>
int sum_with_cached_size(std::span<int const> s)
{
int r = 0;
IndexType const N = s.size();
for (IndexType i = 0; i != N; ++i)
r += s[i];
return r;
}
int sum_for_range(std::span<const int> s)
{
int r = 0;
for (auto e : s)
r += e;
return r;
}
int sum_accumulate(std::span<const int> s)
{
return std::accumulate(s.begin(), s.end(), 0);
}
struct DontCacheSize{};
struct CacheSize{};
template <typename IndexType, typename Caching>
struct Sum { };
template <typename IndexType>
struct Sum<IndexType, DontCacheSize>
{
static int compute(std::span<int const> s) {
return sum_with_size_in_loop<IndexType>(s);
}
};
template <typename IndexType>
struct Sum<IndexType, CacheSize>
{
static int compute(std::span<int const> s) {
return sum_with_cached_size<IndexType>(s);
}
};
template <typename IndexType, typename Caching>
static void accu_bench(benchmark::State & s) {
int bytes = 1 << s.range(0);
// Compute how many elements we can fit in this many bytes
int count = bytes / sizeof(int);
aligned_vector<int> v; v.reserve(count);
for (auto i : RNG<int>(count)) {
v.push_back(i);
}
for (auto _ : s) {
int sum = Sum<IndexType, Caching>::compute(v);
benchmark::DoNotOptimize(sum);
benchmark::ClobberMemory();
}
s.SetBytesProcessed(long(s.iterations()) * long(bytes));
s.SetLabel(std::to_string(bytes / 1024) + "kb");
}
BENCHMARK_TEMPLATE(accu_bench, std::size_t, CacheSize) ->Arg(15)->Arg(18)->Arg(22)->ReportAggregatesOnly(true);
// BENCHMARK_TEMPLATE(accu_bench, std::size_t, DontCacheSize)->Arg(15)->Arg(18)->Arg(22)->ReportAggregatesOnly(true);
// BENCHMARK_TEMPLATE(accu_bench, std::ptrdiff_t, CacheSize) ->Arg(15)->Arg(18)->Arg(22)->ReportAggregatesOnly(true);
// BENCHMARK_TEMPLATE(accu_bench, std::ptrdiff_t, DontCacheSize)->Arg(15)->Arg(18)->Arg(22)->ReportAggregatesOnly(true);
// BENCHMARK_TEMPLATE(accu_bench, unsigned int, CacheSize) ->Arg(15)->Arg(18)->Arg(22)->ReportAggregatesOnly(true);
// BENCHMARK_TEMPLATE(accu_bench, unsigned int, DontCacheSize)->Arg(15)->Arg(18)->Arg(2)->ReportAggregatesOnly(true);
// BENCHMARK_TEMPLATE(accu_bench, int, CacheSize) ->Arg(15)->Arg(18)->Arg(22)->ReportAggregatesOnly(true);
BENCHMARK_TEMPLATE(accu_bench, int, DontCacheSize)->Arg(15)->Arg(18)->Arg(22)->ReportAggregatesOnly(true);
static void accu_bench_for_range(benchmark::State & s) {
int bytes = 1 << s.range(0);
// Compute how many elements we can fit in this many bytes
int count = bytes / sizeof(int);
aligned_vector<int> v; v.reserve(count);
for (auto i : RNG<int>(count)) {
v.push_back(i);
}
for (auto _ : s) {
int sum = sum_for_range(v);
benchmark::DoNotOptimize(sum);
benchmark::ClobberMemory();
}
s.SetBytesProcessed(long(s.iterations()) * long(bytes));
s.SetLabel(std::to_string(bytes / 1024) + "kb");
}
BENCHMARK(accu_bench_for_range)->Arg(15)->Arg(18)->Arg(22)->ReportAggregatesOnly(true);
static void accu_bench_accumulate(benchmark::State & s) {
int bytes = 1 << s.range(0);
// Compute how many elements we can fit in this many bytes
int count = bytes / sizeof(int);
aligned_vector<int> v; v.reserve(count);
for (auto i : RNG<int>(count)) {
v.push_back(i);
}
for (auto _ : s) {
int sum = sum_accumulate(v);
benchmark::DoNotOptimize(sum);
benchmark::ClobberMemory();
}
s.SetBytesProcessed(long(s.iterations()) * long(bytes));
s.SetLabel(std::to_string(bytes / 1024) + "kb");
}
BENCHMARK(accu_bench_accumulate)->Arg(15)->Arg(18)->Arg(22)->ReportAggregatesOnly(true);
BENCHMARK_MAIN();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment