-
-
Save forrestthewoods/4ee84b62c581956bebbe124cd1c26212 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <assert.h>
#include <immintrin.h>

#include <algorithm>
#include <array>
#include <atomic>
#include <chrono>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <fstream>
#include <future>
#include <iostream>
#include <memory>
#include <random>
#include <sstream>
#include <string>
#include <typeinfo>
#include <vector>

using namespace std::chrono;
using namespace std;
// Enable to track stuff.
// When true, the summation kernels additionally count the operations they
// perform and the bytes they touch; when false that bookkeeping is compiled out
// (every use is guarded by `if constexpr`).
constexpr bool COUNT_EXTRA = false;

// Typedefs
// Time stamp type used for the start/end marks of a measurement.
using TimePoint = std::chrono::steady_clock::time_point;
// Fast, dumb random number generator
// Via https://stackoverflow.com/questions/1640258/need-a-fast-random-generator-for-c
//
// Self-contained generator state. Every default-constructed instance starts from
// the same seed and therefore yields the same sequence.
struct rng {
    uint64_t x = 123456789;
    uint64_t y = 362436069;
    uint64_t z = 521288629;

    // Advances the state by one step and returns the new value of `z`.
    uint64_t gen() {
        x ^= x << 16;
        x ^= x >> 5;
        x ^= x << 1;

        // Rotate the three state words and mix the scrambled one back in.
        const uint64_t t = x;
        x = y;
        y = z;
        z = t ^ x ^ y;
        return z;
    }
};

// Single file-local generator backing static_rng(). It replaces the previous
// loose x/y/z globals, which duplicated rng's state and algorithm line for line.
static rng g_static_rng_state;

// Returns the next value of the shared, file-local generator. The state is a
// plain static object with no synchronization.
uint64_t static_rng(void) {
    return g_static_rng_state.gen();
}
// Helper to generate a small integer of type T | |
template<typename T> | |
T rng_small_num() { | |
constexpr T RNG_MIN = 0; | |
constexpr T RNG_MAX = 1024; | |
constexpr T RNG_DIFF = RNG_MAX - RNG_MIN; | |
return T(RNG_MIN + static_rng() % RNG_DIFF); | |
} | |
// Fill a span with random numbers | |
template<typename T> | |
void fill_rng(T* begin, T* end) { | |
for (auto ptr = begin; ptr < end; ++ptr) | |
*ptr = rng_small_num<T>(); | |
} | |
// Print utility.
// Formats a byte count using the largest power-of-1024 unit that does not exceed
// it. The value is truncated (not rounded) at every division, so 1536 prints as
// "1Kb". Magnitudes beyond "Tb" are labelled "unexpected".
std::string str_from_bytes(size_t num_bytes) {
    static constexpr const char* kSuffixes[] = { " bytes", "Kb", "Mb", "Gb", "Tb" };
    constexpr size_t kSuffixCount = sizeof(kSuffixes) / sizeof(kSuffixes[0]);

    // Count how many times the value can be divided by 1024.
    size_t magnitude = 0;
    while (num_bytes >= 1024) {
        num_bytes /= 1024;
        ++magnitude;
    }

    const char* suffix = magnitude < kSuffixCount ? kSuffixes[magnitude] : "unexpected";
    return std::to_string(num_bytes) + suffix;
}
// Portable spelling of "always inline": __forceinline is an MSVC keyword, so map
// it to the GCC/Clang attribute elsewhere and fall back to plain inline.
#if defined(_MSC_VER)
#define SLICE_FORCEINLINE __forceinline
#elif defined(__GNUC__) || defined(__clang__)
#define SLICE_FORCEINLINE inline __attribute__((always_inline))
#else
#define SLICE_FORCEINLINE inline
#endif

// A begin/end pointer pair with some utilities.
// The slice does not own the memory it refers to; it only stores two pointers.
// Constness is shallow: a const Slice<T> still hands out T*.
template<typename T>
struct Slice {
    T* _begin = nullptr;
    T* _end = nullptr;

    Slice() = default;

    // Aborts if the range is reversed (e before s).
    Slice(T* s, T* e) : _begin(s), _end(e) { if (_end < _begin) { std::abort(); } }
    Slice(Slice const& other) = default;

    // Intentionally implicit: call sites pass vectors where a Slice is expected.
    Slice(std::vector<T>& v) : _begin(v.data()), _end(v.data() + v.size()) {}

    // Returns the slice starting `offset` elements in and running to the end.
    // Aborts if `offset` is past the end.
    Slice subslice(size_t offset) const {
        if (offset > size()) { std::abort(); }
        return Slice(_begin + offset, _end);
    }

    // Returns `new_length` elements starting `offset` elements in.
    // Aborts if the requested window does not fit inside this slice.
    Slice subslice(size_t offset, size_t new_length) const {
        if (offset > size() || new_length > size() - offset) { std::abort(); }
        return Slice(_begin + offset, _begin + offset + new_length);
    }

    SLICE_FORCEINLINE T* begin() const { return _begin; }
    SLICE_FORCEINLINE T* end() const { return _end; }
    SLICE_FORCEINLINE size_t size() const { return _end - _begin; }
    SLICE_FORCEINLINE bool is_empty() const { return size() == 0; }

    // Unchecked element access.
    SLICE_FORCEINLINE T const & operator[](size_t index) const { return _begin[index]; }

    // Builds a slice covering an entire std::array.
    template<size_t N>
    static Slice<T> from_array(std::array<T, N>& arr) {
        return Slice<T>(arr.data(), arr.data() + arr.size());
    }
};
// Structure holding summation result and timing info | |
struct Result { | |
int64_t sum = 0; | |
int64_t bytes_touched = 0; | |
int64_t ops = 0; | |
TimePoint start; | |
TimePoint end; | |
Result() = default; | |
Result(int64_t _sum, int64_t _bytes_touched, int64_t _ops, TimePoint _start, TimePoint _end) | |
: sum(_sum), bytes_touched(_bytes_touched), ops(_ops), start(_start), end(_end) {} | |
Result& operator+=(Result const& other) { | |
sum += other.sum; | |
if constexpr (COUNT_EXTRA) { | |
bytes_touched += other.bytes_touched; | |
ops += other.ops; | |
} | |
return *this; | |
} | |
}; | |
// Structure holding 16 ints | |
struct matrix4x4 { | |
std::array<int, 16> nums; | |
static matrix4x4 gen_random() { | |
matrix4x4 result; | |
fill_rng(result.nums.data(), result.nums.data() + result.nums.size()); | |
return result; | |
} | |
}; | |
// Structure holding 16 ints with functions that operate in simd | |
struct matrix4x4_simd { | |
std::array<int, 16> nums; | |
static matrix4x4_simd gen_random() { | |
matrix4x4_simd result; | |
fill_rng(result.nums.data(), result.nums.data() + result.nums.size()); | |
return result; | |
} | |
}; | |
// Structure to hold a unique_ptr<matrix4x4> | |
struct unique_matrix4x4 { | |
std::unique_ptr<matrix4x4> ptr; | |
static unique_matrix4x4 gen_random() { | |
unique_matrix4x4 result; | |
result.ptr = std::make_unique<matrix4x4>(); | |
fill_rng(result.ptr->nums.data(), result.ptr->nums.data() + result.ptr->nums.size()); | |
return result; | |
} | |
}; | |
// Utilities to generate random elements | |
template<typename T> T generate_random_element() { return rng_small_num<T>(); } | |
template<> matrix4x4 generate_random_element() { return matrix4x4::gen_random(); } | |
template<> matrix4x4_simd generate_random_element() { return matrix4x4_simd::gen_random(); } | |
template<> unique_matrix4x4 generate_random_element() { return unique_matrix4x4::gen_random(); } | |
template<typename T> | |
Result sum(Slice<T> data) { | |
constexpr size_t element_size = sizeof(T); | |
Result result; | |
const auto begin = data.begin(); | |
const auto end = data.end(); | |
for (auto ptr = begin; ptr < end; ++ptr) { | |
result.sum += *ptr; | |
if constexpr (COUNT_EXTRA) { | |
result.ops += 1; | |
result.bytes_touched += element_size; | |
} | |
} | |
return result; | |
} | |
template<typename T> | |
Result sum(Slice<T> data, Slice<uint32_t> indices) { | |
constexpr size_t element_size = sizeof(T); | |
Result result; | |
for (uint32_t idx : indices) { | |
result.sum += data[idx]; | |
if constexpr (COUNT_EXTRA) { | |
result.ops += 1; | |
result.bytes_touched += element_size; | |
} | |
} | |
return result; | |
} | |
template<> | |
Result sum(Slice<matrix4x4> data) { | |
constexpr size_t element_size = sizeof(matrix4x4); | |
assert(element_size == 16 * sizeof(int)); | |
Result result; | |
const auto begin = data.begin(); | |
const auto end = data.end(); | |
for (auto ptr = begin; ptr < end; ++ptr) { | |
result.sum += ptr->nums[0]; result.sum += ptr->nums[1]; result.sum += ptr->nums[2]; result.sum += ptr->nums[3]; | |
result.sum += ptr->nums[4]; result.sum += ptr->nums[5]; result.sum += ptr->nums[6]; result.sum += ptr->nums[7]; | |
result.sum += ptr->nums[8]; result.sum += ptr->nums[9]; result.sum += ptr->nums[10]; result.sum += ptr->nums[11]; | |
result.sum += ptr->nums[12]; result.sum += ptr->nums[13]; result.sum += ptr->nums[14]; result.sum += ptr->nums[15]; | |
if constexpr (COUNT_EXTRA) { | |
result.ops += 16; | |
result.bytes_touched += element_size; | |
} | |
} | |
return result; | |
} | |
template<> | |
Result sum(Slice<matrix4x4> data, Slice<uint32_t> indices) { | |
constexpr size_t element_size = sizeof(matrix4x4); | |
assert(element_size == 16 * sizeof(int)); | |
Result result; | |
const auto begin = indices.begin(); | |
const auto end = indices.end(); | |
for (uint32_t idx : indices) { | |
auto const & nums = data[idx].nums; | |
result.sum += nums[0]; result.sum += nums[1]; result.sum += nums[2]; result.sum += nums[3]; | |
result.sum += nums[4]; result.sum += nums[5]; result.sum += nums[6]; result.sum += nums[7]; | |
result.sum += nums[8]; result.sum += nums[9]; result.sum += nums[10]; result.sum += nums[11]; | |
result.sum += nums[12]; result.sum += nums[13]; result.sum += nums[14]; result.sum += nums[15]; | |
if constexpr (COUNT_EXTRA) { | |
result.ops += 16; | |
result.bytes_touched += element_size; | |
} | |
} | |
return result; | |
} | |
// Only one version of this because I'm lazy.
// Increments all 16 ints of every matrix in `data` by one (modifying the data
// in place) and then adds the incremented values to the result. Unlike the
// plain sum kernels this one records its own start/end time stamps.
Result inc_sum(Slice<matrix4x4> data) {
    constexpr size_t element_size = sizeof(matrix4x4);
    static_assert(element_size == 16 * sizeof(int), "matrix4x4 must be exactly 16 packed ints");

    Result result;
    // Result::start/end are steady_clock time points; high_resolution_clock is
    // allowed to be a different clock, so sample steady_clock directly.
    result.start = std::chrono::steady_clock::now();

    const auto begin = data.begin();
    const auto end = data.end();
    for (auto ptr = begin; ptr < end; ++ptr) {
        auto& nums = ptr->nums;
        nums[0] += 1;  nums[1] += 1;  nums[2] += 1;  nums[3] += 1;
        nums[4] += 1;  nums[5] += 1;  nums[6] += 1;  nums[7] += 1;
        nums[8] += 1;  nums[9] += 1;  nums[10] += 1; nums[11] += 1;
        nums[12] += 1; nums[13] += 1; nums[14] += 1; nums[15] += 1;

        result.sum += nums[0];  result.sum += nums[1];  result.sum += nums[2];  result.sum += nums[3];
        result.sum += nums[4];  result.sum += nums[5];  result.sum += nums[6];  result.sum += nums[7];
        result.sum += nums[8];  result.sum += nums[9];  result.sum += nums[10]; result.sum += nums[11];
        result.sum += nums[12]; result.sum += nums[13]; result.sum += nums[14]; result.sum += nums[15];
        if constexpr (COUNT_EXTRA) {
            result.ops += 16;
            result.bytes_touched += element_size;
        }
    }

    result.end = std::chrono::steady_clock::now();
    return result;
}
template<> | |
Result sum(Slice<matrix4x4_simd> data) { | |
constexpr size_t element_size = sizeof(matrix4x4); | |
assert(element_size == 16 * sizeof(int)); | |
Result result; | |
const auto begin = data.begin(); | |
const auto end = data.end(); | |
__m256i simd_sum{ 0 }; | |
for (auto ptr = begin; ptr < end; ++ptr) { | |
int* data = ptr->nums.data(); | |
simd_sum = _mm256_add_epi32(simd_sum, _mm256_load_si256(reinterpret_cast<__m256i const*>(data))); | |
simd_sum = _mm256_add_epi32(simd_sum, _mm256_load_si256(reinterpret_cast<__m256i const*>(data + 8))); | |
if constexpr (COUNT_EXTRA) { | |
result.ops += 2; | |
result.bytes_touched += element_size; | |
} | |
} | |
result.sum += simd_sum.m256i_i32[0]; | |
result.sum += simd_sum.m256i_i32[1]; | |
result.sum += simd_sum.m256i_i32[2]; | |
result.sum += simd_sum.m256i_i32[3]; | |
result.sum += simd_sum.m256i_i32[4]; | |
result.sum += simd_sum.m256i_i32[5]; | |
result.sum += simd_sum.m256i_i32[6]; | |
result.sum += simd_sum.m256i_i32[7]; | |
return result; | |
} | |
template<> | |
Result sum(Slice<matrix4x4_simd> data, Slice<uint32_t> indices) { | |
constexpr size_t element_size = sizeof(matrix4x4); | |
assert(element_size == 16 * sizeof(int)); | |
Result result; | |
const auto begin = data.begin(); | |
const auto end = data.end(); | |
__m256i simd_sum{ 0 }; | |
for (auto idx : indices) { | |
int const* ptr = data[idx].nums.data(); | |
simd_sum = _mm256_add_epi32(simd_sum, _mm256_load_si256(reinterpret_cast<__m256i const*>(ptr))); | |
simd_sum = _mm256_add_epi32(simd_sum, _mm256_load_si256(reinterpret_cast<__m256i const*>(ptr + 8))); | |
if constexpr (COUNT_EXTRA) { | |
result.ops += 2; | |
result.bytes_touched += element_size; | |
} | |
} | |
result.sum += simd_sum.m256i_i32[0]; | |
result.sum += simd_sum.m256i_i32[1]; | |
result.sum += simd_sum.m256i_i32[2]; | |
result.sum += simd_sum.m256i_i32[3]; | |
result.sum += simd_sum.m256i_i32[4]; | |
result.sum += simd_sum.m256i_i32[5]; | |
result.sum += simd_sum.m256i_i32[6]; | |
result.sum += simd_sum.m256i_i32[7]; | |
return result; | |
} | |
template<> | |
Result sum(Slice<unique_matrix4x4> data) { | |
constexpr size_t element_size = sizeof(matrix4x4); | |
assert(element_size == 16 * sizeof(int)); | |
Result result; | |
const auto begin = data.begin(); | |
const auto end = data.end(); | |
for (auto ptr = begin; ptr < end; ++ptr) { | |
auto const& nums = ptr->ptr->nums; | |
result.sum += nums[0]; result.sum += nums[1]; result.sum += nums[2]; result.sum += nums[3]; | |
result.sum += nums[4]; result.sum += nums[5]; result.sum += nums[6]; result.sum += nums[7]; | |
result.sum += nums[8]; result.sum += nums[9]; result.sum += nums[10]; result.sum += nums[11]; | |
result.sum += nums[12]; result.sum += nums[13]; result.sum += nums[14]; result.sum += nums[15]; | |
if constexpr (COUNT_EXTRA) { | |
result.ops += 16; | |
result.bytes_touched += element_size; | |
} | |
} | |
return result; | |
} | |
template<> | |
Result sum(Slice<unique_matrix4x4> data, Slice<uint32_t> indices) { | |
constexpr size_t element_size = sizeof(matrix4x4); | |
assert(element_size == 16 * sizeof(int)); | |
Result result; | |
const auto begin = indices.begin(); | |
const auto end = indices.end(); | |
for (uint32_t idx : indices) { | |
auto const& nums = data[idx].ptr->nums; | |
result.sum += nums[0]; result.sum += nums[1]; result.sum += nums[2]; result.sum += nums[3]; | |
result.sum += nums[4]; result.sum += nums[5]; result.sum += nums[6]; result.sum += nums[7]; | |
result.sum += nums[8]; result.sum += nums[9]; result.sum += nums[10]; result.sum += nums[11]; | |
result.sum += nums[12]; result.sum += nums[13]; result.sum += nums[14]; result.sum += nums[15]; | |
if constexpr (COUNT_EXTRA) { | |
result.ops += 16; | |
result.bytes_touched += element_size; | |
} | |
} | |
return result; | |
} | |
// Numerical constants
// Decimal multipliers, used for time unit conversions.
constexpr int64_t THOUSAND = 1000;
constexpr int64_t MILLION = THOUSAND * THOUSAND;
constexpr int64_t BILLION = MILLION * THOUSAND;

// Binary multipliers, used for byte counts.
constexpr size_t KILO = 1024;
constexpr size_t MEGA = KILO * 1024;
constexpr size_t GIGA = MEGA * 1024;

// Byte counts in this file are multiples of GIGA (up to GIGA * 32); with a
// 32-bit size_t those products would silently wrap, so refuse to build there.
static_assert(sizeof(size_t) >= 8, "this benchmark requires a 64-bit size_t");

// System constants
// Hard-coded cache sizes and core count in bytes / cores.
// NOTE(review): presumably these describe the author's test machine - adjust
// for the machine being measured.
constexpr size_t CACHE_LINE = 64;
constexpr size_t L1_CORE = KILO * 32;
constexpr size_t L2_CORE = KILO * 256;
constexpr size_t L3_CORE = MEGA * 2;
constexpr size_t NUM_CORES = 6;
// Time utilities
// Each helper returns the time elapsed between `start` and now, truncated to
// whole units. "Now" is read from steady_clock so that it matches the clock of
// `start`; high_resolution_clock may alias a different clock, in which case
// subtracting the two time points does not compile.

// Elapsed whole nanoseconds since `start`.
inline auto ns_since(std::chrono::time_point<std::chrono::steady_clock> start) {
    const auto elapsed = std::chrono::steady_clock::now() - start;
    return std::chrono::duration_cast<std::chrono::nanoseconds>(elapsed).count();
}

// Elapsed whole microseconds since `start`.
inline auto us_since(std::chrono::time_point<std::chrono::steady_clock> start) {
    const auto elapsed = std::chrono::steady_clock::now() - start;
    return std::chrono::duration_cast<std::chrono::microseconds>(elapsed).count();
}

// Elapsed whole milliseconds since `start`.
inline auto ms_since(std::chrono::time_point<std::chrono::steady_clock> start) {
    const auto elapsed = std::chrono::steady_clock::now() - start;
    return std::chrono::duration_cast<std::chrono::milliseconds>(elapsed).count();
}
// Returns the whole milliseconds elapsed since `start` and then overwrites
// `start` with the current time, so consecutive calls measure consecutive
// intervals. Reads steady_clock so the clock matches the type of `start`.
inline auto ms_since_and_reset(std::chrono::time_point<std::chrono::steady_clock> & start) {
    const auto now = std::chrono::steady_clock::now();
    const auto result = std::chrono::duration_cast<std::chrono::milliseconds>(now - start).count();
    start = now;
    return result;
}
// Converts nanoseconds to whole milliseconds, truncating toward zero.
inline auto ns_to_ms(int64_t ns) {
    constexpr int64_t ns_per_ms = 1000 * 1000;
    return ns / ns_per_ms;
}
template<typename T, typename GEN> | |
std::vector<T> generate_elements(size_t num_elements, GEN gen) { | |
std::cout << "Begin alloc. Elements: [" << num_elements << "] Bytes: [" << num_elements * sizeof(T) << "]"; | |
auto time_start = high_resolution_clock::now(); | |
std::vector<T> result; | |
result.reserve(num_elements); | |
for (auto i = 0; i < num_elements; ++i) { | |
result.push_back(gen()); | |
} | |
std::cout << " ... completed in " << ms_since(time_start) << " ms" << std::endl; | |
return result; | |
} | |
template<typename T, typename GEN> | |
std::vector<T> generate_bytes(size_t num_bytes, GEN gen) { | |
if (num_bytes % sizeof(T) != 0) | |
std::abort(); | |
auto num_elements = num_bytes / sizeof(T); | |
return generate_elements<T,GEN>(num_elements, gen); | |
} | |
template<typename T> | |
std::vector<T> generate_shuffled_indices(size_t count) { | |
std::vector<T> result; | |
result.reserve(count); | |
std::vector<T> temp; | |
temp.reserve(count); | |
// Fill temp | |
for (size_t i = 0; i < count; ++i) { | |
temp.push_back((T)i); | |
} | |
// Shuffle into result | |
while (count > 1) { | |
// Copy random value | |
auto index = static_rng() % count; | |
auto value = temp[index]; | |
result.push_back(value); | |
// Replace value with last value and decrement size | |
temp[index] = temp[count - 1]; | |
count -= 1; | |
} | |
result.push_back(temp[0]); | |
return result; | |
} | |
template<typename T> | |
std::vector<Slice<T>> generate_slices(Slice<T> data, size_t num_slices) { | |
std::vector<Slice<T>> result; | |
size_t slice_len = data.size() / num_slices; | |
for (auto i = 0; i < num_slices - 1; ++i) { | |
auto begin = data.begin() + slice_len * i; | |
auto end = begin + slice_len; | |
result.emplace_back(begin, end); | |
} | |
result.emplace_back(data.begin() + slice_len * (num_slices - 1), data.end()); | |
return result; | |
} | |
template<typename T> | |
std::vector<Slice<T>> generate_slices(Slice<T> data, size_t num_slices, size_t elements_per_slice) { | |
if (elements_per_slice * num_slices > data.size()) | |
std::abort(); | |
std::vector<Slice<T>> result; | |
for (auto i = 0; i < num_slices; ++i) { | |
auto begin = data.begin() + elements_per_slice * i; | |
auto end = begin + elements_per_slice; | |
result.emplace_back(begin, end); | |
} | |
return result; | |
} | |
template<typename T> | |
Result sum(Slice<T> slice, size_t elements_to_read) { | |
Result result; | |
auto start = std::chrono::high_resolution_clock::now(); | |
for (;;) { | |
if (elements_to_read > slice.size()) { | |
result += sum(slice); | |
elements_to_read -= slice.size(); | |
} else { | |
if (elements_to_read > 0) { | |
auto begin = slice.begin(); | |
auto end = begin + elements_to_read; | |
auto subslice = Slice<T>(begin, end); | |
result += sum(subslice); | |
} | |
break; | |
} | |
} | |
auto end = std::chrono::high_resolution_clock::now(); | |
result.start = start; | |
result.end = end; | |
return result; | |
} | |
template<typename T> | |
Result sum(Slice<T> data_slice, Slice<uint32_t> indices, size_t elements_to_read) { | |
if (indices.is_empty()) | |
return sum(data_slice, elements_to_read); | |
Result result; | |
size_t num_indices = indices.size(); | |
auto start = std::chrono::high_resolution_clock::now(); | |
for (;;) { | |
if (elements_to_read > num_indices) { | |
result += sum(data_slice, indices); | |
elements_to_read -= num_indices; | |
} | |
else { | |
if (elements_to_read > 0) { | |
auto begin = indices.begin(); | |
auto end = begin + elements_to_read; | |
auto indices_subslice = Slice<uint32_t>(begin, end); | |
result += sum(data_slice, indices_subslice); | |
} | |
break; | |
} | |
} | |
auto end = std::chrono::high_resolution_clock::now(); | |
result.start = start; | |
result.end = end; | |
return result; | |
} | |
template<typename T, typename OP> | |
void run(ofstream& json, std::string type_name, Slice<T> full_data, Slice<uint32_t> indices, Slice<size_t> block_sizes, Slice<size_t> bytes_to_read_slice, Slice<size_t> num_threads, size_t num_loops, OP op){ | |
if (!block_sizes.is_empty() && block_sizes.size() != bytes_to_read_slice.size()) | |
std::abort(); | |
const size_t element_size = sizeof(T); | |
std::string block_str = block_sizes.is_empty() ? "Large Block" : "Small Blocks"; | |
std::string access_str = indices.is_empty() ? "Sequential Access" : "Random Access"; | |
std::string access_label = indices.is_empty() ? "seq" : "rand"; | |
json << " {" << std::endl; | |
json << " \"title\": \"" << type_name << " - " << block_str << " - " << access_str << "\"," << std::endl; | |
json << " \"lines\": [" << std::endl; | |
// For each block_size | |
for (auto idx = 0; idx < bytes_to_read_slice.size(); ++idx) { | |
const auto bytes_to_read = bytes_to_read_slice[idx]; | |
const auto block_size = block_sizes.is_empty() ? full_data.size() : block_sizes[idx]; | |
const size_t elements_to_read = bytes_to_read / element_size; | |
bool block_per_thread = !block_sizes.is_empty(); | |
size_t label_num = block_sizes.is_empty() ? bytes_to_read : block_size; | |
if (block_per_thread) { | |
std::cout << " Block Size: " << block_sizes[idx] << std::endl; | |
} | |
json << " {" << std::endl; | |
json << " \"num_bytes\": " << bytes_to_read << "," << std::endl; | |
json << " \"label\": \"" << str_from_bytes(label_num) << " " << access_label << "\"," << std::endl; | |
json << " \"thread_times\": [" << std::endl; | |
// For 1..num_threads | |
for (auto thread_count : num_threads) { | |
// Compute data slices | |
std::vector<Slice<T>> data_slices = {}; | |
std::vector<Slice<uint32_t>> index_slices = {}; | |
std::vector<uint32_t> block_indices = {}; | |
// Compute index slices | |
if (block_per_thread == false) { | |
if (indices.is_empty()) { | |
// many data slices, no indices | |
data_slices = generate_slices(full_data, thread_count); | |
} | |
else if (!indices.is_empty()) { | |
// one data slice, many indices slices | |
data_slices = { full_data }; | |
index_slices = generate_slices<uint32_t>(indices, thread_count); | |
} | |
} | |
else { | |
const size_t elements_per_block = block_size / element_size; | |
if ( indices.is_empty()) { | |
// many data slices, no indices | |
data_slices = generate_slices(full_data, thread_count, elements_per_block); | |
} | |
else if (!indices.is_empty()) { | |
// many data slices, new indices | |
data_slices = generate_slices(full_data, thread_count, elements_per_block); | |
auto len = data_slices[0].size(); | |
block_indices = generate_shuffled_indices<uint32_t>(len); | |
index_slices = { block_indices }; | |
} | |
} | |
std::cout << std::flush << " " << thread_count << " Threads: "; | |
json << " ["; // thread entry | |
// Run loop_num times | |
int64_t total_time = 0; | |
Result result; | |
for (auto loop_num = 0; loop_num < num_loops; ++loop_num) { | |
// Create threads | |
std::vector<std::future<Result>> futures; | |
std::atomic_bool spin_block = true; | |
std::atomic_size_t latch = thread_count; | |
for (size_t thread = 0; thread < thread_count; ++thread) { | |
auto data_slice = data_slices.size() > 1 ? data_slices[thread] : data_slices[0]; | |
Slice<uint32_t> index_slice = index_slices.empty() ? Slice<uint32_t>() : (index_slices.size() > 1 ? index_slices[thread] : index_slices[0]); | |
// Compute elements to read for this thread | |
// Thread 0 gets remainder | |
size_t elements_to_read_for_thread = elements_to_read / thread_count; | |
if (thread == 0) | |
elements_to_read_for_thread += elements_to_read % thread_count; | |
futures.push_back(std::async(std::launch::async, [data_slice, index_slice, op, elements_to_read_for_thread, &latch, &spin_block](){ | |
// Signal Ready | |
--latch; | |
// Spin until told to go | |
while (spin_block) {}; | |
// Do work | |
return op(data_slice, index_slice, elements_to_read_for_thread); | |
})); | |
} | |
// Wait until all threads are ready to go | |
while (latch > 0) {}; | |
// Release the kraken! | |
spin_block = false; | |
// Run to complete | |
auto time_start = high_resolution_clock::now(); | |
for (auto& future : futures) | |
result += future.get(); | |
auto elapsed_ns = ns_since(time_start); | |
// Print results | |
total_time += elapsed_ns; | |
std::cout << ns_to_ms(elapsed_ns) << " " << std::flush; | |
json << elapsed_ns; | |
if (loop_num != num_loops - 1) | |
json << ", "; | |
} | |
double avg_ns = (double)total_time / num_loops; | |
double avg_ms = avg_ns / MILLION; | |
double gigabytes_per_sec = ((double)bytes_to_read / GIGA) / (avg_ns / BILLION); | |
result.bytes_touched /= num_loops; | |
result.ops /= num_loops; | |
std::cout << " Speed: (" << gigabytes_per_sec << ") AvgMS: (" << avg_ms << ") BytesTouched: (" << result.bytes_touched << ") Ops: (" << result.ops << ") Sum: (" << result.sum << ")" << std::endl; | |
if constexpr (COUNT_EXTRA) { | |
if (result.bytes_touched != bytes_to_read) { | |
std::cout << "Expected to touch [" << bytes_to_read << "] bytes but actually touched [" << result.bytes_touched << "]" << std::endl; | |
} | |
} | |
json << "]," << std::endl; // close thread entry | |
} | |
json << " ]" << std::endl; // close thread_times | |
json << " }," << std::endl; // close line entry | |
std::cout << std::endl; | |
} | |
json << " ]" << std::endl; // close lines | |
json << " }," << std::endl; // close plot | |
} | |
template<typename T> | |
void run_suite( | |
ofstream & json, | |
std::string type_str, | |
size_t num_loops, | |
size_t num_threads, | |
size_t bytes_to_generate, | |
Slice<size_t> block_sizes, | |
Slice<size_t> bytes_to_read_seq, | |
Slice<size_t> bytes_to_read_rand) | |
{ | |
std::cout << "-----------------------\n" << type_str << "\n----------------------" << std::endl; | |
// Generate shared data | |
std::vector<size_t> num_generated_bytes = { bytes_to_generate }; | |
Slice<size_t> num_generated_bytes_slice = { num_generated_bytes }; | |
std::vector<T> elements = generate_bytes<T>(bytes_to_generate, []() { return generate_random_element<T>(); }); | |
auto elements_slice = Slice<T>(elements); | |
std::cout << "Generating indices ..." << std::flush; | |
auto time_start = high_resolution_clock::now(); | |
std::vector<uint32_t> indices = generate_shuffled_indices<uint32_t>(elements.size()); | |
auto indices_slice = Slice<uint32_t>(indices); | |
std::cout << " ... completed in " << ms_since(time_start) << " ms" << std::endl << std::endl; | |
std::vector<size_t> threads; | |
for (size_t i = 0; i < num_threads; ++i) | |
threads.push_back(i + 1); | |
auto threads_slice = Slice<size_t>(threads); | |
auto op_sum = [](Slice<T> slice, Slice<uint32_t> indices, size_t num_loops) { | |
return ::sum(slice, indices, num_loops); | |
}; | |
//auto op_inc_sum = [](Slice<T> slice, Slice<uint32_t> indices, size_t num_loops) { | |
// return ::inc_sum(slice, indices, num_loops); | |
//}; | |
std::vector<size_t> four_gb{ GIGA * 4 }; | |
Slice<size_t> slice_four_gb(four_gb); | |
std::vector<size_t> one_gb{ GIGA }; | |
Slice<size_t> slice_one_gb(one_gb); | |
// data=4gb, read=4gb, threads=1..N, access=seq, op=sum | |
if (true) { | |
std::string graph_title = type_str + " - Large Block - Sequential Access"; | |
std::cout << graph_title << std::endl; | |
run(json, type_str, elements_slice, Slice<uint32_t>(), Slice<size_t>(), slice_four_gb, threads_slice, num_loops, op_sum); | |
} | |
// data=4gb, read=4gb, threads=1..N, access=rng, op=sum | |
if (true) { | |
std::cout << "Large Block - Random Access" << std::endl; | |
run(json, type_str, elements_slice, indices_slice, Slice<size_t>(), one_gb, threads_slice, num_loops, op_sum); | |
} | |
// data=L1..RAM, read=4gb, threads=1..N, access=seq, op=sum | |
if (true) { | |
std::cout << "Small Blocks - Sequential Access" << std::endl; | |
run(json, type_str, elements_slice, Slice<uint32_t>(), block_sizes, bytes_to_read_seq, threads_slice, num_loops, op_sum); | |
} | |
// data=L1..RAM, read=4gb, threads=1..N, access=rng, op=sum | |
if (true) { | |
std::cout << "Small Blocks - Random Access" << std::endl; | |
run(json, type_str, elements_slice, indices_slice, block_sizes, bytes_to_read_rand, threads_slice, num_loops, op_sum); | |
} | |
if (true) { | |
} | |
} | |
int main() | |
{ | |
std::string filename = "data.json"; | |
std::ofstream json(filename, std::ios::binary); | |
json << "plots = [" << std::endl; | |
// Default values | |
size_t bytes_to_generate = GIGA; | |
size_t num_loops = 12; | |
size_t num_threads = 12; | |
constexpr size_t BLOCK_COUNT = 8; | |
std::array<size_t, BLOCK_COUNT> block_sizes_arr = { L1_CORE / 2, L1_CORE, L2_CORE / 2, L2_CORE, L3_CORE / 2, L3_CORE, L3_CORE * 12, MEGA * 64 }; | |
auto block_sizes = Slice<size_t>::from_array<>(block_sizes_arr); | |
std::array<size_t, BLOCK_COUNT> bytes_to_read_seq_arr = { GIGA * 32, GIGA * 32, GIGA * 16, GIGA * 16, GIGA * 8, GIGA * 8, GIGA * 4, GIGA * 4 }; // for sequential access | |
auto bytes_to_read_seq = Slice<size_t>::from_array<>(bytes_to_read_seq_arr); | |
std::array<size_t, BLOCK_COUNT> bytes_to_read_rand_arr = { GIGA * 4, GIGA * 4, GIGA * 2, GIGA * 2, GIGA * 1, GIGA * 1, GIGA / 2, GIGA / 2 }; // for random access | |
auto bytes_to_read_rand = Slice<size_t>::from_array<>(bytes_to_read_rand_arr); | |
if (true) { | |
run_suite<int>(json, "int32", num_loops, num_threads, bytes_to_generate, block_sizes, bytes_to_read_seq, bytes_to_read_rand); | |
} | |
if (true) { | |
run_suite<matrix4x4>(json, "matrix4x4", num_loops, num_threads, bytes_to_generate, block_sizes, bytes_to_read_seq, bytes_to_read_rand); | |
} | |
if (true) { | |
run_suite<matrix4x4_simd>(json, "matrix4x4_simd", num_loops, num_threads, bytes_to_generate, block_sizes, bytes_to_read_seq, bytes_to_read_rand); | |
} | |
if (true) { | |
// matrix4x4_unique is super slow. so reduce size/times | |
num_threads = std::min(num_threads, (size_t)6); | |
num_loops = std::min(num_loops, (size_t)6); | |
constexpr size_t seq_size = GIGA * 4; | |
std::array<size_t, BLOCK_COUNT> bytes_to_read_seq_arr = { seq_size, seq_size, seq_size, seq_size, seq_size, seq_size, seq_size, seq_size }; | |
auto bytes_to_read_seq = Slice<size_t>::from_array<>(bytes_to_read_seq_arr); | |
constexpr size_t rand_size = GIGA; | |
std::array<size_t, BLOCK_COUNT> bytes_to_read_rand_arr = { rand_size, rand_size, rand_size, rand_size, rand_size / 2, rand_size / 2, rand_size / 4, rand_size / 4 }; // for random access | |
auto bytes_to_read_rand = Slice<size_t>::from_array<>(bytes_to_read_rand_arr); | |
run_suite<unique_matrix4x4>(json, "matrix4x4_unique", num_loops, num_threads, bytes_to_generate, block_sizes, bytes_to_read_seq, bytes_to_read_rand); | |
} | |
json << "]" << std::endl; // close plots | |
json.close(); | |
} | |
Yes, that's for MSVC. The GCC/Clang equivalent is __attribute__((always_inline)).
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Is "__forceinline" a compiler extension? Clang complains about it.
Also: