Last active
February 13, 2017 18:08
-
-
Save duarten/6c6a65f66ee61250be9626bea4549e54 to your computer and use it in GitHub Desktop.
Measuring the effects of software prefetches.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <algorithm>
#include <atomic>
#include <chrono>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <limits>
#include <memory>
#include <random>
#include <vector>
using namespace std::chrono_literals; | |
// Reads the x86 time-stamp counter with the `rdtsc` instruction and returns
// it as one 64-bit value. The instruction hands the counter back in two
// halves: the low 32 bits in EAX ("=a") and the high 32 bits in EDX ("=d").
// No serializing instruction is issued around the read.
static inline uint64_t rdtsc() {
    uint32_t low_half;
    uint32_t high_half;
    __asm__ __volatile__("rdtsc" : "=a"(low_half), "=d"(high_half));
    // Widen before shifting so the high half is not shifted out of 32 bits.
    return (static_cast<uint64_t>(high_half) << 32) | low_half;
}
// Rounds the integer `v` up to the nearest multiple of `align`.
// The mask arithmetic below is only correct when `align` is a non-zero
// power of two; other values give meaningless results.
template <typename T>
inline constexpr
T align_up(T v, T align) {
    return (v + align - 1) & ~(align - 1);
}

// Rounds a byte pointer up to the next address that is a multiple of `align`
// (a non-zero power of two). Only pointers to 1-byte types are accepted so
// that the returned pointer cannot silently be misaligned for its pointee.
//
// Not constexpr: the body consists of reinterpret_casts, which can never be
// evaluated in a constant expression.
template <typename T>
inline
T* align_up(T* v, size_t align) {
    static_assert(sizeof(T) == 1, "align byte pointers only");
    const auto address = reinterpret_cast<uintptr_t>(v);
    // The template argument is spelled out because uintptr_t and size_t are
    // distinct types on some platforms, which would make deduction of T in
    // the integer overload ambiguous and the call ill-formed.
    return reinterpret_cast<T*>(align_up<uintptr_t>(address, align));
}
// One element of the benchmark's working set.
//
// Each `valueN` is followed by seven uint64_t of padding, so consecutive
// value fields sit 64 bytes apart and the whole element spans 256 bytes.
// NOTE(review): 64 bytes is presumably the target's cache-line size, which
// would put every value on its own line - confirm for the machine under test.
struct data {
    uint64_t value1;
    uint64_t pad1[7];
    uint64_t value2;
    uint64_t pad2[7];
    uint64_t value3;
    uint64_t pad3[7];
    uint64_t value4;
    uint64_t pad4[7];
};

// Lock the layout in: the measurements are meaningless if a later edit
// changes the spacing between the value fields.
static_assert(sizeof(data) == 256, "data must span exactly four 64-byte slots");
static_assert(offsetof(data, value1) == 0, "value1 must start the first 64-byte slot");
static_assert(offsetof(data, value2) == 64, "value2 must start the second 64-byte slot");
static_assert(offsetof(data, value3) == 128, "value3 must start the third 64-byte slot");
static_assert(offsetof(data, value4) == 192, "value4 must start the fourth 64-byte slot");
// Random-number state shared by the benchmark routines.
//   rd    - supplies the single seed value used below.
//   eng   - Mersenne Twister engine seeded once from `rd`.
//   distr - maps engine output onto ints in [0, INT_MAX].
std::random_device rd;
std::mt19937 eng{rd()};
std::uniform_int_distribution<int> distr{0, std::numeric_limits<int>::max()};
uint64_t do_some_work(uint64_t current, uint64_t next) { | |
int64_t x = current + next; | |
std::vector<int32_t> rs = { distr(eng) + distr(eng) + distr(eng) + distr(eng) }; | |
for (auto y : rs) { | |
x += y; | |
} | |
std::reverse(rs.begin(), rs.end()); | |
for (auto y : rs) { | |
x -= y; | |
} | |
return uint64_t(x - current); | |
} | |
uint64_t loop_body(uint32_t current, volatile data* x, const uint32_t& num_values) { | |
auto next = distr(eng) % num_values; | |
std::atomic_signal_fence(std::memory_order_seq_cst); | |
__builtin_prefetch((void*)&x[next], 0, 3); | |
//__builtin_prefetch((void*)&x[next].value2, 0, 3); | |
//__builtin_prefetch((void*)&x[next].value3, 0, 3); | |
//__builtin_prefetch((void*)&x[next].value4, 0, 3); | |
std::atomic_signal_fence(std::memory_order_seq_cst); | |
//return do_some_work(x[current].value2, next); // ~500ns | |
return do_some_work(x[current].value1 + x[current].value2 + x[current].value3 + x[current].value4, next); // ~500ns | |
} | |
int main() { | |
const uint64_t mem = 6l * 1024 * 1024 * 1024; | |
const uint32_t ops = 50'000'000; | |
auto r = reinterpret_cast<char*>(malloc(mem)); | |
auto ar = align_up(r, 64); | |
uint32_t num_values = ((r + mem) - ar) / sizeof(data); | |
volatile data* x = (volatile data*) ar; | |
for (uint64_t i = 0; i < num_values; ++i) { | |
x[i].value1 = i; | |
x[i].value2 = i; | |
x[i].value3 = i; | |
x[i].value4 = i; | |
} | |
uint64_t start = rdtsc(); | |
uint32_t current = distr(eng) % num_values; | |
for (uint32_t i = 0; i < ops; ++i) { | |
current = loop_body(current, x, num_values); | |
} | |
uint64_t stop = rdtsc(); | |
printf("%lu mticks\n", (stop - start) / 1000000); | |
printf("%lu ticks/op\n", (stop - start) / ops); | |
return 1; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
perf record -e L1-dcache-load-misses taskset -c 1 ./prefetch