Skip to content

Instantly share code, notes, and snippets.

@duarten
Last active February 13, 2017 18:08
Show Gist options
  • Save duarten/6c6a65f66ee61250be9626bea4549e54 to your computer and use it in GitHub Desktop.
Save duarten/6c6a65f66ee61250be9626bea4549e54 to your computer and use it in GitHub Desktop.
Measuring the effects of software prefetches.
#include <algorithm>
#include <atomic>
#include <chrono>
#include <cinttypes>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <limits>
#include <random>
#include <vector>
using namespace std::chrono_literals;
// Executes the x86 `rdtsc` instruction and returns the two register halves
// it produces combined into one 64-bit tick count.
static __inline__ uint64_t rdtsc() {
    uint64_t low_half;
    uint64_t high_half;
    // The "=a" / "=d" output constraints bind the A and D registers, where
    // `rdtsc` leaves the low and high halves of the counter respectively.
    __asm__ __volatile__("rdtsc" : "=a"(low_half), "=d"(high_half));
    const uint64_t upper_bits = high_half << 32;
    return upper_bits | low_half;
}
// Rounds `v` up to the next multiple of `align` (returns `v` itself when it
// is already a multiple). The mask trick only produces a multiple when
// `align` is a power of two.
template <typename T>
inline constexpr
T align_up(T v, T align) {
    const auto low_bits = align - 1;
    return (v + low_bits) & ~low_bits;
}

// Pointer overload: rounds the address stored in `v` up to a multiple of
// `align` by delegating to the integer overload. Restricted at compile time
// to pointers whose pointee is one byte wide.
template <typename T>
inline constexpr
T* align_up(T* v, size_t align) {
    static_assert(sizeof(T) == 1, "align byte pointers only");
    const auto address = reinterpret_cast<uintptr_t>(v);
    return reinterpret_cast<T*>(align_up(address, align));
}
// Record type the benchmark reads and prefetches.
// Holds four 64-bit values, each followed by seven 64-bit padding words
// (56 bytes), so consecutive values lie exactly 64 bytes apart and one record
// spans 256 bytes.
// NOTE(review): the 64-byte stride presumably matches the target CPU's
// cache-line size, putting every value on its own line — confirm for the
// machine being measured.
struct data {
uint64_t value1;
uint64_t pad1[7]; // padding only; nothing in this file reads or writes it
uint64_t value2;
uint64_t pad2[7]; // padding only
uint64_t value3;
uint64_t pad3[7]; // padding only
uint64_t value4;
uint64_t pad4[7]; // padding only; rounds the record up to 256 bytes
};
// File-wide random number state.
//   rd    - std::random_device, used once to obtain a seed
//   eng   - Mersenne Twister engine seeded from `rd`
//   distr - uniform distribution over the ints in [0, INT_MAX]
std::random_device rd;
std::mt19937 eng{rd()};
std::uniform_int_distribution<int> distr{0, std::numeric_limits<int>::max()};
uint64_t do_some_work(uint64_t current, uint64_t next) {
int64_t x = current + next;
std::vector<int32_t> rs = { distr(eng) + distr(eng) + distr(eng) + distr(eng) };
for (auto y : rs) {
x += y;
}
std::reverse(rs.begin(), rs.end());
for (auto y : rs) {
x -= y;
}
return uint64_t(x - current);
}
uint64_t loop_body(uint32_t current, volatile data* x, const uint32_t& num_values) {
auto next = distr(eng) % num_values;
std::atomic_signal_fence(std::memory_order_seq_cst);
__builtin_prefetch((void*)&x[next], 0, 3);
//__builtin_prefetch((void*)&x[next].value2, 0, 3);
//__builtin_prefetch((void*)&x[next].value3, 0, 3);
//__builtin_prefetch((void*)&x[next].value4, 0, 3);
std::atomic_signal_fence(std::memory_order_seq_cst);
//return do_some_work(x[current].value2, next); // ~500ns
return do_some_work(x[current].value1 + x[current].value2 + x[current].value3 + x[current].value4, next); // ~500ns
}
// Benchmark driver.
//
// Allocates a 6 GiB buffer, views its 64-byte-aligned part as an array of
// `data` records, writes each record's four values, then runs `ops`
// iterations of loop_body() — each iteration yields the index used by the
// next one — and prints the elapsed rdtsc() ticks in total (millions) and
// per iteration.
//
// Returns 0 on success and 1 if the buffer cannot be allocated.
int main() {
    // `ull` keeps the product in 64 bits even on platforms where `long` is
    // only 32 bits wide.
    const uint64_t mem = 6ull * 1024 * 1024 * 1024;
    const uint32_t ops = 50'000'000;

    char* const r = static_cast<char*>(std::malloc(mem));
    if (r == nullptr) {
        // A 6 GiB request can easily fail; report it instead of writing
        // through a null pointer below.
        std::fprintf(stderr, "failed to allocate %" PRIu64 " bytes\n", mem);
        return 1;
    }

    // Skip forward to the first 64-byte boundary and count the whole records
    // that fit between there and the end of the allocation.
    char* const ar = align_up(r, 64);
    const uint32_t num_values = static_cast<uint32_t>(((r + mem) - ar) / sizeof(data));
    volatile data* x = reinterpret_cast<volatile data*>(ar);

    // Write every record once before the timed section starts.
    for (uint64_t i = 0; i < num_values; ++i) {
        x[i].value1 = i;
        x[i].value2 = i;
        x[i].value3 = i;
        x[i].value4 = i;
    }

    const uint64_t start = rdtsc();
    uint32_t current = distr(eng) % num_values;
    for (uint32_t i = 0; i < ops; ++i) {
        // loop_body() returns an index below num_values, so narrowing back to
        // 32 bits loses nothing.
        current = static_cast<uint32_t>(loop_body(current, x, num_values));
    }
    const uint64_t stop = rdtsc();

    // PRIu64 is the portable conversion for uint64_t ("%lu" is only correct
    // where `unsigned long` happens to be 64 bits).
    std::printf("%" PRIu64 " mticks\n", (stop - start) / 1000000);
    std::printf("%" PRIu64 " ticks/op\n", (stop - start) / ops);

    std::free(r);
    // A completed run is a success; a non-zero status would tell the shell
    // and calling scripts that the benchmark failed.
    return 0;
}
@duarten
Copy link
Author

duarten commented Feb 13, 2017

perf record -e L1-dcache-load-misses taskset -c 1 ./prefetch

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment