Skip to content

Instantly share code, notes, and snippets.

@lemire
Last active September 12, 2018 03:50
Show Gist options
  • Save lemire/7838881cc01df5023bb55a2653f793bc to your computer and use it in GitHub Desktop.
Save lemire/7838881cc01df5023bb55a2653f793bc to your computer and use it in GitHub Desktop.
If you have M array accesses, how many cache misses can you have? What about 1.8 M cache misses...?
//
// $ g++ -O3 -o cachetest cachetest.cpp
// $ ./cachetest
// cycles = 349189333 (cycles per key 34.919) instructions = 190000063 (ins/key 19.000,ins/cycles 0.544) cache misses = 17385995 (misses per keys 1.739) branch misses = 6 (misses per keys 0.000)
//
// start helper header
// https://github.com/WojciechMula/toys/blob/master/000helpers/linux-perf-events.h
#ifdef __linux__
#include <asm/unistd.h> // for __NR_perf_event_open
#include <linux/perf_event.h> // for perf event constants
#include <sys/ioctl.h> // for ioctl
#include <unistd.h> // for syscall
#include <stdint.h>
#include <stdlib.h>
#include <cerrno> // for errno
#include <cstring> // for memset
#include <stdexcept>
#include <vector>
template <int TYPE = PERF_TYPE_HARDWARE> class LinuxEvents {
int fd;
perf_event_attr attribs;
int num_events;
std::vector<uint64_t> temp_result_vec;
std::vector<uint64_t> ids;
public:
LinuxEvents(std::vector<int> config_vec) : fd(0) {
memset(&attribs, 0, sizeof(attribs));
attribs.type = TYPE;
attribs.size = sizeof(attribs);
attribs.disabled = 1;
attribs.exclude_kernel = 1;
attribs.exclude_hv = 1;
attribs.sample_period = 0;
attribs.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID;
const int pid = 0; // the current process
const int cpu = -1; // all CPUs
const unsigned long flags = 0;
int group = -1; // no group
num_events = config_vec.size();
uint32_t i = 0;
for (auto config : config_vec) {
attribs.config = config;
fd = syscall(__NR_perf_event_open, &attribs, pid, cpu, group, flags);
if (fd == -1) {
report_error("perf_event_open");
}
ioctl(fd, PERF_EVENT_IOC_ID, &ids[i++]);
if (group == -1) {
group = fd;
}
}
temp_result_vec.resize(num_events * 2 + 1);
}
~LinuxEvents() { close(fd); }
inline void start() {
if (ioctl(fd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP) == -1) {
report_error("ioctl(PERF_EVENT_IOC_RESET)");
}
if (ioctl(fd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP) == -1) {
report_error("ioctl(PERF_EVENT_IOC_ENABLE)");
}
}
inline void end(std::vector<unsigned long long> &results) {
if (ioctl(fd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP) == -1) {
report_error("ioctl(PERF_EVENT_IOC_DISABLE)");
}
if (read(fd, &temp_result_vec[0], temp_result_vec.size() * 8) == -1) {
report_error("read");
}
// our actual results are in slots 1,3,5, ... of this structure
// we really should be checking our ids obtained earlier to be safe
for (uint32_t i = 1; i < temp_result_vec.size(); i += 2) {
results[i / 2] = temp_result_vec[i];
}
}
private:
void report_error(const std::string &context) {
throw std::runtime_error(context + ": " + std::string(strerror(errno)));
}
};
#endif
// end of header
#include <stdint.h>
#include <stdio.h>
#include <iostream>
#include <vector>
using namespace std;
static uint64_t murmur64(uint64_t h) {
h ^= h >> 33;
h *= UINT64_C(0xff51afd7ed558ccd);
h ^= h >> 33;
h *= UINT64_C(0xc4ceb9fe1a85ec53);
h ^= h >> 33;
return h;
}
int main() {
const size_t N = 100000000;
uint64_t * array = new uint64_t[N];
for(size_t i = 0; i < N; i++) array[i] = i;
vector<int> evts;
evts.push_back(PERF_COUNT_HW_CPU_CYCLES);
evts.push_back(PERF_COUNT_HW_INSTRUCTIONS);
evts.push_back(PERF_COUNT_HW_CACHE_MISSES);
evts.push_back(PERF_COUNT_HW_BRANCH_MISSES);
LinuxEvents<PERF_TYPE_HARDWARE> unified(evts);
vector<unsigned long long> results;
results.resize(evts.size());
cout << endl;
int seed = rand();
unified.start();
size_t M = 10000000;
size_t count = 0;
for(uint64_t j = 0; j < M; j++)
count += array[((uint32_t) murmur64(j + seed) * N)>>32 ];
unified.end(results);
printf("cycles = %10.zu (cycles per key %10.3f) instructions = %10.zu (ins/key %10.3f,ins/cycles %10.3f) cache misses = %10.zu (misses per keys %10.3f) branch misses = %10.zu (misses per keys %10.3f) \n",
(size_t)results[0], results[0]*1.0/M, (size_t)results[1], results[1]*1.0/M , results[1]*1.0/results[0], (size_t)results[2], results[2]*1.0/M,
(size_t)results[3], results[3] * 1.0/M);
printf("%d \n", (uint32_t)count);
uint32_t * precomp = new uint32_t[M];
for(uint64_t j = 0; j < M; j++) precomp[j] = ((uint32_t) murmur64(j + seed) * N)>>32;
unified.start();
count = 0;
for(uint64_t j = 0; j < M; j++)
count += array[precomp[j]];
unified.end(results);
printf("cycles = %10.zu (cycles per key %10.3f) instructions = %10.zu (ins/key %10.3f,ins/cycles %10.3f) cache misses = %10.zu (misses per keys %10.3f) branch misses = %10.zu (misses per keys %10.3f) \n",
(size_t)results[0], results[0]*1.0/M, (size_t)results[1], results[1]*1.0/M , results[1]*1.0/results[0], (size_t)results[2], results[2]*1.0/M,
(size_t)results[3], results[3] * 1.0/M);
printf("%d \n", (uint32_t)count);
delete[] precomp;
delete[] array;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment