Last active
September 12, 2018 03:50
-
-
Save lemire/7838881cc01df5023bb55a2653f793bc to your computer and use it in GitHub Desktop.
If you have M array accesses, how many cache misses can you have? What about 1.8 M cache misses...?
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// | |
// $ g++ -O3 -o cachetest cachetest.cpp | |
// $ ./cachetest | |
// cycles = 349189333 (cycles per key 34.919) instructions = 190000063 (ins/key 19.000,ins/cycles 0.544) cache misses = 17385995 (misses per keys 1.739) branch misses = 6 (misses per keys 0.000) | |
// | |
// start helper header | |
// https://github.com/WojciechMula/toys/blob/master/000helpers/linux-perf-events.h | |
#ifdef __linux__ | |
#include <asm/unistd.h> // for __NR_perf_event_open | |
#include <linux/perf_event.h> // for perf event constants | |
#include <sys/ioctl.h> // for ioctl | |
#include <unistd.h> // for syscall | |
#include <stdint.h> | |
#include <stdlib.h> | |
#include <cerrno> // for errno | |
#include <cstring> // for memset | |
#include <stdexcept> | |
#include <vector> | |
template <int TYPE = PERF_TYPE_HARDWARE> class LinuxEvents { | |
int fd; | |
perf_event_attr attribs; | |
int num_events; | |
std::vector<uint64_t> temp_result_vec; | |
std::vector<uint64_t> ids; | |
public: | |
LinuxEvents(std::vector<int> config_vec) : fd(0) { | |
memset(&attribs, 0, sizeof(attribs)); | |
attribs.type = TYPE; | |
attribs.size = sizeof(attribs); | |
attribs.disabled = 1; | |
attribs.exclude_kernel = 1; | |
attribs.exclude_hv = 1; | |
attribs.sample_period = 0; | |
attribs.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID; | |
const int pid = 0; // the current process | |
const int cpu = -1; // all CPUs | |
const unsigned long flags = 0; | |
int group = -1; // no group | |
num_events = config_vec.size(); | |
uint32_t i = 0; | |
for (auto config : config_vec) { | |
attribs.config = config; | |
fd = syscall(__NR_perf_event_open, &attribs, pid, cpu, group, flags); | |
if (fd == -1) { | |
report_error("perf_event_open"); | |
} | |
ioctl(fd, PERF_EVENT_IOC_ID, &ids[i++]); | |
if (group == -1) { | |
group = fd; | |
} | |
} | |
temp_result_vec.resize(num_events * 2 + 1); | |
} | |
~LinuxEvents() { close(fd); } | |
inline void start() { | |
if (ioctl(fd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP) == -1) { | |
report_error("ioctl(PERF_EVENT_IOC_RESET)"); | |
} | |
if (ioctl(fd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP) == -1) { | |
report_error("ioctl(PERF_EVENT_IOC_ENABLE)"); | |
} | |
} | |
inline void end(std::vector<unsigned long long> &results) { | |
if (ioctl(fd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP) == -1) { | |
report_error("ioctl(PERF_EVENT_IOC_DISABLE)"); | |
} | |
if (read(fd, &temp_result_vec[0], temp_result_vec.size() * 8) == -1) { | |
report_error("read"); | |
} | |
// our actual results are in slots 1,3,5, ... of this structure | |
// we really should be checking our ids obtained earlier to be safe | |
for (uint32_t i = 1; i < temp_result_vec.size(); i += 2) { | |
results[i / 2] = temp_result_vec[i]; | |
} | |
} | |
private: | |
void report_error(const std::string &context) { | |
throw std::runtime_error(context + ": " + std::string(strerror(errno))); | |
} | |
}; | |
#endif | |
// end of header | |
#include <stdint.h> | |
#include <stdio.h> | |
#include <iostream> | |
#include <vector> | |
using namespace std; | |
static uint64_t murmur64(uint64_t h) { | |
h ^= h >> 33; | |
h *= UINT64_C(0xff51afd7ed558ccd); | |
h ^= h >> 33; | |
h *= UINT64_C(0xc4ceb9fe1a85ec53); | |
h ^= h >> 33; | |
return h; | |
} | |
int main() { | |
const size_t N = 100000000; | |
uint64_t * array = new uint64_t[N]; | |
for(size_t i = 0; i < N; i++) array[i] = i; | |
vector<int> evts; | |
evts.push_back(PERF_COUNT_HW_CPU_CYCLES); | |
evts.push_back(PERF_COUNT_HW_INSTRUCTIONS); | |
evts.push_back(PERF_COUNT_HW_CACHE_MISSES); | |
evts.push_back(PERF_COUNT_HW_BRANCH_MISSES); | |
LinuxEvents<PERF_TYPE_HARDWARE> unified(evts); | |
vector<unsigned long long> results; | |
results.resize(evts.size()); | |
cout << endl; | |
int seed = rand(); | |
unified.start(); | |
size_t M = 10000000; | |
size_t count = 0; | |
for(uint64_t j = 0; j < M; j++) | |
count += array[((uint32_t) murmur64(j + seed) * N)>>32 ]; | |
unified.end(results); | |
printf("cycles = %10.zu (cycles per key %10.3f) instructions = %10.zu (ins/key %10.3f,ins/cycles %10.3f) cache misses = %10.zu (misses per keys %10.3f) branch misses = %10.zu (misses per keys %10.3f) \n", | |
(size_t)results[0], results[0]*1.0/M, (size_t)results[1], results[1]*1.0/M , results[1]*1.0/results[0], (size_t)results[2], results[2]*1.0/M, | |
(size_t)results[3], results[3] * 1.0/M); | |
printf("%d \n", (uint32_t)count); | |
uint32_t * precomp = new uint32_t[M]; | |
for(uint64_t j = 0; j < M; j++) precomp[j] = ((uint32_t) murmur64(j + seed) * N)>>32; | |
unified.start(); | |
count = 0; | |
for(uint64_t j = 0; j < M; j++) | |
count += array[precomp[j]]; | |
unified.end(results); | |
printf("cycles = %10.zu (cycles per key %10.3f) instructions = %10.zu (ins/key %10.3f,ins/cycles %10.3f) cache misses = %10.zu (misses per keys %10.3f) branch misses = %10.zu (misses per keys %10.3f) \n", | |
(size_t)results[0], results[0]*1.0/M, (size_t)results[1], results[1]*1.0/M , results[1]*1.0/results[0], (size_t)results[2], results[2]*1.0/M, | |
(size_t)results[3], results[3] * 1.0/M); | |
printf("%d \n", (uint32_t)count); | |
delete[] precomp; | |
delete[] array; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment