lemire/cachetest.cpp

## cachetest.cpp
//
// $ g++ -O3 -o cachetest cachetest.cpp
// $ ./cachetest
// cycles =  349189333 (cycles per key     34.919) instructions =  190000063 (ins/key     19.000,ins/cycles      0.544) cache misses =   17385995 (misses per keys      1.739) branch misses =          6 (misses per keys      0.000)
//
// start helper header
// https://github.com/WojciechMula/toys/blob/master/000helpers/linux-perf-events.h
#ifdef __linux__

#include <asm/unistd.h>       // for __NR_perf_event_open
#include <linux/perf_event.h> // for perf event constants
#include <sys/ioctl.h>        // for ioctl
#include <unistd.h>           // for syscall
#include <stdint.h>
#include <stdlib.h>
#include <cerrno>  // for errno
#include <cstring> // for memset
#include <stdexcept>

#include <vector>

template <int TYPE = PERF_TYPE_HARDWARE> class LinuxEvents {
  int fd;
  perf_event_attr attribs;
  int num_events;
  std::vector<uint64_t> temp_result_vec;
  std::vector<uint64_t> ids;

public:
  LinuxEvents(std::vector<int> config_vec) : fd(0) {
    memset(&attribs, 0, sizeof(attribs));
    attribs.type = TYPE;
    attribs.size = sizeof(attribs);
    attribs.disabled = 1;
    attribs.exclude_kernel = 1;
    attribs.exclude_hv = 1;

    attribs.sample_period = 0;
    attribs.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID;
    const int pid = 0;  // the current process
    const int cpu = -1; // all CPUs
    const unsigned long flags = 0;

    int group = -1; // no group
    num_events = config_vec.size();
    uint32_t i = 0;
    for (auto config : config_vec) {
      attribs.config = config;
      fd = syscall(__NR_perf_event_open, &attribs, pid, cpu, group, flags);
      if (fd == -1) {
        report_error("perf_event_open");
      }
      ioctl(fd, PERF_EVENT_IOC_ID, &ids[i++]);
      if (group == -1) {
        group = fd;
      }
    }

    temp_result_vec.resize(num_events * 2 + 1);
  }


  ~LinuxEvents() { close(fd); }

  inline void start() {
    if (ioctl(fd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP) == -1) {
      report_error("ioctl(PERF_EVENT_IOC_RESET)");
    }

    if (ioctl(fd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP) == -1) {
      report_error("ioctl(PERF_EVENT_IOC_ENABLE)");
    }
  }

  inline void end(std::vector<unsigned long long> &results) {
    if (ioctl(fd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP) == -1) {
      report_error("ioctl(PERF_EVENT_IOC_DISABLE)");
    }

    if (read(fd, &temp_result_vec[0], temp_result_vec.size() * 8) == -1) {
      report_error("read");
    }
    // our actual results are in slots 1,3,5, ... of this structure
    // we really should be checking our ids obtained earlier to be safe
    for (uint32_t i = 1; i < temp_result_vec.size(); i += 2) {
      results[i / 2] = temp_result_vec[i];
    }
  }

private:
  void report_error(const std::string &context) {
    throw std::runtime_error(context + ": " + std::string(strerror(errno)));
  }
};
#endif
// end of header


#include <stdint.h>
#include <stdio.h>
#include <iostream>
#include <vector>
using namespace std;

static uint64_t murmur64(uint64_t h) {
    h ^= h >> 33;
    h *= UINT64_C(0xff51afd7ed558ccd);
    h ^= h >> 33;
    h *= UINT64_C(0xc4ceb9fe1a85ec53);
    h ^= h >> 33;
    return h;
}


int main() {
  const size_t N = 100000000;
  uint64_t * array = new uint64_t[N];
  for(size_t i = 0; i < N; i++) array[i] = i;

  vector<int> evts;
  evts.push_back(PERF_COUNT_HW_CPU_CYCLES);
  evts.push_back(PERF_COUNT_HW_INSTRUCTIONS);
  evts.push_back(PERF_COUNT_HW_CACHE_MISSES);
  evts.push_back(PERF_COUNT_HW_BRANCH_MISSES);
  LinuxEvents<PERF_TYPE_HARDWARE> unified(evts);
  vector<unsigned long long> results;
  results.resize(evts.size());
  cout << endl;
  int seed = rand();

  unified.start();
  size_t M = 10000000;
  size_t count = 0;
  for(uint64_t j = 0; j < M; j++)
    count += array[((uint32_t) murmur64(j + seed) * N)>>32 ];
  unified.end(results);
  printf("cycles = %10.zu (cycles per key %10.3f) instructions = %10.zu (ins/key %10.3f,ins/cycles %10.3f) cache misses = %10.zu (misses per keys %10.3f) branch misses = %10.zu (misses per keys %10.3f) \n",
      (size_t)results[0], results[0]*1.0/M, (size_t)results[1], results[1]*1.0/M , results[1]*1.0/results[0], (size_t)results[2], results[2]*1.0/M,
      (size_t)results[3], results[3] * 1.0/M);
  printf("%d \n", (uint32_t)count);
  uint32_t * precomp = new uint32_t[M];
  for(uint64_t j = 0; j < M; j++) precomp[j] = ((uint32_t) murmur64(j + seed) * N)>>32;
  unified.start();

  count = 0;
  for(uint64_t j = 0; j < M; j++)
    count += array[precomp[j]];
  unified.end(results);

  printf("cycles = %10.zu (cycles per key %10.3f) instructions = %10.zu (ins/key %10.3f,ins/cycles %10.3f) cache misses = %10.zu (misses per keys %10.3f) branch misses = %10.zu (misses per keys %10.3f) \n",
      (size_t)results[0], results[0]*1.0/M, (size_t)results[1], results[1]*1.0/M , results[1]*1.0/results[0], (size_t)results[2], results[2]*1.0/M,
      (size_t)results[3], results[3] * 1.0/M);

  printf("%d \n", (uint32_t)count);
  delete[] precomp;
  delete[] array;
}
	//
	// $ g++ -O3 -o cachetest cachetest.cpp
	// $ ./cachetest
	// cycles = 349189333 (cycles per key 34.919) instructions = 190000063 (ins/key 19.000,ins/cycles 0.544) cache misses = 17385995 (misses per keys 1.739) branch misses = 6 (misses per keys 0.000)
	//
	// start helper header
	// https://github.com/WojciechMula/toys/blob/master/000helpers/linux-perf-events.h
	#ifdef __linux__

	#include <asm/unistd.h> // for __NR_perf_event_open
	#include <linux/perf_event.h> // for perf event constants
	#include <sys/ioctl.h> // for ioctl
	#include <unistd.h> // for syscall
	#include <stdint.h>
	#include <stdlib.h>
	#include <cerrno> // for errno
	#include <cstring> // for memset
	#include <stdexcept>

	#include <vector>

	template <int TYPE = PERF_TYPE_HARDWARE> class LinuxEvents {
	int fd;
	perf_event_attr attribs;
	int num_events;
	std::vector<uint64_t> temp_result_vec;
	std::vector<uint64_t> ids;

	public:
	LinuxEvents(std::vector<int> config_vec) : fd(0) {
	memset(&attribs, 0, sizeof(attribs));
	attribs.type = TYPE;
	attribs.size = sizeof(attribs);
	attribs.disabled = 1;
	attribs.exclude_kernel = 1;
	attribs.exclude_hv = 1;

	attribs.sample_period = 0;
	attribs.read_format = PERF_FORMAT_GROUP \| PERF_FORMAT_ID;
	const int pid = 0; // the current process
	const int cpu = -1; // all CPUs
	const unsigned long flags = 0;

	int group = -1; // no group
	num_events = config_vec.size();
	uint32_t i = 0;
	for (auto config : config_vec) {
	attribs.config = config;
	fd = syscall(__NR_perf_event_open, &attribs, pid, cpu, group, flags);
	if (fd == -1) {
	report_error("perf_event_open");
	}
	ioctl(fd, PERF_EVENT_IOC_ID, &ids[i++]);
	if (group == -1) {
	group = fd;
	}
	}

	temp_result_vec.resize(num_events * 2 + 1);
	}


	~LinuxEvents() { close(fd); }

	inline void start() {
	if (ioctl(fd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP) == -1) {
	report_error("ioctl(PERF_EVENT_IOC_RESET)");
	}

	if (ioctl(fd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP) == -1) {
	report_error("ioctl(PERF_EVENT_IOC_ENABLE)");
	}
	}

	inline void end(std::vector<unsigned long long> &results) {
	if (ioctl(fd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP) == -1) {
	report_error("ioctl(PERF_EVENT_IOC_DISABLE)");
	}

	if (read(fd, &temp_result_vec[0], temp_result_vec.size() * 8) == -1) {
	report_error("read");
	}
	// our actual results are in slots 1,3,5, ... of this structure
	// we really should be checking our ids obtained earlier to be safe
	for (uint32_t i = 1; i < temp_result_vec.size(); i += 2) {
	results[i / 2] = temp_result_vec[i];
	}
	}

	private:
	void report_error(const std::string &context) {
	throw std::runtime_error(context + ": " + std::string(strerror(errno)));
	}
	};
	#endif
	// end of header


	#include <stdint.h>
	#include <stdio.h>
	#include <iostream>
	#include <vector>
	using namespace std;

	static uint64_t murmur64(uint64_t h) {
	h ^= h >> 33;
	h *= UINT64_C(0xff51afd7ed558ccd);
	h ^= h >> 33;
	h *= UINT64_C(0xc4ceb9fe1a85ec53);
	h ^= h >> 33;
	return h;
	}


	int main() {
	const size_t N = 100000000;
	uint64_t * array = new uint64_t[N];
	for(size_t i = 0; i < N; i++) array[i] = i;

	vector<int> evts;
	evts.push_back(PERF_COUNT_HW_CPU_CYCLES);
	evts.push_back(PERF_COUNT_HW_INSTRUCTIONS);
	evts.push_back(PERF_COUNT_HW_CACHE_MISSES);
	evts.push_back(PERF_COUNT_HW_BRANCH_MISSES);
	LinuxEvents<PERF_TYPE_HARDWARE> unified(evts);
	vector<unsigned long long> results;
	results.resize(evts.size());
	cout << endl;
	int seed = rand();

	unified.start();
	size_t M = 10000000;
	size_t count = 0;
	for(uint64_t j = 0; j < M; j++)
	count += array[((uint32_t) murmur64(j + seed) * N)>>32 ];
	unified.end(results);
	printf("cycles = %10.zu (cycles per key %10.3f) instructions = %10.zu (ins/key %10.3f,ins/cycles %10.3f) cache misses = %10.zu (misses per keys %10.3f) branch misses = %10.zu (misses per keys %10.3f) \n",
	(size_t)results[0], results[0]1.0/M, (size_t)results[1], results[1]1.0/M , results[1]1.0/results[0], (size_t)results[2], results[2]1.0/M,
	(size_t)results[3], results[3] * 1.0/M);
	printf("%d \n", (uint32_t)count);
	uint32_t * precomp = new uint32_t[M];
	for(uint64_t j = 0; j < M; j++) precomp[j] = ((uint32_t) murmur64(j + seed) * N)>>32;
	unified.start();

	count = 0;
	for(uint64_t j = 0; j < M; j++)
	count += array[precomp[j]];
	unified.end(results);

	printf("cycles = %10.zu (cycles per key %10.3f) instructions = %10.zu (ins/key %10.3f,ins/cycles %10.3f) cache misses = %10.zu (misses per keys %10.3f) branch misses = %10.zu (misses per keys %10.3f) \n",
	(size_t)results[0], results[0]1.0/M, (size_t)results[1], results[1]1.0/M , results[1]1.0/results[0], (size_t)results[2], results[2]1.0/M,
	(size_t)results[3], results[3] * 1.0/M);

	printf("%d \n", (uint32_t)count);
	delete[] precomp;
	delete[] array;
	}