Khalefa/gaps.cpp

## gaps.cpp
//g++ -march=native --std=c++17 -O3 gaps.cpp -o gaps

//sysctl -a | grep machdep.cpu.features
//sysctl -n machdep.cpu.brand_string

//https://www.cs.virginia.edu/~cr4bd/3330/F2018/simdref.html
#include <unistd.h>

#include <chrono>
#include <cstddef>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <iomanip>
#include <iostream>
#include <new>
#include <vector>
#ifdef __linux__
#include <sys/mman.h>
#endif

#include <immintrin.h>

// Optimization barrier. Passes `p` as an input to an empty inline-asm
// statement that declares a "memory" clobber, so the compiler has to assume
// the pointed-to object may be read or written here and cannot drop the work
// that produced it. No instructions are emitted.
static void escape(void* p) {
  asm volatile(""          // empty template
               :           // no outputs
               : "g"(p)    // input: the pointer, as any general operand
               : "memory"  // clobber: memory may have been accessed
  );
}

// Byte-size units used when sizing buffers and reporting results.
constexpr std::size_t KB = std::size_t{1} << 10;
constexpr std::size_t MB = KB * KB;
// Stride used when touching a freshly allocated buffer. This is a fixed
// constant, not the value reported by the OS.
constexpr std::size_t page_size = 4 * KB;

// Clock and duration shorthands used by Timer.
using clk = std::chrono::high_resolution_clock;
using time_point = std::chrono::time_point<clk>;
using dur_double = std::chrono::duration<double>;  // seconds, as a double
using std::chrono::duration_cast;

// Scoped stopwatch for one benchmark run. Construction records the start time
// together with the run parameters; destruction prints one tab-separated
// result line to stdout:
//   n, MB read (block * n), block size, gap size, elapsed ms, GB/s.
// Throughput counts only the block bytes, not the skipped gaps.
class Timer {
 public:
  // n:     number of blocks visited
  // block: bytes read per block
  // gap:   bytes skipped after each block
  Timer(size_t n, size_t block, size_t gap)
      : _n{n}, _block{block}, _gap{gap}, _start{clk::now()} {}

  // A copy would report the same run a second time when it is destroyed.
  Timer(const Timer&) = delete;
  Timer& operator=(const Timer&) = delete;

  ~Timer() {
    const auto elapsed = clk::now() - _start;
    const auto elapsed_s = duration_cast<dur_double>(elapsed).count();
    const auto elapsed_ms = elapsed_s * 1000;
    const auto size = _block * _n;  // bytes actually read
    const auto mbs = size / MB;
    const auto gb_per_sec = size / (1024. * MB) / elapsed_s;
    // %zu is the conversion for size_t; %lu is only correct on platforms
    // where size_t happens to be unsigned long.
    std::printf("%zu\t %5zu MB\t%5zu\t%5zu\t%9.3f ms\t%7.2f GB/s\t\n", _n, mbs,
                _block, _gap, elapsed_ms, gb_per_sec);
  }

 private:
  size_t _n;
  size_t _block;
  size_t _gap;

  time_point _start;
};

// Allocates a buffer of at least `size` bytes, aligned to page_size, and
// writes one byte per page_size stride plus the last byte so the memory has
// been touched before any timed walk reads it.
//
// The alignment matters because walk_v2/walk_v3 view the buffer as
// __m256i/__m512i, which plain malloc is not guaranteed to align for.
//
// Returns the buffer; release it with std::free.
// Throws std::bad_alloc if the allocation fails or the size cannot be
// rounded up to whole pages.
char* malloc_(size_t size) {
  // Round up to whole pages (at least one): aligned_alloc expects the size to
  // be a multiple of the alignment, and this keeps size == 0 well-defined.
  const size_t pages = size / page_size + (size % page_size != 0 ? 1 : 0);
  const size_t bytes = (pages == 0 ? 1 : pages) * page_size;
  if (bytes < size) throw std::bad_alloc{};  // rounding overflowed size_t

  char* buf = static_cast<char*>(std::aligned_alloc(page_size, bytes));
  if (buf == nullptr) throw std::bad_alloc{};

  for (size_t i = 0; i < size; i += page_size) buf[i] = 0;
  if (size > 0) buf[size - 1] = 0;
  // Keep the compiler from treating the stores above as dead.
  escape(&buf);
  return buf;
}

// Scalar baseline. Reads `n` blocks of `block` bytes from `buf`, skipping
// `gap` bytes after each block, and sums every byte read. The sum is handed to
// escape() so the reads are not optimized away. A Timer created at the top of
// the function prints the result line when the function returns.
//
// The caller must make sure `buf` holds at least n * (block + gap) bytes.
void walk(const char* buf, size_t n, size_t block, size_t gap) {
  Timer t{n, block, gap};

  size_t sum = 0;
  size_t indx = 0;

  for (size_t i = 0; i < n; i++) {
    // size_t counter: an int would be compared against the unsigned `block`
    // and overflow for blocks larger than INT_MAX.
    for (size_t j = 0; j < block; j++, indx++) sum += buf[indx];
    indx += gap;
  }

  escape(&sum);
}

// 256-bit SIMD variant of walk. Views `buf` as an array of __m256i and, for
// each of the `n` blocks, adds block / 32 vectors into an accumulator (as
// packed 32-bit integers), then skips gap / 32 vectors. The accumulator is
// handed to escape() so the loads are not optimized away. A Timer created at
// the top of the function prints the result line on return.
//
// Only whole 32-byte vectors are processed, so `block` and `gap` should be
// multiples of 32 for the Timer's byte count to match what was actually read.
// The caller must make sure `buf` holds at least n * (block + gap) bytes.
void walk_v2(const char* buf, size_t n, size_t block, size_t gap) {
  Timer t{n, block, gap};

  const auto* array = reinterpret_cast<const __m256i*>(buf);
  const size_t vecs_per_block = block / sizeof(__m256i);
  const size_t vecs_per_gap = gap / sizeof(__m256i);

  __m256i accum = _mm256_setzero_si256();
  size_t indx = 0;

  for (size_t i = 0; i < n; i++) {
    for (size_t j = 0; j < vecs_per_block; j++, indx++) {
      // http://msdn.microsoft.com/en-us/library/cyxt4d09(v=vs.71).aspx
      // http://goo.gl/P6wI4
      // https://lwn.net/Articles/444336/
      //
      // We use PREFETCHNTA as instructed by the Intel Optimization Manual for
      // when the algorithm is single pass (Page 7-2 of http://goo.gl/M3Vaq).
      // Really though, since we access the data linearly, the hardware
      // prefetcher ought to be good enough.
      _mm_prefetch(reinterpret_cast<const char*>(array + indx + 2),
                   _MM_HINT_NTA);
      // Unaligned load: correct for any buffer alignment, whereas
      // dereferencing a __m256i* requires 32-byte alignment.
      accum = _mm256_add_epi32(accum, _mm256_loadu_si256(array + indx));
    }

    indx += vecs_per_gap;
  }

  escape(&accum);
}
// AVX-512 variant of walk. Compiled only when the target supports AVX-512F,
// so the file still builds with -march=native on CPUs without it.
//
// Views `buf` as an array of __m512i and, for each of the `n` blocks, adds
// block / 64 vectors into an accumulator (as packed 32-bit integers), then
// skips gap / 64 vectors. The accumulator is handed to escape() so the loads
// are not optimized away. A Timer created at the top of the function prints
// the result line on return.
//
// Only whole 64-byte vectors are processed, so `block` and `gap` should be
// multiples of 64 for the Timer's byte count to match what was actually read.
// The caller must make sure `buf` holds at least n * (block + gap) bytes.
#if defined(__AVX512F__)
void walk_v3(const char* buf, size_t n, size_t block, size_t gap) {
  Timer t{n, block, gap};

  const auto* array = reinterpret_cast<const __m512i*>(buf);
  const size_t vecs_per_block = block / sizeof(__m512i);
  const size_t vecs_per_gap = gap / sizeof(__m512i);

  __m512i accum = _mm512_setzero_si512();
  size_t indx = 0;

  for (size_t i = 0; i < n; i++) {
    for (size_t j = 0; j < vecs_per_block; j++, indx++) {
      // http://msdn.microsoft.com/en-us/library/cyxt4d09(v=vs.71).aspx
      // http://goo.gl/P6wI4
      // https://lwn.net/Articles/444336/
      //
      // We use PREFETCHNTA as instructed by the Intel Optimization Manual for
      // when the algorithm is single pass (Page 7-2 of http://goo.gl/M3Vaq).
      // Really though, since we access the data linearly, the hardware
      // prefetcher ought to be good enough.
      _mm_prefetch(reinterpret_cast<const char*>(array + indx + 2),
                   _MM_HINT_NTA);
      // Unaligned load: correct for any buffer alignment, whereas
      // dereferencing a __m512i* requires 64-byte alignment.
      accum = _mm512_add_epi32(accum, _mm512_loadu_si512(array + indx));
    }

    indx += vecs_per_gap;
  }

  escape(&accum);
}
#endif  // __AVX512F__
#if 0
// Disabled driver (compiled out by the surrounding #if 0). For every gap in
// the list it runs the scalar walk over block sizes from 256 KB to 1024 MB,
// doubling each time, then prints the page size reported by sysconf.
int main() {
  const size_t total_bytes = 2024 * MB;
  const std::vector<int> gap_sizes{0, 1, 2, 4, 8, 16, 32, 64, 128, 256};
  char* const buffer = malloc_(total_bytes);

  for (const size_t gap_bytes : gap_sizes) {
    size_t block_bytes = 256 * KB;
    while (block_bytes <= 1024 * MB) {
      // Number of block+gap strides that fit in the buffer.
      const int strides = total_bytes / (block_bytes + gap_bytes);
      walk(buffer, strides, block_bytes, gap_bytes);
      block_bytes *= 2;
    }
  }

  const long os_page_size = sysconf(_SC_PAGESIZE);
  std::cout << os_page_size << "\n";
}
#endif

// WALK(bs, g): one walk_v2 pass with block size `bs` and gap `g` (bytes),
// visiting as many block+gap strides as fit in `size` bytes. Expects `buf`
// and `size` to be in scope where it is used. Expands to a single expression
// with no trailing semicolon; arguments are parenthesized so any expression
// can be passed.
#define WALK(bs, g) walk_v2(buf, size / ((bs) + (g)), (bs), (g))

// WALK_gap(g): runs WALK over the block-size sweep below with gap `g`.
// Wrapped in do/while(0) so it behaves as one statement, e.g. under an `if`.
// NOTE(review): 32 KB and 32 MB are not part of the sweep - confirm whether
// that is intentional.
#define WALK_gap(g)    \
  do {                 \
    WALK(4 * KB, g);   \
    WALK(8 * KB, g);   \
    WALK(16 * KB, g);  \
    WALK(64 * KB, g);  \
    WALK(128 * KB, g); \
    WALK(256 * KB, g); \
    WALK(512 * KB, g); \
    WALK(MB, g);       \
    WALK(2 * MB, g);   \
    WALK(4 * MB, g);   \
    WALK(8 * MB, g);   \
    WALK(16 * MB, g);  \
    WALK(64 * MB, g);  \
  } while (0)

#if 1
// Benchmark driver: allocates a 2 GB buffer, prints its address, then runs
// the WALK_gap block-size sweep for gaps of 0, 256, 4096 and 65536 bytes.
// Each walk prints its own result line.
int main() {
  const size_t size = 2 * 1024 * MB;

  char* buf = malloc_(size);

  // %p expects a void*, so convert explicitly.
  std::printf("%p\n", static_cast<void*>(buf));

  WALK_gap(0);

  WALK_gap(256);

  WALK_gap(256 * 16);

  WALK_gap(256 * 16 * 16);

  std::free(buf);
  return 0;
}
#endif
	//g++ -march=native --std=c++17 -O3 gaps.cpp -o gaps

	//sysctl -a | grep machdep.cpu.features
	//sysctl -n machdep.cpu.brand_string

	//https://www.cs.virginia.edu/~cr4bd/3330/F2018/simdref.html
	#include <unistd.h>

	#include <chrono>
	#include <cstdlib>
	#include <cstring>
	#include <iomanip>
	#include <iostream>
	#include <vector>
	#ifdef __linux__
	#include <sys/mman.h>
	#endif

	#include <immintrin.h>

	// Optimization barrier. Passes `p` as an input to an empty inline-asm
	// statement that declares a "memory" clobber, so the compiler has to assume
	// the pointed-to object may be read or written here and cannot drop the work
	// that produced it. No instructions are emitted.
	static void escape(void* p) {
	  asm volatile(""          // empty template
	               :           // no outputs
	               : "g"(p)    // input: the pointer, as any general operand
	               : "memory"  // clobber: memory may have been accessed
	  );
	}

	// Byte-size units used when sizing buffers and reporting results.
	constexpr std::size_t KB = std::size_t{1} << 10;
	constexpr std::size_t MB = KB * KB;
	// Stride used when touching a freshly allocated buffer. This is a fixed
	// constant, not the value reported by the OS.
	constexpr std::size_t page_size = 4 * KB;

	// Clock and duration shorthands used by Timer.
	using clk = std::chrono::high_resolution_clock;
	using time_point = std::chrono::time_point<clk>;
	using dur_double = std::chrono::duration<double>;  // seconds, as a double
	using std::chrono::duration_cast;

	// Scoped stopwatch for one benchmark run. Construction records the start time
	// together with the run parameters; destruction prints one tab-separated
	// result line to stdout:
	//   n, MB read (block * n), block size, gap size, elapsed ms, GB/s.
	// Throughput counts only the block bytes, not the skipped gaps.
	class Timer {
	 public:
	  // n:     number of blocks visited
	  // block: bytes read per block
	  // gap:   bytes skipped after each block
	  Timer(size_t n, size_t block, size_t gap)
	      : _n{n}, _block{block}, _gap{gap}, _start{clk::now()} {}

	  // A copy would report the same run a second time when it is destroyed.
	  Timer(const Timer&) = delete;
	  Timer& operator=(const Timer&) = delete;

	  ~Timer() {
	    const auto elapsed = clk::now() - _start;
	    const auto elapsed_s = duration_cast<dur_double>(elapsed).count();
	    const auto elapsed_ms = elapsed_s * 1000;
	    const auto size = _block * _n;  // bytes actually read
	    const auto mbs = size / MB;
	    const auto gb_per_sec = size / (1024. * MB) / elapsed_s;
	    // %zu is the conversion for size_t; %lu is only correct on platforms
	    // where size_t happens to be unsigned long.
	    std::printf("%zu\t %5zu MB\t%5zu\t%5zu\t%9.3f ms\t%7.2f GB/s\t\n", _n, mbs,
	                _block, _gap, elapsed_ms, gb_per_sec);
	  }

	 private:
	  size_t _n;
	  size_t _block;
	  size_t _gap;

	  time_point _start;
	};

	// Allocates a buffer of at least `size` bytes, aligned to page_size, and
	// writes one byte per page_size stride plus the last byte so the memory has
	// been touched before any timed walk reads it.
	//
	// The alignment matters because walk_v2/walk_v3 view the buffer as
	// __m256i/__m512i, which plain malloc is not guaranteed to align for.
	//
	// Returns the buffer; release it with std::free.
	// Throws std::bad_alloc if the allocation fails or the size cannot be
	// rounded up to whole pages.
	char* malloc_(size_t size) {
	  // Round up to whole pages (at least one): aligned_alloc expects the size to
	  // be a multiple of the alignment, and this keeps size == 0 well-defined.
	  const size_t pages = size / page_size + (size % page_size != 0 ? 1 : 0);
	  const size_t bytes = (pages == 0 ? 1 : pages) * page_size;
	  if (bytes < size) throw std::bad_alloc{};  // rounding overflowed size_t

	  char* buf = static_cast<char*>(std::aligned_alloc(page_size, bytes));
	  if (buf == nullptr) throw std::bad_alloc{};

	  for (size_t i = 0; i < size; i += page_size) buf[i] = 0;
	  if (size > 0) buf[size - 1] = 0;
	  // Keep the compiler from treating the stores above as dead.
	  escape(&buf);
	  return buf;
	}

	// Scalar baseline. Reads `n` blocks of `block` bytes from `buf`, skipping
	// `gap` bytes after each block, and sums every byte read. The sum is handed to
	// escape() so the reads are not optimized away. A Timer created at the top of
	// the function prints the result line when the function returns.
	//
	// The caller must make sure `buf` holds at least n * (block + gap) bytes.
	void walk(const char* buf, size_t n, size_t block, size_t gap) {
	  Timer t{n, block, gap};

	  size_t sum = 0;
	  size_t indx = 0;

	  for (size_t i = 0; i < n; i++) {
	    // size_t counter: an int would be compared against the unsigned `block`
	    // and overflow for blocks larger than INT_MAX.
	    for (size_t j = 0; j < block; j++, indx++) sum += buf[indx];
	    indx += gap;
	  }

	  escape(&sum);
	}

	// 256-bit SIMD variant of walk. Views `buf` as an array of __m256i and, for
	// each of the `n` blocks, adds block / 32 vectors into an accumulator (as
	// packed 32-bit integers), then skips gap / 32 vectors. The accumulator is
	// handed to escape() so the loads are not optimized away. A Timer created at
	// the top of the function prints the result line on return.
	//
	// Only whole 32-byte vectors are processed, so `block` and `gap` should be
	// multiples of 32 for the Timer's byte count to match what was actually read.
	// The caller must make sure `buf` holds at least n * (block + gap) bytes.
	void walk_v2(const char* buf, size_t n, size_t block, size_t gap) {
	  Timer t{n, block, gap};

	  const auto* array = reinterpret_cast<const __m256i*>(buf);
	  const size_t vecs_per_block = block / sizeof(__m256i);
	  const size_t vecs_per_gap = gap / sizeof(__m256i);

	  __m256i accum = _mm256_setzero_si256();
	  size_t indx = 0;

	  for (size_t i = 0; i < n; i++) {
	    for (size_t j = 0; j < vecs_per_block; j++, indx++) {
	      // http://msdn.microsoft.com/en-us/library/cyxt4d09(v=vs.71).aspx
	      // http://goo.gl/P6wI4
	      // https://lwn.net/Articles/444336/
	      //
	      // We use PREFETCHNTA as instructed by the Intel Optimization Manual for
	      // when the algorithm is single pass (Page 7-2 of http://goo.gl/M3Vaq).
	      // Really though, since we access the data linearly, the hardware
	      // prefetcher ought to be good enough.
	      _mm_prefetch(reinterpret_cast<const char*>(array + indx + 2),
	                   _MM_HINT_NTA);
	      // Unaligned load: correct for any buffer alignment, whereas
	      // dereferencing a __m256i* requires 32-byte alignment.
	      accum = _mm256_add_epi32(accum, _mm256_loadu_si256(array + indx));
	    }

	    indx += vecs_per_gap;
	  }

	  escape(&accum);
	}
	// AVX-512 variant of walk. Compiled only when the target supports AVX-512F,
	// so the file still builds with -march=native on CPUs without it.
	//
	// Views `buf` as an array of __m512i and, for each of the `n` blocks, adds
	// block / 64 vectors into an accumulator (as packed 32-bit integers), then
	// skips gap / 64 vectors. The accumulator is handed to escape() so the loads
	// are not optimized away. A Timer created at the top of the function prints
	// the result line on return.
	//
	// Only whole 64-byte vectors are processed, so `block` and `gap` should be
	// multiples of 64 for the Timer's byte count to match what was actually read.
	// The caller must make sure `buf` holds at least n * (block + gap) bytes.
	#if defined(__AVX512F__)
	void walk_v3(const char* buf, size_t n, size_t block, size_t gap) {
	  Timer t{n, block, gap};

	  const auto* array = reinterpret_cast<const __m512i*>(buf);
	  const size_t vecs_per_block = block / sizeof(__m512i);
	  const size_t vecs_per_gap = gap / sizeof(__m512i);

	  __m512i accum = _mm512_setzero_si512();
	  size_t indx = 0;

	  for (size_t i = 0; i < n; i++) {
	    for (size_t j = 0; j < vecs_per_block; j++, indx++) {
	      // http://msdn.microsoft.com/en-us/library/cyxt4d09(v=vs.71).aspx
	      // http://goo.gl/P6wI4
	      // https://lwn.net/Articles/444336/
	      //
	      // We use PREFETCHNTA as instructed by the Intel Optimization Manual for
	      // when the algorithm is single pass (Page 7-2 of http://goo.gl/M3Vaq).
	      // Really though, since we access the data linearly, the hardware
	      // prefetcher ought to be good enough.
	      _mm_prefetch(reinterpret_cast<const char*>(array + indx + 2),
	                   _MM_HINT_NTA);
	      // Unaligned load: correct for any buffer alignment, whereas
	      // dereferencing a __m512i* requires 64-byte alignment.
	      accum = _mm512_add_epi32(accum, _mm512_loadu_si512(array + indx));
	    }

	    indx += vecs_per_gap;
	  }

	  escape(&accum);
	}
	#endif  // __AVX512F__
	#if 0
	// Disabled driver (compiled out by the surrounding #if 0). For every gap in
	// the list it runs the scalar walk over block sizes from 256 KB to 1024 MB,
	// doubling each time, then prints the page size reported by sysconf.
	int main() {
	  const size_t total_bytes = 2024 * MB;
	  const std::vector<int> gap_sizes{0, 1, 2, 4, 8, 16, 32, 64, 128, 256};
	  char* const buffer = malloc_(total_bytes);

	  for (const size_t gap_bytes : gap_sizes) {
	    size_t block_bytes = 256 * KB;
	    while (block_bytes <= 1024 * MB) {
	      // Number of block+gap strides that fit in the buffer.
	      const int strides = total_bytes / (block_bytes + gap_bytes);
	      walk(buffer, strides, block_bytes, gap_bytes);
	      block_bytes *= 2;
	    }
	  }

	  const long os_page_size = sysconf(_SC_PAGESIZE);
	  std::cout << os_page_size << "\n";
	}
	#endif

	// WALK(bs, g): one walk_v2 pass with block size `bs` and gap `g` (bytes),
	// visiting as many block+gap strides as fit in `size` bytes. Expects `buf`
	// and `size` to be in scope where it is used. Expands to a single expression
	// with no trailing semicolon; arguments are parenthesized so any expression
	// can be passed.
	#define WALK(bs, g) walk_v2(buf, size / ((bs) + (g)), (bs), (g))

	// WALK_gap(g): runs WALK over the block-size sweep below with gap `g`.
	// Wrapped in do/while(0) so it behaves as one statement, e.g. under an `if`.
	// NOTE(review): 32 KB and 32 MB are not part of the sweep - confirm whether
	// that is intentional.
	#define WALK_gap(g)    \
	  do {                 \
	    WALK(4 * KB, g);   \
	    WALK(8 * KB, g);   \
	    WALK(16 * KB, g);  \
	    WALK(64 * KB, g);  \
	    WALK(128 * KB, g); \
	    WALK(256 * KB, g); \
	    WALK(512 * KB, g); \
	    WALK(MB, g);       \
	    WALK(2 * MB, g);   \
	    WALK(4 * MB, g);   \
	    WALK(8 * MB, g);   \
	    WALK(16 * MB, g);  \
	    WALK(64 * MB, g);  \
	  } while (0)

	#if 1
	// Benchmark driver: allocates a 2 GB buffer, prints its address, then runs
	// the WALK_gap block-size sweep for gaps of 0, 256, 4096 and 65536 bytes.
	// Each walk prints its own result line.
	int main() {
	  const size_t size = 2 * 1024 * MB;

	  char* buf = malloc_(size);

	  // %p expects a void*, so convert explicitly.
	  std::printf("%p\n", static_cast<void*>(buf));

	  WALK_gap(0);

	  WALK_gap(256);

	  WALK_gap(256 * 16);

	  WALK_gap(256 * 16 * 16);

	  std::free(buf);
	  return 0;
	}
	#endif