//g++ -march=native --std=c++17 -O3 gaps.cpp -o gaps
//sysctl -a | grep machdep.cpu.features
//sysctl -n machdep.cpu.brand_string
//https://www.cs.virginia.edu/~cr4bd/3330/F2018/simdref.html
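//
// Memory-bandwidth micro-benchmark: reads a ~2 GB buffer as n blocks of
// `block` bytes, skipping `gap` bytes between blocks, and reports the
// effective read bandwidth in GB/s. walk() is the scalar version; walk_v2()
// and walk_v3() do the same traversal with 256-bit (AVX2) and 512-bit
// (AVX-512) loads respectively.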
#include <unistd.h>
#include <chrono>
#include <cstdlib>
#include <cstring>
#include <iomanip>
#include <iostream>
#include <vector>
#ifdef __linux__
#include <sys/mman.h>
#endif
#include <immintrin.h>
// Optimization barrier: tells the compiler that memory reachable through p may
// be read or modified, so the benchmarked loops are not optimized away.
static void escape(void* p) { asm volatile("" : : "g"(p) : "memory"); }
constexpr std::size_t KB = 1024;
constexpr std::size_t MB = 1024 * 1024;
constexpr std::size_t page_size = 4096;
using clk = std::chrono::high_resolution_clock;
using time_point = std::chrono::time_point<clk>;
using dur_double = std::chrono::duration<double>;
using std::chrono::duration_cast;
// RAII timer: starts on construction, prints one result row on destruction.
class Timer {
 public:
  Timer(size_t n, size_t block, size_t gap)
      : _n{n}, _block{block}, _gap{gap}, _start{clk::now()} {}
  ~Timer() {
    auto duration = clk::now() - _start;
    auto elapsed_s = duration_cast<dur_double>(duration).count();
    auto elapsed_ms = elapsed_s * 1000;
    auto size = _block * _n;
    auto mbs = size / MB;
    auto gb_per_sec = size / (1024. * MB) / elapsed_s;
    // Columns: n, total MB read, block size, gap size, elapsed time, bandwidth.
    std::printf("%lu\t %5lu MB\t%5lu\t%5lu\t%9.3f ms\t%7.2f GB/s\t\n", _n, mbs,
                _block, _gap, elapsed_ms, gb_per_sec);
  }

 private:
  size_t _n;
  size_t _block;
  size_t _gap;
  time_point _start;
};
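// Usage note: each walk function constructs a Timer at the top of its scope,
// and the row for that configuration is printed when the Timer is destroyed
// at scope exit, e.g.
//   { auto t = Timer{n, block, gap}; /* ... traverse the buffer ... */ }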
// Allocates `size` bytes and touches one byte per page so the pages are
// faulted in before any timing starts.
char* malloc_(size_t size) {
  char* buf = (char*)malloc(size * sizeof(char));
  for (size_t i = 0; i < size; i += page_size) buf[i] = 0;
  buf[size - 1] = 0;
  escape(&buf);
  return buf;
}
// Scalar walk: read n blocks of `block` bytes each, skipping `gap` bytes
// between consecutive blocks.
void walk(const char* buf, size_t n, size_t block, size_t gap) {
  auto t = Timer{n, block, gap};
  size_t sum = 0;
  size_t indx = 0;
  for (size_t i = 0; i < n; i++) {
    for (size_t j = 0; j < block; j++, indx++) sum += buf[indx];
    indx += gap;
  }
  escape(&sum);
}
// AVX2 walk: same traversal using 256-bit loads. `block` and `gap` are
// assumed to be multiples of sizeof(__m256i) (32 bytes).
void walk_v2(const char* buf, size_t n, size_t block, size_t gap) {
  auto t = Timer{n, block, gap};
  __m256i* array = (__m256i*)buf;
  __m256i accum = _mm256_setzero_si256();
  size_t indx = 0;
  for (size_t i = 0; i < n; i++) {
    for (size_t j = 0; j < block / sizeof(__m256i); j++, indx++) {
      // http://msdn.microsoft.com/en-us/library/cyxt4d09(v=vs.71).aspx
      // http://goo.gl/P6wI4
      // https://lwn.net/Articles/444336/
      //
      // We use PREFETCHNTA as instructed by the Intel Optimization Manual for
      // when the algorithm is single pass (Page 7-2 of http://goo.gl/M3Vaq).
      // Really though, since we access the data linearly, the hardware
      // prefetcher ought to be good enough.
      _mm_prefetch((const char*)&array[indx + 2], _MM_HINT_NTA);
      accum = _mm256_add_epi32(accum, array[indx]);
    }
    indx += gap / sizeof(__m256i);
  }
  escape(&accum);
}
// AVX-512 walk: same traversal using 512-bit loads. Requires AVX-512F
// (compile with -march=native on a CPU that supports it); not called from
// either main() below.
void walk_v3(const char* buf, size_t n, size_t block, size_t gap) {
  auto t = Timer{n, block, gap};
  __m512i* array = (__m512i*)buf;
  __m512i accum = _mm512_setzero_si512();
  size_t indx = 0;
  for (size_t i = 0; i < n; i++) {
    for (size_t j = 0; j < block / sizeof(__m512i); j++, indx++) {
      // Same PREFETCHNTA rationale as in walk_v2 above.
      _mm_prefetch((const char*)&array[indx + 2], _MM_HINT_NTA);
      accum = _mm512_add_epi32(accum, array[indx]);
    }
    indx += gap / sizeof(__m512i);
  }
  escape(&accum);
}
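// Sketch (not part of the original gist): if walk_v3 were wired into main(),
// the AVX-512 path could be guarded at runtime with GCC/Clang's
// __builtin_cpu_supports so the binary still runs on CPUs without AVX-512F:
//
//   if (__builtin_cpu_supports("avx512f"))
//     walk_v3(buf, n, block, gap);
//   else
//     walk_v2(buf, n, block, gap);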
#if 0
int main() {
  std::vector<int> gaps{0, 1, 2, 4, 8, 16, 32, 64, 128, 256};
  char* buf = malloc_(2024 * MB);
  for (size_t gap : gaps) {
    for (size_t blocksize = 256 * KB; blocksize <= 1024 * MB; blocksize *= 2) {
      int n = 2024 * MB / (blocksize + gap);
      walk(buf, n, blocksize, gap);
    }
  }
  long sz = sysconf(_SC_PAGESIZE);
  std::cout << sz << "\n";
}
#endif
// WALK runs walk_v2 once for a given (block size, gap), choosing n so that
// roughly the whole buffer is traversed. WALK_gap sweeps block sizes from
// 4 KB to 64 MB for a fixed gap.
#define WALK(bs, g) walk_v2(buf, size / (bs + g), bs, g);
#define WALK_gap(g) \
WALK(4 * KB, g); \
WALK(8 * KB, g); \
WALK(16 * KB, g); \
WALK(64 * KB, g); \
WALK(128 * KB, g); \
WALK(256 * KB, g); \
WALK(512 * KB, g); \
WALK(MB, g); \
WALK(2 * MB, g); \
WALK(4 * MB, g); \
WALK(8 * MB, g); \
WALK(16 * MB, g); \
WALK(64 * MB, g);
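// For example, WALK(4 * KB, 256) expands to
//   walk_v2(buf, size / (4 * KB + 256), 4 * KB, 256);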
#if 1
int main() {
  size_t size = 2 * 1024 * MB;
  char* buf = malloc_(size);
  // void * aligned_alloc (size_t alignment, size_t size)
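  // Sketch (not in the original): malloc gives no 32/64-byte alignment
  // guarantee, so an aligned buffer could instead be obtained with C11/C++17
  // aligned_alloc; the size must be a multiple of the alignment:
  //   char* abuf = (char*)std::aligned_alloc(64, size);
  //   ...
  //   std::free(abuf);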
  std::printf("%p\n", buf);
  WALK_gap(0);
  WALK_gap(256);
  WALK_gap(256 * 16);
  WALK_gap(256 * 16 * 16);
}
#endif