Last active
May 17, 2020 12:27
-
-
Save Khalefa/4dca3b97d91a6b2671efcd16ef6231a2 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Build:                g++ -march=native --std=c++17 -O3 gaps.cpp -o gaps
// CPU features (macOS): sysctl -a | grep machdep.cpu.features
// CPU model (macOS):    sysctl -n machdep.cpu.brand_string
// SIMD intrinsics reference: https://www.cs.virginia.edu/~cr4bd/3330/F2018/simdref.html
#include <immintrin.h>
#include <unistd.h>
#ifdef __linux__
#include <sys/mman.h>
#endif

#include <chrono>
#include <cstddef>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <iomanip>
#include <iostream>
#include <vector>
// Optimization barrier. The empty asm statement takes `p` as an input and
// declares a "memory" clobber, so the compiler must assume the object that
// `p` points to is observed and cannot discard the computation that
// produced it as dead code.
static void escape(void* p) {
  asm volatile("" : : "g"(p) : "memory");
}
// Byte-size units used throughout the benchmark.
constexpr std::size_t KB = 1024;
constexpr std::size_t MB = 1024 * KB;
// Stride used when writing one byte per page of a fresh allocation.
constexpr std::size_t page_size = 4096;

// Clock and duration vocabulary used by Timer.
using clk = std::chrono::high_resolution_clock;
using time_point = std::chrono::time_point<clk>;
using dur_double = std::chrono::duration<double>;  // seconds as a double
using std::chrono::duration_cast;
class Timer { | |
public: | |
Timer(size_t n, size_t block, size_t gap) | |
: _n{n}, _block{block}, _gap{gap}, _start{clk::now()} {}; | |
~Timer() { | |
auto duration = clk::now() - _start; | |
auto elapsed_s = duration_cast<dur_double>(duration).count(); | |
auto elapsed_ms = elapsed_s * 1000; | |
// auto n = _total_size / (_block + _gap); | |
// std::printf("n %d", n); | |
auto size = _block * _n; | |
// auto pages = size / page_size; | |
auto mbs = size / MB; | |
auto gb_per_sec = size / (1024. * MB) / elapsed_s; | |
std::printf("%lu\t %5lu MB\t%5lu\t%5lu\t%9.3f ms\t%7.2f GB/s\t\n", _n, mbs, | |
_block, _gap, elapsed_ms, gb_per_sec); | |
; | |
}; | |
private: | |
size_t _n; | |
size_t _block; | |
size_t _gap; | |
time_point _start; | |
}; | |
char* malloc_(size_t size) { | |
char* buf; | |
{ | |
buf = (char*)malloc(size * sizeof(char)); | |
for (size_t i = 0; i < size; i += page_size) buf[i] = 0; | |
buf[size - 1] = 0; | |
escape(&buf); | |
} | |
return buf; | |
} | |
void walk(const char* buf, size_t n, size_t block, size_t gap) { | |
auto t = Timer{n, block, gap}; | |
size_t sum = 0; | |
size_t indx = 0; | |
for (size_t i = 0; i < n; i++) { | |
for (int j = 0; j < block; j++, indx++) sum += buf[indx]; | |
indx += gap; | |
} | |
escape(&sum); | |
} | |
void walk_v2(const char* buf, size_t n, size_t block, size_t gap) { | |
auto t = Timer{n, block, gap}; | |
__m256i* array = (__m256i*)buf; | |
int arrayB[8] = {0, 0, 0, 0, 0, 0, 0, 0}; | |
__m256i accum = _mm256_loadu_si256((__m256i*)arrayB); | |
size_t sum = 0; | |
size_t indx = 0; | |
for (size_t i = 0; i < n; i++) { | |
for (int j = 0; j < block / sizeof(__m256i); j++, indx++) { | |
// http://msdn.microsoft.com/en-us/library/cyxt4d09(v=vs.71).aspx | |
// http://goo.gl/P6wI4 | |
// https://lwn.net/Articles/444336/ | |
// | |
// We use PREFETCHNTA as instructed by the Intel Optimization Manual for | |
// when the algorithm is single pass (Page 7-2 of http://goo.gl/M3Vaq). | |
// Really though, since we access the data linearly, the hardware | |
// prefetcher ought to be good enough. | |
_mm_prefetch(&array[indx + 2], _MM_HINT_NTA); | |
accum = _mm256_add_epi32(accum, array[indx]); | |
} | |
indx += (gap / sizeof(__m256i)); | |
} | |
escape(&accum); | |
} | |
// AVX-512 variant of the walk. Only compiled when the target supports
// AVX-512F, so the rest of the file still builds with -march=native on CPUs
// that lack it.
#if defined(__AVX512F__)
// 512-bit SIMD walk: reads n blocks from buf as 64-byte vectors, skipping
// `gap` bytes after each block, and accumulates them with 32-bit lane adds.
// The Timer local prints the measurement row when the function returns.
//
// block and gap are given in bytes and are truncated to whole 64-byte vectors;
// Timer still reports n * block bytes, so pass multiples of 64 for accurate
// throughput numbers.
void walk_v3(const char* buf, size_t n, size_t block, size_t gap) {
  Timer t{n, block, gap};
  const __m512i* array = reinterpret_cast<const __m512i*>(buf);
  const size_t vecs_per_block = block / sizeof(__m512i);
  const size_t vecs_per_gap = gap / sizeof(__m512i);
  __m512i accum = _mm512_setzero_si512();
  size_t indx = 0;
  for (size_t i = 0; i < n; i++) {
    for (size_t j = 0; j < vecs_per_block; j++, indx++) {
      // http://msdn.microsoft.com/en-us/library/cyxt4d09(v=vs.71).aspx
      // http://goo.gl/P6wI4
      // https://lwn.net/Articles/444336/
      //
      // We use PREFETCHNTA as instructed by the Intel Optimization Manual for
      // when the algorithm is single pass (Page 7-2 of http://goo.gl/M3Vaq).
      // Really though, since we access the data linearly, the hardware
      // prefetcher ought to be good enough.
      _mm_prefetch(reinterpret_cast<const char*>(&array[indx + 2]),
                   _MM_HINT_NTA);
      // Unaligned load: buf carries no 64-byte alignment guarantee, and
      // dereferencing a __m512i* directly is an aligned access.
      accum = _mm512_add_epi32(accum, _mm512_loadu_si512(&array[indx]));
    }
    indx += vecs_per_gap;
  }
  // Make the accumulator observable so the loop is not optimized away.
  escape(&accum);
}
#endif  // __AVX512F__
#if 0
// Disabled alternative driver: sweeps the scalar walk() over power-of-two
// block sizes from 256 KB to 1024 MB for each gap, then prints the page size
// reported by sysconf.
int main() {
  const size_t total = 2024 * MB;
  const std::vector<size_t> gaps{0, 1, 2, 4, 8, 16, 32, 64, 128, 256};
  char* buf = malloc_(total);
  if (buf == nullptr) return EXIT_FAILURE;
  for (size_t gap : gaps) {
    for (size_t blocksize = 256 * KB; blocksize <= 1024 * MB; blocksize *= 2) {
      // Number of whole (block + gap) strides that fit in the buffer.
      const size_t n = total / (blocksize + gap);
      walk(buf, n, blocksize, gap);
      // std::cout << '\n';
    }
  }
  const long sz = sysconf(_SC_PAGESIZE);
  std::cout << sz << "\n";
  std::free(buf);
  return EXIT_SUCCESS;
}
#endif
// WALK(bs, g): one walk_v2 run with block size `bs` and gap `g` (both in
// bytes), using as many (bs + g) strides as fit in `size` bytes.
// WALK_gap(g): runs WALK for a fixed list of block sizes, 4 KB to 64 MB.
//
// Both macros expand in the caller's scope and need `buf` and `size` there.
// Arguments are parenthesized (so WALK(1 << 12, 4) divides by 4100, not by
// 1 << 16) and evaluated more than once; each macro is a single statement
// that is safe under an unbraced if/else.
// NOTE(review): 32 KB and 32 MB are absent from the sweep - confirm whether
// that is intentional.
#define WALK(bs, g)                                \
  do {                                             \
    walk_v2(buf, size / ((bs) + (g)), (bs), (g));  \
  } while (0)
#define WALK_gap(g)     \
  do {                  \
    WALK(4 * KB, g);    \
    WALK(8 * KB, g);    \
    WALK(16 * KB, g);   \
    WALK(64 * KB, g);   \
    WALK(128 * KB, g);  \
    WALK(256 * KB, g);  \
    WALK(512 * KB, g);  \
    WALK(MB, g);        \
    WALK(2 * MB, g);    \
    WALK(4 * MB, g);    \
    WALK(8 * MB, g);    \
    WALK(16 * MB, g);   \
    WALK(64 * MB, g);   \
  } while (0)
#if 1 | |
int main() { | |
size_t size = 2 * 1024 * MB; | |
char* buf = malloc_(size); | |
// void * aligned_alloc (size_t alignment, size_t size) | |
std::printf("%p\n", buf); | |
int gap = 0; | |
WALK_gap(0); | |
WALK_gap(256); | |
WALK_gap(256 * 16); | |
WALK_gap(256 * 16 * 16); | |
// long sz = sysconf(_SC_PAGESIZE); | |
// std::cout << sz << "\n"; | |
} | |
#endif |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment