Skip to content

Instantly share code, notes, and snippets.

@Keyframe
Created November 13, 2015 01:35
Show Gist options
  • Save Keyframe/1ed9062ec52fc4a0d14b to your computer and use it in GitHub Desktop.
Save Keyframe/1ed9062ec52fc4a0d14b to your computer and use it in GitHub Desktop.
Memory bandwidth, stupid
/*
gcc -pedantic -std=c99 -Wall -Werror -Wextra -Wno-unused -I "[glfw_include_path]" -L "[glfw_lib_path]" arr_test_simd_nobuf.c -o arr_test_simd_nobuf -lglfw3 -framework OpenGL -framework Cocoa -framework IOKit -framework CoreVideo
If you want AVX on OS X, build with clang instead (gcc's -mavx does not work reliably with the as/Homebrew toolchain combination), using this command:
clang -pedantic -std=c99 -Wall -Werror -Wextra -Wno-unused -I "[glfw_include_path]" -L "[glfw_lib_path]" arr_test_simd_nobuf.c -o arr_test_simd_nobuf -lglfw3 -framework OpenGL -framework Cocoa -framework IOKit -framework CoreVideo -mavx
optional: -fprefetch-loop-arrays
*/
#pragma clang diagnostic ignored "-Wunused-parameter"
#include <stdio.h>
#include <stdlib.h>
#include <x86intrin.h> /* http://stackoverflow.com/questions/18314523/sse-copy-avx-copy-and-stdcopy-performance */
#include <string.h> /* memcpy */
#include <inttypes.h>
#include <GLFW/glfw3.h>
/* Number of uint64_t elements per array: 128,000,000 elements * 8 bytes = 1024 MB (about one gigabyte).
   The expansion is parenthesized so it stays a single operand inside larger expressions
   such as `x / ARRAY_NUM` or `ARRAY_NUM % n`. */
#define ARRAY_NUM (1000000 * 128)
/* Number of uint64_t elements touched by one iteration of the hand-unrolled kernels. */
enum { UNROLL = 40 };
/* Number of timed passes recorded for every read benchmark. */
enum { SAMPLE_COUNT = 1000 };
/* Buffer alignment in bytes; the aligned AVX load/store intrinsics used below need 32. */
enum { SIMD_ALIGNMENT = 32 };

/* Signature shared by all copy kernels: copy `count` elements from src to dst. */
typedef void (*copy_kernel)(uint64_t *dst, const uint64_t *src, size_t count);
/* Signature shared by all read kernels: load `count` elements from src and discard them. */
typedef void (*read_kernel)(const uint64_t *src, size_t count);

/* Builds "<megabytes>_<suffix>.dat", opens it for writing and returns the stream.
   Exits the process if the name does not fit or the file cannot be opened. */
static FILE *open_data_file(size_t megabytes, const char *suffix)
{
    char name[50];
    const int written = snprintf(name, sizeof name, "%zu_%s.dat", megabytes, suffix);
    /* snprintf returns the length it wanted to write; >= size means truncation. */
    if (written < 0 || (size_t)written >= sizeof name) {
        fprintf(stderr, "Data file name for '%s' does not fit in %zu bytes.\n", suffix, sizeof name);
        exit(EXIT_FAILURE);
    }
    FILE *out = fopen(name, "w");
    if (out == NULL) {
        /* Report right away so errno still belongs to this fopen call. */
        perror(name);
        exit(EXIT_FAILURE);
    }
    return out;
}

/* Stores `value` into every one of the `count` elements of dst. */
static void fill_array(uint64_t *dst, size_t count, uint64_t value)
{
    for (size_t i = 0; i < count; ++i) {
        dst[i] = value;
    }
}

/* Compares copy against src element by element; on the first mismatch prints
   the offending index and exits with EXIT_FAILURE. */
static void verify_copy(const uint64_t *src, const uint64_t *copy, size_t count)
{
    for (size_t i = 0; i < count; ++i) {
        if (copy[i] != src[i]) {
            printf("[%zu]Damn\n", i);
            fflush(stdout);
            exit(EXIT_FAILURE);
        }
    }
}

/* Copy kernel: plain one-element-per-iteration loop. */
static void copy_linear(uint64_t *dst, const uint64_t *src, size_t count)
{
    for (size_t i = 0; i < count; ++i) {
        dst[i] = src[i];
    }
}

/* Copy kernel: scalar copy, hand-unrolled UNROLL (40) times.
   Requires count to be a multiple of UNROLL. */
static void copy_unrolled(uint64_t *dst, const uint64_t *src, size_t count)
{
    for (size_t i = 0; i < count; i += UNROLL) {
        dst[i]      = src[i];
        dst[i + 1]  = src[i + 1];
        dst[i + 2]  = src[i + 2];
        dst[i + 3]  = src[i + 3];
        dst[i + 4]  = src[i + 4];
        dst[i + 5]  = src[i + 5];
        dst[i + 6]  = src[i + 6];
        dst[i + 7]  = src[i + 7];
        dst[i + 8]  = src[i + 8];
        dst[i + 9]  = src[i + 9];
        dst[i + 10] = src[i + 10];
        dst[i + 11] = src[i + 11];
        dst[i + 12] = src[i + 12];
        dst[i + 13] = src[i + 13];
        dst[i + 14] = src[i + 14];
        dst[i + 15] = src[i + 15];
        dst[i + 16] = src[i + 16];
        dst[i + 17] = src[i + 17];
        dst[i + 18] = src[i + 18];
        dst[i + 19] = src[i + 19];
        dst[i + 20] = src[i + 20];
        dst[i + 21] = src[i + 21];
        dst[i + 22] = src[i + 22];
        dst[i + 23] = src[i + 23];
        dst[i + 24] = src[i + 24];
        dst[i + 25] = src[i + 25];
        dst[i + 26] = src[i + 26];
        dst[i + 27] = src[i + 27];
        dst[i + 28] = src[i + 28];
        dst[i + 29] = src[i + 29];
        dst[i + 30] = src[i + 30];
        dst[i + 31] = src[i + 31];
        dst[i + 32] = src[i + 32];
        dst[i + 33] = src[i + 33];
        dst[i + 34] = src[i + 34];
        dst[i + 35] = src[i + 35];
        dst[i + 36] = src[i + 36];
        dst[i + 37] = src[i + 37];
        dst[i + 38] = src[i + 38];
        dst[i + 39] = src[i + 39];
    }
}

/* Copy kernel: aligned 128-bit SSE load/store pairs, two elements each,
   unrolled to cover UNROLL (40) elements per iteration. */
static void copy_sse(uint64_t *dst, const uint64_t *src, size_t count)
{
    for (size_t i = 0; i < count; i += UNROLL) {
        _mm_store_si128((__m128i *)&dst[i],      _mm_load_si128((const __m128i *)&src[i]));
        _mm_store_si128((__m128i *)&dst[i + 2],  _mm_load_si128((const __m128i *)&src[i + 2]));
        _mm_store_si128((__m128i *)&dst[i + 4],  _mm_load_si128((const __m128i *)&src[i + 4]));
        _mm_store_si128((__m128i *)&dst[i + 6],  _mm_load_si128((const __m128i *)&src[i + 6]));
        _mm_store_si128((__m128i *)&dst[i + 8],  _mm_load_si128((const __m128i *)&src[i + 8]));
        _mm_store_si128((__m128i *)&dst[i + 10], _mm_load_si128((const __m128i *)&src[i + 10]));
        _mm_store_si128((__m128i *)&dst[i + 12], _mm_load_si128((const __m128i *)&src[i + 12]));
        _mm_store_si128((__m128i *)&dst[i + 14], _mm_load_si128((const __m128i *)&src[i + 14]));
        _mm_store_si128((__m128i *)&dst[i + 16], _mm_load_si128((const __m128i *)&src[i + 16]));
        _mm_store_si128((__m128i *)&dst[i + 18], _mm_load_si128((const __m128i *)&src[i + 18]));
        _mm_store_si128((__m128i *)&dst[i + 20], _mm_load_si128((const __m128i *)&src[i + 20]));
        _mm_store_si128((__m128i *)&dst[i + 22], _mm_load_si128((const __m128i *)&src[i + 22]));
        _mm_store_si128((__m128i *)&dst[i + 24], _mm_load_si128((const __m128i *)&src[i + 24]));
        _mm_store_si128((__m128i *)&dst[i + 26], _mm_load_si128((const __m128i *)&src[i + 26]));
        _mm_store_si128((__m128i *)&dst[i + 28], _mm_load_si128((const __m128i *)&src[i + 28]));
        _mm_store_si128((__m128i *)&dst[i + 30], _mm_load_si128((const __m128i *)&src[i + 30]));
        _mm_store_si128((__m128i *)&dst[i + 32], _mm_load_si128((const __m128i *)&src[i + 32]));
        _mm_store_si128((__m128i *)&dst[i + 34], _mm_load_si128((const __m128i *)&src[i + 34]));
        _mm_store_si128((__m128i *)&dst[i + 36], _mm_load_si128((const __m128i *)&src[i + 36]));
        _mm_store_si128((__m128i *)&dst[i + 38], _mm_load_si128((const __m128i *)&src[i + 38]));
    }
}

/* Copy kernel: aligned 256-bit AVX load/store pairs, four elements each,
   unrolled to cover UNROLL (40) elements per iteration. Both buffers must be
   32-byte aligned; 40 elements * 8 bytes keeps every access on a 32-byte boundary. */
static void copy_avx(uint64_t *dst, const uint64_t *src, size_t count)
{
    for (size_t i = 0; i < count; i += UNROLL) {
        _mm256_store_si256((__m256i *)&dst[i],      _mm256_load_si256((const __m256i *)&src[i]));
        _mm256_store_si256((__m256i *)&dst[i + 4],  _mm256_load_si256((const __m256i *)&src[i + 4]));
        _mm256_store_si256((__m256i *)&dst[i + 8],  _mm256_load_si256((const __m256i *)&src[i + 8]));
        _mm256_store_si256((__m256i *)&dst[i + 12], _mm256_load_si256((const __m256i *)&src[i + 12]));
        _mm256_store_si256((__m256i *)&dst[i + 16], _mm256_load_si256((const __m256i *)&src[i + 16]));
        _mm256_store_si256((__m256i *)&dst[i + 20], _mm256_load_si256((const __m256i *)&src[i + 20]));
        _mm256_store_si256((__m256i *)&dst[i + 24], _mm256_load_si256((const __m256i *)&src[i + 24]));
        _mm256_store_si256((__m256i *)&dst[i + 28], _mm256_load_si256((const __m256i *)&src[i + 28]));
        _mm256_store_si256((__m256i *)&dst[i + 32], _mm256_load_si256((const __m256i *)&src[i + 32]));
        _mm256_store_si256((__m256i *)&dst[i + 36], _mm256_load_si256((const __m256i *)&src[i + 36]));
    }
}

/*
 * Read kernels. Each one loads every element and throws the value away.
 * NOTE(review): because the loaded values are never used, an optimizing
 * compiler may remove the loads entirely; the compile lines at the top of the
 * file pass no -O flag, which presumably is what keeps the numbers meaningful.
 */

/* Read kernel: scalar loads, hand-unrolled UNROLL (40) times. */
static void read_unrolled(const uint64_t *src, size_t count)
{
    uint64_t tmp = 0;
    for (size_t i = 0; i < count; i += UNROLL) {
        tmp = src[i];
        /* Kept from the original benchmark: the final group only reads its
           first element, so elements 1..39 of the last group are skipped. */
        if (i < count - UNROLL) {
            tmp = src[i + 1];
            tmp = src[i + 2];
            tmp = src[i + 3];
            tmp = src[i + 4];
            tmp = src[i + 5];
            tmp = src[i + 6];
            tmp = src[i + 7];
            tmp = src[i + 8];
            tmp = src[i + 9];
            tmp = src[i + 10];
            tmp = src[i + 11];
            tmp = src[i + 12];
            tmp = src[i + 13];
            tmp = src[i + 14];
            tmp = src[i + 15];
            tmp = src[i + 16];
            tmp = src[i + 17];
            tmp = src[i + 18];
            tmp = src[i + 19];
            tmp = src[i + 20];
            tmp = src[i + 21];
            tmp = src[i + 22];
            tmp = src[i + 23];
            tmp = src[i + 24];
            tmp = src[i + 25];
            tmp = src[i + 26];
            tmp = src[i + 27];
            tmp = src[i + 28];
            tmp = src[i + 29];
            tmp = src[i + 30];
            tmp = src[i + 31];
            tmp = src[i + 32];
            tmp = src[i + 33];
            tmp = src[i + 34];
            tmp = src[i + 35];
            tmp = src[i + 36];
            tmp = src[i + 37];
            tmp = src[i + 38];
            tmp = src[i + 39];
        }
    }
    (void)tmp;
}

/* Read kernel: aligned 128-bit SSE loads covering UNROLL (40) elements per iteration. */
static void read_sse(const uint64_t *src, size_t count)
{
    __m128i tmp;
    for (size_t i = 0; i < count; i += UNROLL) {
        tmp = _mm_load_si128((const __m128i *)&src[i]);
        tmp = _mm_load_si128((const __m128i *)&src[i + 2]);
        tmp = _mm_load_si128((const __m128i *)&src[i + 4]);
        tmp = _mm_load_si128((const __m128i *)&src[i + 6]);
        tmp = _mm_load_si128((const __m128i *)&src[i + 8]);
        tmp = _mm_load_si128((const __m128i *)&src[i + 10]);
        tmp = _mm_load_si128((const __m128i *)&src[i + 12]);
        tmp = _mm_load_si128((const __m128i *)&src[i + 14]);
        tmp = _mm_load_si128((const __m128i *)&src[i + 16]);
        tmp = _mm_load_si128((const __m128i *)&src[i + 18]);
        tmp = _mm_load_si128((const __m128i *)&src[i + 20]);
        tmp = _mm_load_si128((const __m128i *)&src[i + 22]);
        tmp = _mm_load_si128((const __m128i *)&src[i + 24]);
        tmp = _mm_load_si128((const __m128i *)&src[i + 26]);
        tmp = _mm_load_si128((const __m128i *)&src[i + 28]);
        tmp = _mm_load_si128((const __m128i *)&src[i + 30]);
        tmp = _mm_load_si128((const __m128i *)&src[i + 32]);
        tmp = _mm_load_si128((const __m128i *)&src[i + 34]);
        tmp = _mm_load_si128((const __m128i *)&src[i + 36]);
        tmp = _mm_load_si128((const __m128i *)&src[i + 38]);
    }
    (void)tmp;
}

/* Read kernel: aligned 256-bit AVX loads covering UNROLL (40) elements per
   iteration. src must be 32-byte aligned. */
static void read_avx(const uint64_t *src, size_t count)
{
    __m256i tmp;
    for (size_t i = 0; i < count; i += UNROLL) {
        tmp = _mm256_load_si256((const __m256i *)&src[i]);
        tmp = _mm256_load_si256((const __m256i *)&src[i + 4]);
        tmp = _mm256_load_si256((const __m256i *)&src[i + 8]);
        tmp = _mm256_load_si256((const __m256i *)&src[i + 12]);
        tmp = _mm256_load_si256((const __m256i *)&src[i + 16]);
        tmp = _mm256_load_si256((const __m256i *)&src[i + 20]);
        tmp = _mm256_load_si256((const __m256i *)&src[i + 24]);
        tmp = _mm256_load_si256((const __m256i *)&src[i + 28]);
        tmp = _mm256_load_si256((const __m256i *)&src[i + 32]);
        tmp = _mm256_load_si256((const __m256i *)&src[i + 36]);
    }
    (void)tmp;
}

/* Read kernel: plain one-element-per-iteration loop. */
static void read_linear(const uint64_t *src, size_t count)
{
    uint64_t tmp = 0;
    for (size_t i = 0; i < count; ++i) {
        tmp = src[i];
    }
    (void)tmp;
}

/* Runs one copy benchmark: optionally overwrites dst with 0xdd first (so a
   kernel that copies nothing is detected), times a single kernel call with
   the GLFW timer, prints time and MB/s, then verifies dst against src. */
static void run_copy_benchmark(const char *title, const char *label, copy_kernel kernel,
                               uint64_t *dst, const uint64_t *src, size_t count,
                               size_t megabytes, int clear_first)
{
    printf("=== %s ===\n", title);
    if (clear_first) {
        printf("Clearing array copy...\n");
        fill_array(dst, count, 0xdd);
    }

    const double started = glfwGetTime();
    kernel(dst, src, count);
    const double elapsed = glfwGetTime() - started;

    printf("%s done in %.3f s\n", label, elapsed);
    printf("Performance: %.3f MB/s\n", (double)megabytes / elapsed);
    printf("Checking array copy...\n");
    verify_copy(src, dst, count);
    printf("OK\n\n");
}

/* Runs one read benchmark: times SAMPLE_COUNT kernel passes, prints the
   average / minimum / maximum MB/s and writes "<pass>\t<MB/s>" lines to out. */
static void run_read_benchmark(const char *title, read_kernel kernel, const uint64_t *src,
                               size_t count, size_t megabytes, FILE *out)
{
    double samples[SAMPLE_COUNT];
    double sum = 0.0;
    double lowest = 0.0;
    double highest = 0.0;

    printf("=== %s ===\n", title);
    for (int pass = 0; pass < SAMPLE_COUNT; ++pass) {
        const double started = glfwGetTime();
        kernel(src, count);
        const double elapsed = glfwGetTime() - started;
        const double rate = (double)megabytes / elapsed;

        sum += rate;
        if (rate > highest) {
            highest = rate;
        }
        if (pass == 0 || rate < lowest) {
            lowest = rate;
        }
        samples[pass] = rate;
    }

    printf("\nAverage: %.3f MB/s\n", sum / SAMPLE_COUNT);
    /* NOTE(review): "%3.f" prints with zero decimals; "%.3f" may have been intended. */
    printf("Performance MIN: %3.f MB/s | Performance MAX: %3.f MB/s\n\n", lowest, highest);

    for (int pass = 0; pass < SAMPLE_COUNT; ++pass) {
        fprintf(out, "%d\t%f\n", pass, samples[pass]);
    }
    if (fflush(out) != 0) {
        perror("Error writing data file");
    }
    fflush(stdout);
}

/*
 * Memory bandwidth benchmark.
 *
 * Allocates two arrays of ARRAY_NUM uint64_t elements, then measures (with
 * the GLFW timer) initialisation, four copy strategies (linear, unrolled
 * scalar, SSE, AVX), four read strategies (1000 passes each, logged to
 * "<MB>_{stride,sse,avx,dumb}.dat") and finally memcpy. Megabytes are
 * decimal (1 MB = 1,000,000 bytes). Command-line arguments are ignored.
 * Returns EXIT_SUCCESS; any failure terminates with EXIT_FAILURE.
 */
int main(int argc, char *argv[])
{
    (void)argc;
    (void)argv;

    const size_t count = (ARRAY_NUM);
    const size_t bytes = count * sizeof(uint64_t);
    const size_t megabytes = bytes / 1000000;

    /* The unrolled kernels step by UNROLL elements and would otherwise run
       past the end of the buffers. */
    if (count == 0 || count % UNROLL != 0) {
        fprintf(stderr, "ARRAY_NUM (%zu) must be a non-zero multiple of %d.\n", count, (int)UNROLL);
        exit(EXIT_FAILURE);
    }

    if (!glfwInit()) {
        fprintf(stderr, "glfwInit failed.\n");
        exit(EXIT_FAILURE);
    }

    FILE *file_stride = open_data_file(megabytes, "stride");
    FILE *file_dumb = open_data_file(megabytes, "dumb");
    FILE *file_sse = open_data_file(megabytes, "sse");
    FILE *file_avx = open_data_file(megabytes, "avx");

    /* 32-byte alignment satisfies both the SSE (16) and AVX (32) aligned accesses. */
    uint64_t *array = _mm_malloc(bytes, SIMD_ALIGNMENT);
    uint64_t *array_copy = _mm_malloc(bytes, SIMD_ALIGNMENT);
    if (array == NULL || array_copy == NULL) {
        fprintf(stderr, "Could not allocate 2 x %zu bytes.\n", bytes);
        exit(EXIT_FAILURE);
    }

    /* ======================== */
    /* Init array               */
    /* ======================== */
    printf("=== INIT ARRAY ===\n");
    double started = glfwGetTime();
    fill_array(array, count, 0xff);
    double elapsed = glfwGetTime() - started;
    printf("Init done in %.3f s - size of array: %zu MBs (x2)\n", elapsed, megabytes);
    printf("Performance: %.3f MB/s\n\n", (double)megabytes / elapsed);

    /* ======================== */
    /* Copy benchmarks          */
    /* ======================== */
    run_copy_benchmark("LINEAR COPY", "Copying (linear)", copy_linear,
                       array_copy, array, count, megabytes, 0);
    run_copy_benchmark("COPYING WITH WIDE STRIDE", "Copying (stride 40)", copy_unrolled,
                       array_copy, array, count, megabytes, 1);
    run_copy_benchmark("COPYING WITH SSE", "Copying SSE", copy_sse,
                       array_copy, array, count, megabytes, 1);
    run_copy_benchmark("COPYING WITH AVX", "Copying AVX", copy_avx,
                       array_copy, array, count, megabytes, 1);

    /* ======================== */
    /* Read benchmarks          */
    /* ======================== */
    run_read_benchmark("READING WITH WIDE STRIDE", read_unrolled, array, count, megabytes, file_stride);
    run_read_benchmark("READING WITH SSE", read_sse, array, count, megabytes, file_sse);
    run_read_benchmark("READING WITH AVX", read_avx, array, count, megabytes, file_avx);
    run_read_benchmark("LINEAR READING", read_linear, array, count, megabytes, file_dumb);

    /* ======================== */
    /* Memcpy                   */
    /* ======================== */
    printf("=== MEMCPY ===\n");
    started = glfwGetTime();
    memcpy(array_copy, array, bytes);
    elapsed = glfwGetTime() - started;
    printf("Copying (memcpy) done in %.3f s\n", elapsed);
    printf("Performance: %.3f MB/s\n", (double)megabytes / elapsed);

    /* ======================== */
    /* Cleanup and exit         */
    /* ======================== */
    printf("\n=== CLEANUP ===\n");
    _mm_free(array);
    _mm_free(array_copy);
    glfwTerminate();
    fclose(file_avx);
    fclose(file_sse);
    fclose(file_dumb);
    fclose(file_stride);
    return EXIT_SUCCESS;
}
/* gnuplot:
set ylabel 'MB/s'
set xlabel 'iterations'
8-16
plot [:1000] [:12000] '8_stride.dat' title '8 MB stride' with lines, '8_dumb.dat' title '8 MB dumb' with lines, '8_sse.dat' title '8 MB SSE' with lines, '8_avx.dat' title '8 MB AVX' with lines, '16_stride.dat' title '16 MB stride' with lines, '16_dumb.dat' title '16 MB dumb' with lines, '16_sse.dat' title '16 MB SSE' with lines, '16_avx.dat' title '16 MB AVX' with lines
32-64
plot [:1000] [:12000] '32_stride.dat' title '32 MB stride' with lines, '32_dumb.dat' title '32 MB dumb' with lines, '32_sse.dat' title '32 MB SSE' with lines, '32_avx.dat' title '32 MB AVX' with lines, '64_stride.dat' title '64 MB stride' with lines, '64_dumb.dat' title '64 MB dumb' with lines, '64_sse.dat' title '64 MB SSE' with lines, '64_avx.dat' title '64 MB AVX' with lines
128-256
plot [:1000] [:12000] '128_stride.dat' title '128 MB stride' with lines, '128_dumb.dat' title '128 MB dumb' with lines, '128_sse.dat' title '128 MB SSE' with lines, '128_avx.dat' title '128 MB AVX' with lines, '256_stride.dat' title '256 MB stride' with lines, '256_dumb.dat' title '256 MB dumb' with lines, '256_sse.dat' title '256 MB SSE' with lines, '256_avx.dat' title '256 MB AVX' with lines
512-1024
plot [:1000] [:12000] '512_stride.dat' title '512 MB stride' with lines, '512_dumb.dat' title '512 MB dumb' with lines, '512_sse.dat' title '512 MB SSE' with lines, '512_avx.dat' title '512 MB AVX' with lines, '1024_stride.dat' title '1024 MB stride' with lines, '1024_dumb.dat' title '1024 MB dumb' with lines, '1024_sse.dat' title '1024 MB SSE' with lines, '1024_avx.dat' title '1024 MB AVX' with lines
all together:
unset key
plot [:1000] [:12000] '8_stride.dat' title '8 MB stride' with lines, '8_dumb.dat' title '8 MB dumb' with lines, '8_sse.dat' title '8 MB SSE' with lines, '8_avx.dat' title '8 MB AVX' with lines, '16_stride.dat' title '16 MB stride' with lines, '16_dumb.dat' title '16 MB dumb' with lines, '16_sse.dat' title '16 MB SSE' with lines, '16_avx.dat' title '16 MB AVX' with lines, '32_stride.dat' title '32 MB stride' with lines, '32_dumb.dat' title '32 MB dumb' with lines, '32_sse.dat' title '32 MB SSE' with lines, '32_avx.dat' title '32 MB AVX' with lines, '64_stride.dat' title '64 MB stride' with lines, '64_dumb.dat' title '64 MB dumb' with lines, '64_sse.dat' title '64 MB SSE' with lines, '64_avx.dat' title '64 MB AVX' with lines, '128_stride.dat' title '128 MB stride' with lines, '128_dumb.dat' title '128 MB dumb' with lines, '128_sse.dat' title '128 MB SSE' with lines, '128_avx.dat' title '128 MB AVX' with lines, '256_stride.dat' title '256 MB stride' with lines, '256_dumb.dat' title '256 MB dumb' with lines, '256_sse.dat' title '256 MB SSE' with lines, '256_avx.dat' title '256 MB AVX' with lines, '512_stride.dat' title '512 MB stride' with lines, '512_dumb.dat' title '512 MB dumb' with lines, '512_sse.dat' title '512 MB SSE' with lines, '512_avx.dat' title '512 MB AVX' with lines, '1024_stride.dat' title '1024 MB stride' with lines, '1024_dumb.dat' title '1024 MB dumb' with lines, '1024_sse.dat' title '1024 MB SSE' with lines, '1024_avx.dat' title '1024 MB AVX' with lines
*/
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment