Created
November 13, 2015 01:35
-
-
Save Keyframe/1ed9062ec52fc4a0d14b to your computer and use it in GitHub Desktop.
Memory bandwidth, stupid
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
 * Build with GCC:
 *   gcc -pedantic -std=c99 -Wall -Werror -Wextra -Wno-unused -I "[glfw_include_path]" -L "[glfw_lib_path]" arr_test_simd_nobuf.c -o arr_test_simd_nobuf -lglfw3 -framework OpenGL -framework Cocoa -framework IOKit -framework CoreVideo
 *
 * For AVX on OS X, build with clang instead (-mavx doesn't really work with the
 * as/Homebrew combination, so use this command on OS X):
 *   clang -pedantic -std=c99 -Wall -Werror -Wextra -Wno-unused -I "[glfw_include_path]" -L "[glfw_lib_path]" arr_test_simd_nobuf.c -o arr_test_simd_nobuf -lglfw3 -framework OpenGL -framework Cocoa -framework IOKit -framework CoreVideo -mavx
 *
 * Optional flag: -fprefetch-loop-arrays
 */
#pragma clang diagnostic ignored "-Wunused-parameter" | |
#include <stdio.h> | |
#include <stdlib.h> | |
#include <x86intrin.h> /* http://stackoverflow.com/questions/18314523/sse-copy-avx-copy-and-stdcopy-performance */ | |
#include <string.h> /* memcpy */ | |
#include <inttypes.h> | |
#include <GLFW/glfw3.h> | |
/*
 * Number of uint64_t elements in each of the two test arrays:
 * 128,000,000 elements * 8 bytes = 1.024 GB per array.
 * The hand-unrolled loops in main() advance 40 elements per iteration, so keep
 * this value a multiple of 40. The expansion is parenthesized so the macro
 * behaves as a single operand inside larger expressions.
 */
#define ARRAY_NUM (1000000 * 128)
int main(int argc, char *argv[]) { | |
if(!glfwInit()) { | |
exit(EXIT_FAILURE); | |
} | |
int cx = 0; | |
char filename_stride[50]; | |
char filename_dumb[50]; | |
char filename_sse[50]; | |
char filename_avx[50]; | |
cx = snprintf(filename_stride, 50, "%lu_stride.dat", | |
((ARRAY_NUM*sizeof(uint64_t))/1000000)); | |
if(cx < 0 || cx >50) { exit(EXIT_FAILURE); } | |
FILE *file_stride = fopen(filename_stride, "w"); | |
cx = snprintf(filename_dumb, 50, "%lu_dumb.dat", | |
((ARRAY_NUM*sizeof(uint64_t))/1000000)); | |
if(cx < 0 || cx >50) { exit(EXIT_FAILURE); } | |
FILE *file_dumb = fopen(filename_dumb, "w"); | |
cx = snprintf(filename_sse, 50, "%lu_sse.dat", | |
((ARRAY_NUM*sizeof(uint64_t))/1000000)); | |
if(cx < 0 || cx >50) { exit(EXIT_FAILURE); } | |
FILE *file_sse = fopen(filename_sse, "w"); | |
cx = snprintf(filename_avx, 50, "%lu_avx.dat", | |
((ARRAY_NUM*sizeof(uint64_t))/1000000)); | |
if(cx < 0 || cx >50) { exit(EXIT_FAILURE); } | |
FILE *file_avx = fopen(filename_avx, "w"); | |
if(file_stride == NULL || file_dumb == NULL || file_sse == NULL || file_avx == NULL) { | |
perror("Error opening file."); | |
exit(EXIT_FAILURE); | |
} | |
// uint64_t *array = malloc(sizeof(uint64_t) * ARRAY_NUM); | |
// uint64_t *array_copy = malloc(sizeof(uint64_t) * ARRAY_NUM); | |
uint64_t *array = _mm_malloc(sizeof(uint64_t) * ARRAY_NUM, 16); | |
uint64_t *array_copy = _mm_malloc(sizeof(uint64_t) * ARRAY_NUM, 16); | |
double performance = 0.0; | |
double time_start = 0.0; | |
double time_end = 0.0; | |
double performance_min = 0.0; | |
double performance_max = 0.0; | |
const int imax = 1000; | |
double performance_average = 0.0; | |
double performance_ring[imax]; | |
/* ======================== */ | |
/* Init array */ | |
/* ======================== */ | |
printf("=== INIT ARRAY ===\n"); | |
time_start = glfwGetTime(); | |
for(uint64_t i = 0; i < ARRAY_NUM; ++i) { | |
array[i] = 0xff; | |
} | |
time_end = glfwGetTime(); | |
performance = ((ARRAY_NUM * sizeof(uint64_t))/1000000) / (time_end - time_start); | |
printf("Init done in %.3f s - size of array: %lu MBs (x2)\n", (time_end - time_start), (ARRAY_NUM*sizeof(uint64_t)/1000000)); | |
printf("Performance: %.3f MB/s\n\n", performance); | |
/* Linear copy */ | |
printf("=== LINEAR COPY ===\n"); | |
performance = 0; | |
time_start = glfwGetTime(); | |
for(uint64_t i = 0; i < ARRAY_NUM; ++i) { | |
array_copy[i] = array[i]; | |
} | |
time_end = glfwGetTime(); | |
performance = ((ARRAY_NUM * sizeof(uint64_t))/1000000) / (time_end - time_start); | |
printf("Copying (linear) done in %.3f s\n", (time_end - time_start)); | |
printf("Performance: %.3f MB/s\n", performance); | |
printf("Checking array copy...\n"); | |
for(uint64_t i = 0; i < ARRAY_NUM; ++i) { | |
if(array_copy[i] != array[i]) { | |
printf("[%llu]Damn\r", i); fflush(stdout); | |
exit(EXIT_FAILURE); | |
} | |
} | |
printf("OK\n\n"); | |
/* ======================== */ | |
/* Copying with wide stride */ | |
/* ======================== */ | |
printf("=== COPYING WITH WIDE STRIDE ===\n"); | |
printf("Clearing array copy...\n"); | |
for(uint64_t i = 0; i < ARRAY_NUM; ++i) { | |
array_copy[i] = 0xdd; | |
} | |
performance = 0; | |
time_start = glfwGetTime(); | |
for(uint64_t i = 0; i < ARRAY_NUM; i=i+40) { | |
array_copy[i] = array[i]; | |
array_copy[i+1] = array[i+1]; | |
array_copy[i+2] = array[i+2]; | |
array_copy[i+3] = array[i+3]; | |
array_copy[i+4] = array[i+4]; | |
array_copy[i+5] = array[i+5]; | |
array_copy[i+6] = array[i+6]; | |
array_copy[i+7] = array[i+7]; | |
array_copy[i+8] = array[i+8]; | |
array_copy[i+9] = array[i+9]; | |
array_copy[i+10] = array[i+10]; | |
array_copy[i+11] = array[i+11]; | |
array_copy[i+12] = array[i+12]; | |
array_copy[i+13] = array[i+13]; | |
array_copy[i+14] = array[i+14]; | |
array_copy[i+15] = array[i+15]; | |
array_copy[i+16] = array[i+16]; | |
array_copy[i+17] = array[i+17]; | |
array_copy[i+18] = array[i+18]; | |
array_copy[i+19] = array[i+19]; | |
array_copy[i+20] = array[i+20]; | |
array_copy[i+21] = array[i+21]; | |
array_copy[i+22] = array[i+22]; | |
array_copy[i+23] = array[i+23]; | |
array_copy[i+24] = array[i+24]; | |
array_copy[i+25] = array[i+25]; | |
array_copy[i+26] = array[i+26]; | |
array_copy[i+27] = array[i+27]; | |
array_copy[i+28] = array[i+28]; | |
array_copy[i+29] = array[i+29]; | |
array_copy[i+30] = array[i+30]; | |
array_copy[i+31] = array[i+31]; | |
array_copy[i+32] = array[i+32]; | |
array_copy[i+33] = array[i+33]; | |
array_copy[i+34] = array[i+34]; | |
array_copy[i+35] = array[i+35]; | |
array_copy[i+36] = array[i+36]; | |
array_copy[i+37] = array[i+37]; | |
array_copy[i+38] = array[i+38]; | |
array_copy[i+39] = array[i+39]; | |
} | |
time_end = glfwGetTime(); | |
performance = ((ARRAY_NUM * sizeof(uint64_t))/1000000) / (time_end - time_start); | |
printf("Copying (stride 40) done in %.3f s\n", (time_end - time_start)); | |
printf("Performance: %.3f MB/s\n", performance); | |
printf("Checking array copy...\n"); | |
for(uint64_t i = 0; i < ARRAY_NUM; ++i) { | |
if(array_copy[i] != array[i]) { | |
printf("[%llu]Damn\n", i); fflush(stdout); | |
exit(EXIT_FAILURE); | |
} | |
} | |
printf("OK\n\n"); | |
/* ======================== */ | |
/* Copying SSE */ | |
/* ======================== */ | |
printf("=== COPYING WITH SSE ===\n"); | |
printf("Clearing array copy...\n"); | |
for(uint64_t i = 0; i < ARRAY_NUM; ++i) { | |
array_copy[i] = 0xdd; | |
} | |
performance = 0; | |
time_start = glfwGetTime(); | |
for(uint64_t i = 0; i < ARRAY_NUM; i += 40) { | |
__m128i buffer = _mm_load_si128( (__m128i*)&array[i] ); | |
_mm_store_si128( (__m128i*)&array_copy[i], buffer ); | |
buffer = _mm_load_si128( (__m128i*)&array[i+2] ); | |
_mm_store_si128( (__m128i*)&array_copy[i+2], buffer ); | |
buffer = _mm_load_si128( (__m128i*)&array[i+4] ); | |
_mm_store_si128( (__m128i*)&array_copy[i+4], buffer ); | |
buffer = _mm_load_si128( (__m128i*)&array[i+6] ); | |
_mm_store_si128( (__m128i*)&array_copy[i+6], buffer ); | |
buffer = _mm_load_si128( (__m128i*)&array[i+8] ); | |
_mm_store_si128( (__m128i*)&array_copy[i+8], buffer ); | |
buffer = _mm_load_si128( (__m128i*)&array[i+10] ); | |
_mm_store_si128( (__m128i*)&array_copy[i+10], buffer ); | |
buffer = _mm_load_si128( (__m128i*)&array[i+12] ); | |
_mm_store_si128( (__m128i*)&array_copy[i+12], buffer ); | |
buffer = _mm_load_si128( (__m128i*)&array[i+14] ); | |
_mm_store_si128( (__m128i*)&array_copy[i+14], buffer ); | |
buffer = _mm_load_si128( (__m128i*)&array[i+16] ); | |
_mm_store_si128( (__m128i*)&array_copy[i+16], buffer ); | |
buffer = _mm_load_si128( (__m128i*)&array[i+18] ); | |
_mm_store_si128( (__m128i*)&array_copy[i+18], buffer ); | |
buffer = _mm_load_si128( (__m128i*)&array[i+20] ); | |
_mm_store_si128( (__m128i*)&array_copy[i+20], buffer ); | |
buffer = _mm_load_si128( (__m128i*)&array[i+22] ); | |
_mm_store_si128( (__m128i*)&array_copy[i+22], buffer ); | |
buffer = _mm_load_si128( (__m128i*)&array[i+24] ); | |
_mm_store_si128( (__m128i*)&array_copy[i+24], buffer ); | |
buffer = _mm_load_si128( (__m128i*)&array[i+26] ); | |
_mm_store_si128( (__m128i*)&array_copy[i+26], buffer ); | |
buffer = _mm_load_si128( (__m128i*)&array[i+28] ); | |
_mm_store_si128( (__m128i*)&array_copy[i+28], buffer ); | |
buffer = _mm_load_si128( (__m128i*)&array[i+30] ); | |
_mm_store_si128( (__m128i*)&array_copy[i+30], buffer ); | |
buffer = _mm_load_si128( (__m128i*)&array[i+32] ); | |
_mm_store_si128( (__m128i*)&array_copy[i+32], buffer ); | |
buffer = _mm_load_si128( (__m128i*)&array[i+34] ); | |
_mm_store_si128( (__m128i*)&array_copy[i+34], buffer ); | |
buffer = _mm_load_si128( (__m128i*)&array[i+36] ); | |
_mm_store_si128( (__m128i*)&array_copy[i+36], buffer ); | |
buffer = _mm_load_si128( (__m128i*)&array[i+38] ); | |
_mm_store_si128( (__m128i*)&array_copy[i+38], buffer ); | |
} | |
time_end = glfwGetTime(); | |
performance = ((ARRAY_NUM * sizeof(uint64_t))/1000000) / (time_end - time_start); | |
printf("Copying SSE done in %.3f s\n", (time_end - time_start)); | |
printf("Performance: %.3f MB/s\n", performance); | |
printf("Checking array copy...\n"); | |
for(uint64_t i = 0; i < ARRAY_NUM; ++i) { | |
if(array_copy[i] != array[i]) { | |
printf("[%llu]Damn\n", i); fflush(stdout); | |
exit(EXIT_FAILURE); | |
} | |
} | |
printf("OK\n\n"); | |
/* ======================== */ | |
/* Copying AVX */ | |
/* ======================== */ | |
printf("=== COPYING WITH AVX ===\n"); | |
printf("Clearing array copy...\n"); | |
for(uint64_t i = 0; i < ARRAY_NUM; ++i) { | |
array_copy[i] = 0xdd; | |
} | |
performance = 0; | |
time_start = glfwGetTime(); | |
for(uint64_t i = 0; i < ARRAY_NUM; i += 40) { | |
__m256i buffer = _mm256_load_si256( (__m256i*)&array[i] ); | |
_mm256_store_si256( (__m256i*)&array_copy[i], buffer ); | |
buffer = _mm256_load_si256( (__m256i*)&array[i+4] ); | |
_mm256_store_si256( (__m256i*)&array_copy[i+4], buffer ); | |
buffer = _mm256_load_si256( (__m256i*)&array[i+8] ); | |
_mm256_store_si256( (__m256i*)&array_copy[i+8], buffer ); | |
buffer = _mm256_load_si256( (__m256i*)&array[i+12] ); | |
_mm256_store_si256( (__m256i*)&array_copy[i+12], buffer ); | |
buffer = _mm256_load_si256( (__m256i*)&array[i+16] ); | |
_mm256_store_si256( (__m256i*)&array_copy[i+16], buffer ); | |
buffer = _mm256_load_si256( (__m256i*)&array[i+20] ); | |
_mm256_store_si256( (__m256i*)&array_copy[i+20], buffer ); | |
buffer = _mm256_load_si256( (__m256i*)&array[i+24] ); | |
_mm256_store_si256( (__m256i*)&array_copy[i+24], buffer ); | |
buffer = _mm256_load_si256( (__m256i*)&array[i+28] ); | |
_mm256_store_si256( (__m256i*)&array_copy[i+28], buffer ); | |
buffer = _mm256_load_si256( (__m256i*)&array[i+32] ); | |
_mm256_store_si256( (__m256i*)&array_copy[i+32], buffer ); | |
buffer = _mm256_load_si256( (__m256i*)&array[i+36] ); | |
_mm256_store_si256( (__m256i*)&array_copy[i+36], buffer ); | |
} | |
time_end = glfwGetTime(); | |
performance = ((ARRAY_NUM * sizeof(uint64_t))/1000000) / (time_end - time_start); | |
printf("Copying AVX done in %.3f s\n", (time_end - time_start)); | |
printf("Performance: %.3f MB/s\n", performance); | |
printf("Checking array copy...\n"); | |
for(uint64_t i = 0; i < ARRAY_NUM; ++i) { | |
if(array_copy[i] != array[i]) { | |
printf("[%llu]Damn\n", i); fflush(stdout); | |
exit(EXIT_FAILURE); | |
} | |
} | |
printf("OK\n\n"); | |
/* ======================== */ | |
/* Reading with wide stride */ | |
/* ======================== */ | |
printf("=== READING WITH WIDE STRIDE ===\n"); | |
for(int j = 0; j < imax; ++j) { | |
uint64_t tmp = 0; | |
performance = 0; | |
time_start = glfwGetTime(); | |
for(uint64_t i = 0; i < ARRAY_NUM; i=i+40) { | |
tmp = array[i]; | |
if(i < (ARRAY_NUM-40)) { | |
tmp = array[i+1]; | |
tmp = array[i+2]; | |
tmp = array[i+3]; | |
tmp = array[i+4]; | |
tmp = array[i+5]; | |
tmp = array[i+6]; | |
tmp = array[i+7]; | |
tmp = array[i+8]; | |
tmp = array[i+9]; | |
tmp = array[i+10]; | |
tmp = array[i+11]; | |
tmp = array[i+12]; | |
tmp = array[i+13]; | |
tmp = array[i+14]; | |
tmp = array[i+15]; | |
tmp = array[i+16]; | |
tmp = array[i+17]; | |
tmp = array[i+18]; | |
tmp = array[i+19]; | |
tmp = array[i+20]; | |
tmp = array[i+21]; | |
tmp = array[i+22]; | |
tmp = array[i+23]; | |
tmp = array[i+24]; | |
tmp = array[i+25]; | |
tmp = array[i+26]; | |
tmp = array[i+27]; | |
tmp = array[i+28]; | |
tmp = array[i+29]; | |
tmp = array[i+30]; | |
tmp = array[i+31]; | |
tmp = array[i+32]; | |
tmp = array[i+33]; | |
tmp = array[i+34]; | |
tmp = array[i+35]; | |
tmp = array[i+36]; | |
tmp = array[i+37]; | |
tmp = array[i+38]; | |
tmp = array[i+39]; | |
} | |
} | |
time_end = glfwGetTime(); | |
performance = ((ARRAY_NUM * sizeof(uint64_t))/1000000) / (time_end - time_start); | |
performance_average += performance; | |
if(performance > performance_max) { performance_max = performance; } | |
if(j == 0) { performance_min = performance; } | |
if(performance < performance_min) { performance_min = performance; } | |
// printf("[%d/%d] Performance stride 40: %.3f MB/s\r", j+1, imax, performance); | |
// fprintf(file_stride, "%d\t%f\n", j, performance); | |
// fflush(file_stride); | |
// fflush(stdout); | |
performance_ring[j] = performance; | |
} | |
performance_average = performance_average / imax; | |
printf("\nAverage: %.3f MB/s\n", performance_average); | |
printf("Performance MIN: %3.f MB/s | Performance MAX: %3.f MB/s\n\n", | |
performance_min, performance_max); | |
for(uint64_t i = 0; i < imax; ++i) { | |
fprintf(file_stride, "%lld\t%f\n", i, performance_ring[i]); | |
fflush(file_stride); | |
fflush(stdout); | |
} | |
/* ======================== */ | |
/* SSE reading */ | |
/* ======================== */ | |
printf("=== READING WITH SSE ===\n"); | |
performance_average = 0.0; | |
performance_min = 0.0; | |
performance_max = 0.0; | |
for(int j = 0; j < imax; ++j) { | |
__m128i tmp; | |
performance = 0; | |
time_start = glfwGetTime(); | |
for(uint64_t i = 0; i < ARRAY_NUM; i += 40) { | |
tmp = _mm_load_si128( (__m128i*)&array[i] ); | |
tmp = _mm_load_si128( (__m128i*)&array[i+2] ); | |
tmp = _mm_load_si128( (__m128i*)&array[i+4] ); | |
tmp = _mm_load_si128( (__m128i*)&array[i+6] ); | |
tmp = _mm_load_si128( (__m128i*)&array[i+8] ); | |
tmp = _mm_load_si128( (__m128i*)&array[i+10] ); | |
tmp = _mm_load_si128( (__m128i*)&array[i+12] ); | |
tmp = _mm_load_si128( (__m128i*)&array[i+14] ); | |
tmp = _mm_load_si128( (__m128i*)&array[i+16] ); | |
tmp = _mm_load_si128( (__m128i*)&array[i+18] ); | |
tmp = _mm_load_si128( (__m128i*)&array[i+20] ); | |
tmp = _mm_load_si128( (__m128i*)&array[i+22] ); | |
tmp = _mm_load_si128( (__m128i*)&array[i+24] ); | |
tmp = _mm_load_si128( (__m128i*)&array[i+26] ); | |
tmp = _mm_load_si128( (__m128i*)&array[i+28] ); | |
tmp = _mm_load_si128( (__m128i*)&array[i+30] ); | |
tmp = _mm_load_si128( (__m128i*)&array[i+32] ); | |
tmp = _mm_load_si128( (__m128i*)&array[i+34] ); | |
tmp = _mm_load_si128( (__m128i*)&array[i+36] ); | |
tmp = _mm_load_si128( (__m128i*)&array[i+38] ); | |
// tmp = _mm_load_si128( (__m128i*)&array[i+40] ); | |
// tmp = _mm_load_si128( (__m128i*)&array[i+42] ); | |
// tmp = _mm_load_si128( (__m128i*)&array[i+44] ); | |
// tmp = _mm_load_si128( (__m128i*)&array[i+46] ); | |
// tmp = _mm_load_si128( (__m128i*)&array[i+48] ); | |
// tmp = _mm_load_si128( (__m128i*)&array[i+50] ); | |
// tmp = _mm_load_si128( (__m128i*)&array[i+52] ); | |
// tmp = _mm_load_si128( (__m128i*)&array[i+54] ); | |
// tmp = _mm_load_si128( (__m128i*)&array[i+56] ); | |
// tmp = _mm_load_si128( (__m128i*)&array[i+58] ); | |
// tmp = _mm_load_si128( (__m128i*)&array[i+60] ); | |
// tmp = _mm_load_si128( (__m128i*)&array[i+62] ); | |
} | |
time_end = glfwGetTime(); | |
performance = ((ARRAY_NUM * sizeof(uint64_t))/1000000) / (time_end - time_start); | |
performance_average += performance; | |
if(performance > performance_max) { performance_max = performance; } | |
if(j == 0) { performance_min = performance; } | |
if(performance < performance_min) { performance_min = performance; } | |
// printf("[%d/%d] Performance SSE: %.3f MB/s\r", j+1, imax, performance); | |
// fprintf(file_sse, "%d\t%f\n", j, performance); | |
// fflush(file_sse); | |
// fflush(stdout); | |
performance_ring[j] = performance; | |
} | |
performance_average = performance_average / imax; | |
printf("\nAverage: %.3f MB/s\n", performance_average); | |
printf("Performance MIN: %3.f MB/s | Performance MAX: %3.f MB/s\n\n", | |
performance_min, performance_max); | |
for(uint64_t i = 0; i < imax; ++i) { | |
fprintf(file_sse, "%lld\t%f\n", i, performance_ring[i]); | |
fflush(file_sse); | |
fflush(stdout); | |
} | |
/* ======================== */ | |
/* AVX reading */ | |
/* ======================== */ | |
printf("=== READING WITH AVX ===\n"); | |
performance_average = 0.0; | |
performance_min = 0.0; | |
performance_max = 0.0; | |
for(int j = 0; j < imax; ++j) { | |
__m256i tmp; | |
performance = 0; | |
time_start = glfwGetTime(); | |
for(uint64_t i = 0; i < ARRAY_NUM; i += 40) { | |
tmp = _mm256_load_si256( (__m256i*)&array[i] ); | |
tmp = _mm256_load_si256( (__m256i*)&array[i+4] ); | |
tmp = _mm256_load_si256( (__m256i*)&array[i+8] ); | |
tmp = _mm256_load_si256( (__m256i*)&array[i+12] ); | |
tmp = _mm256_load_si256( (__m256i*)&array[i+16] ); | |
tmp = _mm256_load_si256( (__m256i*)&array[i+20] ); | |
tmp = _mm256_load_si256( (__m256i*)&array[i+24] ); | |
tmp = _mm256_load_si256( (__m256i*)&array[i+28] ); | |
tmp = _mm256_load_si256( (__m256i*)&array[i+32] ); | |
tmp = _mm256_load_si256( (__m256i*)&array[i+36] ); | |
} | |
time_end = glfwGetTime(); | |
performance = ((ARRAY_NUM * sizeof(uint64_t))/1000000) / (time_end - time_start); | |
performance_average += performance; | |
if(performance > performance_max) { performance_max = performance; } | |
if(j == 0) { performance_min = performance; } | |
if(performance < performance_min) { performance_min = performance; } | |
// printf("[%d/%d] Performance AVX: %.3f MB/s\r", j+1, imax, performance); | |
// fprintf(file_avx, "%d\t%f\n", j, performance); | |
// fflush(file_avx); | |
// fflush(stdout); | |
performance_ring[j] = performance; | |
} | |
performance_average = performance_average / imax; | |
printf("\nAverage: %.3f MB/s\n", performance_average); | |
printf("Performance MIN: %3.f MB/s | Performance MAX: %3.f MB/s\n\n", | |
performance_min, performance_max); | |
for(uint64_t i = 0; i < imax; ++i) { | |
fprintf(file_avx, "%lld\t%f\n", i, performance_ring[i]); | |
fflush(file_avx); | |
fflush(stdout); | |
} | |
/* ======================== */ | |
/* Linear reading */ | |
/* ======================== */ | |
printf("=== LINEAR READING ===\n"); | |
performance_average = 0.0; | |
performance_min = 0.0; | |
performance_max = 0.0; | |
for(int j = 0; j < imax; ++j) { | |
uint64_t tmp = 0; | |
performance = 0; | |
time_start = glfwGetTime(); | |
for(uint64_t i = 0; i < ARRAY_NUM; ++i) { | |
tmp = array[i]; | |
} | |
time_end = glfwGetTime(); | |
performance = ((ARRAY_NUM * sizeof(uint64_t))/1000000) / (time_end - time_start); | |
performance_average += performance; | |
if(performance > performance_max) { performance_max = performance; } | |
if(j == 0) { performance_min = performance; } | |
if(performance < performance_min) { performance_min = performance; } | |
// printf("[%d/%d] Performance dumb: %.3f MB/s\r", j+1, imax, performance); | |
// fprintf(file_dumb, "%d\t%f\n", j, performance); | |
// fflush(file_dumb); | |
// fflush(stdout); | |
performance_ring[j] = performance; | |
} | |
performance_average = performance_average / imax; | |
printf("\nAverage: %.3f MB/s\n", performance_average); | |
printf("Performance MIN: %3.f MB/s | Performance MAX: %3.f MB/s\n\n", | |
performance_min, performance_max); | |
for(uint64_t i = 0; i < imax; ++i) { | |
fprintf(file_dumb, "%lld\t%f\n", i, performance_ring[i]); | |
fflush(file_dumb); | |
fflush(stdout); | |
} | |
/* ======================== */ | |
/* Memcpy */ | |
/* ======================== */ | |
printf("=== MEMCPY ===\n"); | |
performance = 0; | |
time_start = glfwGetTime(); | |
memcpy(array_copy, array, ARRAY_NUM*sizeof(uint64_t)); | |
time_end = glfwGetTime(); | |
performance = ((ARRAY_NUM * sizeof(uint64_t))/1000000) / (time_end - time_start); | |
printf("Copying (memcpy) done in %.3f s\n", (time_end - time_start)); | |
printf("Performance: %.3f MB/s\n", performance); | |
/* ======================== */ | |
/* Cleanup and exit */ | |
/* ======================== */ | |
// free(array); | |
// free(array_copy); | |
printf("\n=== CLEANUP ===\n"); | |
_mm_free(array); | |
_mm_free(array_copy); | |
glfwTerminate(); | |
fclose(file_avx); | |
fclose(file_sse); | |
fclose(file_dumb); | |
fclose(file_stride); | |
exit(EXIT_SUCCESS); | |
} | |
/* gnuplot commands for plotting the generated .dat files:

set ylabel 'MB/s'
set xlabel 'iterations'

8 MB and 16 MB arrays:
plot [:1000] [:12000] '8_stride.dat' title '8 MB stride' with lines, '8_dumb.dat' title '8 MB dumb' with lines, '8_sse.dat' title '8 MB SSE' with lines, '8_avx.dat' title '8 MB AVX' with lines, '16_stride.dat' title '16 MB stride' with lines, '16_dumb.dat' title '16 MB dumb' with lines, '16_sse.dat' title '16 MB SSE' with lines, '16_avx.dat' title '16 MB AVX' with lines

32 MB and 64 MB arrays:
plot [:1000] [:12000] '32_stride.dat' title '32 MB stride' with lines, '32_dumb.dat' title '32 MB dumb' with lines, '32_sse.dat' title '32 MB SSE' with lines, '32_avx.dat' title '32 MB AVX' with lines, '64_stride.dat' title '64 MB stride' with lines, '64_dumb.dat' title '64 MB dumb' with lines, '64_sse.dat' title '64 MB SSE' with lines, '64_avx.dat' title '64 MB AVX' with lines

128 MB and 256 MB arrays:
plot [:1000] [:12000] '128_stride.dat' title '128 MB stride' with lines, '128_dumb.dat' title '128 MB dumb' with lines, '128_sse.dat' title '128 MB SSE' with lines, '128_avx.dat' title '128 MB AVX' with lines, '256_stride.dat' title '256 MB stride' with lines, '256_dumb.dat' title '256 MB dumb' with lines, '256_sse.dat' title '256 MB SSE' with lines, '256_avx.dat' title '256 MB AVX' with lines

512 MB and 1024 MB arrays:
plot [:1000] [:12000] '512_stride.dat' title '512 MB stride' with lines, '512_dumb.dat' title '512 MB dumb' with lines, '512_sse.dat' title '512 MB SSE' with lines, '512_avx.dat' title '512 MB AVX' with lines, '1024_stride.dat' title '1024 MB stride' with lines, '1024_dumb.dat' title '1024 MB dumb' with lines, '1024_sse.dat' title '1024 MB SSE' with lines, '1024_avx.dat' title '1024 MB AVX' with lines

All sizes together:
unset key
plot [:1000] [:12000] '8_stride.dat' title '8 MB stride' with lines, '8_dumb.dat' title '8 MB dumb' with lines, '8_sse.dat' title '8 MB SSE' with lines, '8_avx.dat' title '8 MB AVX' with lines, '16_stride.dat' title '16 MB stride' with lines, '16_dumb.dat' title '16 MB dumb' with lines, '16_sse.dat' title '16 MB SSE' with lines, '16_avx.dat' title '16 MB AVX' with lines, '32_stride.dat' title '32 MB stride' with lines, '32_dumb.dat' title '32 MB dumb' with lines, '32_sse.dat' title '32 MB SSE' with lines, '32_avx.dat' title '32 MB AVX' with lines, '64_stride.dat' title '64 MB stride' with lines, '64_dumb.dat' title '64 MB dumb' with lines, '64_sse.dat' title '64 MB SSE' with lines, '64_avx.dat' title '64 MB AVX' with lines, '128_stride.dat' title '128 MB stride' with lines, '128_dumb.dat' title '128 MB dumb' with lines, '128_sse.dat' title '128 MB SSE' with lines, '128_avx.dat' title '128 MB AVX' with lines, '256_stride.dat' title '256 MB stride' with lines, '256_dumb.dat' title '256 MB dumb' with lines, '256_sse.dat' title '256 MB SSE' with lines, '256_avx.dat' title '256 MB AVX' with lines, '512_stride.dat' title '512 MB stride' with lines, '512_dumb.dat' title '512 MB dumb' with lines, '512_sse.dat' title '512 MB SSE' with lines, '512_avx.dat' title '512 MB AVX' with lines, '1024_stride.dat' title '1024 MB stride' with lines, '1024_dumb.dat' title '1024 MB dumb' with lines, '1024_sse.dat' title '1024 MB SSE' with lines, '1024_avx.dat' title '1024 MB AVX' with lines
*/
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment