Created
September 15, 2013 14:46
-
-
Save aktau/6571379 to your computer and use it in GitHub Desktop.
Benchmark of writing to memory in C (clang and gcc), SSE stream vs SSE store vs naive...
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* glibc hides clock_gettime()/CLOCK_MONOTONIC under -std=c11 unless POSIX is
 * requested; Darwin is left alone so memset_pattern4() stays visible. */
#if !defined(__MACH__) && !defined(_POSIX_C_SOURCE)
#define _POSIX_C_SOURCE 200809L
#endif
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
/** | |
* compile and run: | |
* | |
* OSX: | |
* clang stream.c -o stream -std=c11 -O3 -g -ftree-vectorize -fslp-vectorize -march=native && ./stream | |
* gcc-4.8 stream.c -o stream -std=c11 -O3 -g3 -ftree-vectorize -march=native -minline-all-stringops && ./stream | |
* | |
* linux: | |
* clang stream.c -o stream -lrt -std=c11 -O3 -ftree-vectorize -fslp-vectorize -march=native && ./stream | |
* gcc-4.8 stream.c -o stream -lrt -std=c11 -O3 -ftree-vectorize -march=native && ./stream | |
* | |
* to generate the assembly: | |
* gcc-4.8 -S stream.c -o stream.s -std=c11 -O3 -g3 -ftree-vectorize -march=native -minline-all-stringops -fverbose-asm -masm=intel | |
* gobjdump -dS stream > stream.obj.s | |
* | |
* clang is the (very clear) winner here, the SLP vectorizer is absolutely killer, it even turns the | |
* plain naive loop into something hyper-performant | |
*/ | |
/* posix headers */ | |
#include <sys/time.h> | |
/* intrinsics */ | |
#include <x86intrin.h> | |
#define ARRAY_SIZE(x) ((sizeof(x)/sizeof(0[x])) / ((size_t)(!(sizeof(x) % sizeof(0[x]))))) | |
/** | |
* some stats from my system | |
* | |
* sudo sysctl -a | grep cache | |
* | |
* hw.cachelinesize = 64 | |
* hw.l1icachesize = 32768 | |
* hw.l1dcachesize = 32768 | |
* hw.l2cachesize = 262144 | |
* hw.l3cachesize = 6291456 | |
*/ | |
/* most processors these days (2013) have a 64 byte cache line */ | |
#define FACTOR 1024 | |
#define CACHE_LINE 64 | |
#define FLOATS_PER_LINE (CACHE_LINE / sizeof(float)) | |
#define L1_CACHE_BYTES 32768 | |
#define L2_CACHE_BYTES 262144 | |
#define L3_CACHE_BYTES 6291456 | |
#ifdef __MACH__ | |
#include <mach/mach_time.h> | |
double ns_conversion_factor; | |
double us_conversion_factor; | |
double ms_conversion_factor; | |
void timeinit() { | |
mach_timebase_info_data_t timebase; | |
mach_timebase_info(&timebase); | |
ns_conversion_factor = (double)timebase.numer / (double)timebase.denom; | |
us_conversion_factor = (double)timebase.numer / (double)timebase.denom / 1000; | |
ms_conversion_factor = (double)timebase.numer / (double)timebase.denom / 1000000; | |
} | |
double nsticks() { | |
return mach_absolute_time() * ns_conversion_factor; | |
} | |
double msticks() { | |
return mach_absolute_time() * ms_conversion_factor; | |
} | |
#else | |
/**
 * Timer setup hook for the non-mach build: there is nothing to calibrate, the
 * function only exists so main() can call timeinit() unconditionally.
 */
void timeinit() {
    return;
}
/**
 * Reads CLOCK_MONOTONIC and returns it as a nanosecond count.
 *
 * @return nanoseconds on the monotonic clock, or 0.0 if the clock cannot be
 *         read
 */
double nsticks() {
    struct timespec ts = {0};
    if (clock_gettime(CLOCK_MONOTONIC, &ts) != 0) {
        return 0.0;
    }
    /* seconds are scaled UP to nanoseconds; tv_nsec already is nanoseconds */
    return (double)ts.tv_sec * 1e9 + (double)ts.tv_nsec;
}
/**
 * Reads CLOCK_MONOTONIC and returns it as a (fractional) millisecond count.
 *
 * @return milliseconds on the monotonic clock, or 0.0 if the clock cannot be
 *         read
 */
double msticks() {
    struct timespec ts = {0};
    if (clock_gettime(CLOCK_MONOTONIC, &ts) != 0) {
        return 0.0;
    }
    /* 1 s = 1e3 ms, 1 ns = 1e-6 ms */
    return (double)ts.tv_sec * 1e3 + (double)ts.tv_nsec / 1e6;
}
#endif | |
/**
 * Allocates `size` bytes at an address that is a multiple of `alignment`.
 *
 * The pointer obtained from malloc() is stored in the pointer-sized slot just
 * below the returned address; aligned_free() reads it back from there.
 *
 * @param size       number of usable bytes requested
 * @param alignment  required alignment, a non-zero power of two
 * @return the aligned pointer, or NULL if `alignment` is invalid, the padded
 *         request would overflow size_t, or malloc() fails
 */
void *aligned_malloc(size_t size, size_t alignment) {
    /* the mask arithmetic below is only meaningful for powers of two */
    if (alignment == 0 || (alignment & (alignment - 1)) != 0) {
        return NULL;
    }

    /* worst case: alignment-1 bytes of slack plus room for the stashed pointer */
    const size_t overhead = (alignment - 1) + sizeof(void *);
    if (size > SIZE_MAX - overhead) {
        return NULL;
    }

    void *raw = malloc(size + overhead);
    if (!raw) {
        return NULL;
    }

    /* round up past the stash slot using unsigned pointer arithmetic */
    uintptr_t addr = (uintptr_t)raw + sizeof(void *) + (alignment - 1);
    addr &= ~(uintptr_t)(alignment - 1);

    void *ptr = (void *)addr;
    ((void **)ptr)[-1] = raw;
    return ptr;
}
/**
 * Releases a block obtained from aligned_malloc(). NULL is ignored.
 *
 * @param ptr  aligned pointer whose preceding slot holds the pointer to free()
 */
void aligned_free(void *ptr) {
    if (ptr == NULL) {
        return;
    }
    void **slot = (void **)ptr;
    free(slot[-1]);
}
/**
 * Touches every byte of `arr` with a read followed by a write. The pointer is
 * volatile so the compiler has to perform each access.
 *
 * Each byte becomes 0xAA if it was 0xFF, otherwise 0x55.
 *
 * @param arr     buffer to walk
 * @param length  number of bytes in `arr`
 */
void pollute_cache(uint8_t volatile *arr, size_t length) {
    /* size_t index: matches `length` and cannot overflow like an int would */
    for (size_t i = 0; i < length; ++i) {
        arr[i] = (arr[i] > 0xFE) ? 0xAA : 0x55;
    }
}
void pollute_cache_standalone() { | |
const size_t pollute_len = 2 * L3_CACHE_BYTES; | |
uint8_t *arr = aligned_malloc(pollute_len * sizeof(uint8_t), 64); | |
for (int i = 0; i < pollute_len; ++i) { | |
arr[i] = (arr[i] > 0xFE) ? 0xAA : 0x55; | |
} | |
aligned_free(arr); | |
} | |
/**
 * Times one call of `func(arr, length)` and optionally reports it on stderr.
 *
 * @param name      label printed next to the measurement
 * @param baseline  reference time in ms, or one of the sentinels:
 *                    -2.0  measure silently and just return the time
 *                    -1.0  use this run as its own baseline (prints 100%)
 *                     0.0  print the absolute time only
 * @param pre       optional hook run before the clock starts (may be NULL)
 * @param func      kernel under test
 * @param arr       buffer handed to `func`
 * @param length    element count handed to `func`
 * @return the time passed, in milliseconds
 */
double tim(const char *name, double baseline, void (*pre)(void), void (*func)(float *, size_t), float * restrict arr, size_t length) __attribute__ ((noinline));
double tim(const char *name, double baseline, void (*pre)(void), void (*func)(float *, size_t), float * restrict arr, size_t length) {
    if (pre) {
        pre();
    }

    const double ms1 = msticks();
    func(arr, length);
    const double ms2 = msticks();
    const double ms = (ms2 - ms1);

    if (baseline == -2.0) {
        return ms;
    }

    /* first run, equal to baseline (itself) by definition */
    if (baseline == -1.0) {
        baseline = ms;
    }

    if (baseline != 0.0) {
        fprintf(stderr, "%7.0f%% (%10.5f ms) : %s\n", (ms / baseline) * 100, ms, name);
    }
    else {
        fprintf(stderr, "%7.3f ms : %s\n", ms, name);
    }
    return ms;
}
/**
 * Fills the array with memset(). Every BYTE becomes 0x05, so the floats do
 * not end up as 5.0f like in the other kernels (bench labels it a "cheat").
 *
 * @param arr     destination array
 * @param length  number of floats in `arr`
 */
void func0(float * const restrict arr, size_t length) __attribute__ ((noinline));
void func0(float * const restrict arr, size_t length) {
    const size_t nbytes = length * sizeof *arr;
    memset(arr, 0x05, nbytes);
}
#ifdef __MACH__ | |
/**
 * Fills the array with 5.0f by handing the 4-byte float pattern to
 * memset_pattern4().
 *
 * @param arr     destination array
 * @param length  number of floats in `arr`
 */
void funcB(float * const restrict arr, size_t length) __attribute__ ((noinline));
void funcB(float * const restrict arr, size_t length) {
    const float pattern = 5.0f;
    const size_t nbytes = length * sizeof(float);
    memset_pattern4(arr, &pattern, nbytes);
}
#endif | |
/**
 * Naive kernel: stores 5.0f into every element with a plain scalar loop.
 *
 * @param arr     destination array
 * @param length  number of floats in `arr`
 */
void func1(float * const restrict arr, size_t length) __attribute__ ((noinline));
void func1(float * const restrict arr, size_t length) {
    /* size_t index: same type as `length`, no signed overflow on huge arrays */
    for (size_t i = 0; i < length; ++i) {
        arr[i] = 5.0f;
    }
}
/**
 * Scalar kernel, manually unrolled four times: stores 5.0f into every element.
 *
 * Lengths that are not a multiple of 4 are finished one element at a time, so
 * nothing is written past `arr[length - 1]`.
 *
 * @param arr     destination array
 * @param length  number of floats in `arr`
 */
void func2(float * const restrict arr, size_t length) __attribute__ ((noinline));
void func2(float * const restrict arr, size_t length) {
    const size_t unrolled = length - (length % 4);
    size_t i = 0;
    for (; i < unrolled; i += 4) {
        arr[i] = 5.0f;
        arr[i+1] = 5.0f;
        arr[i+2] = 5.0f;
        arr[i+3] = 5.0f;
    }
    /* leftover 0..3 elements */
    for (; i < length; ++i) {
        arr[i] = 5.0f;
    }
}
/**
 * SSE streaming kernel: fills the array with 5.0f using one non-temporal
 * 128-bit store per iteration, then fences.
 *
 * `arr` must be 16-byte aligned (required by _mm_stream_ps). Lengths that are
 * not a multiple of 4 are finished with scalar stores.
 *
 * @param arr     destination array, 16-byte aligned
 * @param length  number of floats in `arr`
 */
void func3(float * const restrict arr, size_t length) __attribute__ ((noinline));
void func3(float * const restrict arr, size_t length) {
    const __m128 buf = _mm_set1_ps(5.0f);
    const size_t vec_end = length - (length % 4);
    size_t i = 0;
    for (; i < vec_end; i += 4) {
        _mm_stream_ps(&arr[i], buf);
    }
    /* leftover 0..3 elements */
    for (; i < length; ++i) {
        arr[i] = 5.0f;
    }
    /* make the streaming stores complete before the caller stops its clock */
    _mm_mfence();
}
/**
 * SSE streaming kernel, unrolled to 16 floats (four non-temporal 128-bit
 * stores) per iteration, followed by a fence.
 *
 * `arr` must be 16-byte aligned (required by _mm_stream_ps). Lengths that are
 * not a multiple of 16 are finished with scalar stores.
 *
 * @param arr     destination array, 16-byte aligned
 * @param length  number of floats in `arr`
 */
void func4(float * const restrict arr, size_t length) __attribute__ ((noinline));
void func4(float * const restrict arr, size_t length) {
    const __m128 buf = _mm_set1_ps(5.0f);
    const size_t unrolled = length - (length % 16);
    size_t i = 0;
    for (; i < unrolled; i += 16) {
        _mm_stream_ps(&arr[i + 0], buf);
        _mm_stream_ps(&arr[i + 4], buf);
        _mm_stream_ps(&arr[i + 8], buf);
        _mm_stream_ps(&arr[i + 12], buf);
    }
    /* leftover 0..15 elements */
    for (; i < length; ++i) {
        arr[i] = 5.0f;
    }
    /* make the streaming stores complete before the caller stops its clock */
    _mm_mfence();
}
/**
 * SSE kernel: fills the array with 5.0f using one aligned 128-bit store per
 * iteration.
 *
 * `arr` must be 16-byte aligned (required by _mm_store_ps). Lengths that are
 * not a multiple of 4 are finished with scalar stores.
 *
 * @param arr     destination array, 16-byte aligned
 * @param length  number of floats in `arr`
 */
void func5(float * const restrict arr, size_t length) __attribute__ ((noinline));
void func5(float * const restrict arr, size_t length) {
    const __m128 buf = _mm_set1_ps(5.0f);
    const size_t vec_end = length - (length % 4);
    size_t i = 0;
    for (; i < vec_end; i += 4) {
        _mm_store_ps(&arr[i], buf);
    }
    /* leftover 0..3 elements */
    for (; i < length; ++i) {
        arr[i] = 5.0f;
    }
}
void fstore_prefetch(float * const restrict arr, size_t length) __attribute__ ((noinline)); | |
void fstore_prefetch(float * const restrict arr, size_t length) { | |
const __m128 buf = _mm_setr_ps(5.0f, 5.0f, 5.0f, 5.0f); | |
for (int i = 0; i < length; i += 16) { | |
__builtin_prefetch(&arr[i + FLOATS_PER_LINE * 32], 1, 0); | |
_mm_store_ps(&arr[i + 0], buf); | |
_mm_store_ps(&arr[i + 4], buf); | |
_mm_store_ps(&arr[i + 8], buf); | |
_mm_store_ps(&arr[i + 12], buf); | |
} | |
} | |
/**
 * SSE kernel, unrolled to 16 floats (four aligned 128-bit stores) per
 * iteration.
 *
 * `arr` must be 16-byte aligned (required by _mm_store_ps). Lengths that are
 * not a multiple of 16 are finished with scalar stores.
 *
 * @param arr     destination array, 16-byte aligned
 * @param length  number of floats in `arr`
 */
void func6(float * const restrict arr, size_t length) __attribute__ ((noinline));
void func6(float * const restrict arr, size_t length) {
    const __m128 buf = _mm_set1_ps(5.0f);
    const size_t unrolled = length - (length % 16);
    size_t i = 0;
    for (; i < unrolled; i += 16) {
        _mm_store_ps(&arr[i + 0], buf);
        _mm_store_ps(&arr[i + 4], buf);
        _mm_store_ps(&arr[i + 8], buf);
        _mm_store_ps(&arr[i + 12], buf);
    }
    /* leftover 0..15 elements */
    for (; i < length; ++i) {
        arr[i] = 5.0f;
    }
}
/**
 * SSE kernel that fills the array with 5.0f through the INTEGER store
 * instruction: the float's bit pattern is broadcast into a __m128i and written
 * with _mm_store_si128, four vectors (16 floats) per iteration.
 *
 * `_arr` must be 16-byte aligned (required by _mm_store_si128). Whatever does
 * not fit the 4-vector unrolling is finished with single vector stores and
 * then scalar stores.
 *
 * @param _arr    destination array, 16-byte aligned
 * @param length  number of floats in `_arr`
 */
void fstore_int(float * const restrict _arr, size_t length) __attribute__ ((noinline));
void fstore_int(float * const restrict _arr, size_t length) {
    _Static_assert(sizeof(float) == sizeof(int), "float bits must fit an int");

    /* copy the bits instead of casting pointers: no strict-aliasing violation */
    const float origf = 5.0f;
    int raw;
    memcpy(&raw, &origf, sizeof raw);

    const size_t floats_per_vec = sizeof(__m128i) / sizeof(float);
    const size_t veclength = length / floats_per_vec;
    const size_t unrolled = veclength - (veclength % 4);
    const __m128i buf = _mm_set1_epi32(raw);
    __m128i * const arr = (__m128i *) _arr;

    size_t v = 0;
    for (; v < unrolled; v += 4) {
        _mm_store_si128(&arr[v + 0], buf);
        _mm_store_si128(&arr[v + 1], buf);
        _mm_store_si128(&arr[v + 2], buf);
        _mm_store_si128(&arr[v + 3], buf);
    }
    /* leftover 0..3 whole vectors */
    for (; v < veclength; ++v) {
        _mm_store_si128(&arr[v], buf);
    }
    /* leftover 0..3 floats that do not fill a vector */
    for (size_t i = veclength * floats_per_vec; i < length; ++i) {
        _arr[i] = origf;
    }
}
/**
 * Placeholder kernel: the C body is the same integer-store fill as
 * fstore_int(); per the note below, the generated instructions are meant to
 * be replaced by hand in the assembly output.
 *
 * `_arr` must be 16-byte aligned (required by _mm_store_si128). Whatever does
 * not fit the 4-vector unrolling is finished with single vector stores and
 * then scalar stores.
 *
 * @param _arr    destination array, 16-byte aligned
 * @param length  number of floats in `_arr`
 */
void fmemset_emulate(float * const restrict _arr, size_t length) __attribute__ ((noinline));
void fmemset_emulate(float * const restrict _arr, size_t length) {
    /* these are all dummy instructions, to be edited in the assembly pass */
    _Static_assert(sizeof(float) == sizeof(int), "float bits must fit an int");

    /* copy the bits instead of casting pointers: no strict-aliasing violation */
    const float origf = 5.0f;
    int raw;
    memcpy(&raw, &origf, sizeof raw);

    const size_t floats_per_vec = sizeof(__m128i) / sizeof(float);
    const size_t veclength = length / floats_per_vec;
    const size_t unrolled = veclength - (veclength % 4);
    const __m128i buf = _mm_set1_epi32(raw);
    __m128i * const arr = (__m128i *) _arr;

    size_t v = 0;
    for (; v < unrolled; v += 4) {
        _mm_store_si128(&arr[v + 0], buf);
        _mm_store_si128(&arr[v + 1], buf);
        _mm_store_si128(&arr[v + 2], buf);
        _mm_store_si128(&arr[v + 3], buf);
    }
    /* leftover 0..3 whole vectors */
    for (; v < veclength; ++v) {
        _mm_store_si128(&arr[v], buf);
    }
    /* leftover 0..3 floats that do not fill a vector */
    for (size_t i = veclength * floats_per_vec; i < length; ++i) {
        _arr[i] = origf;
    }
}
#ifdef __AVX__ | |
/**
 * AVX streaming kernel: fills the array with 5.0f using one non-temporal
 * 256-bit store per iteration, then fences (as the SSE streaming kernels do).
 *
 * `arr` must be 32-byte aligned (required by _mm256_stream_ps). Lengths that
 * are not a multiple of 8 are finished with scalar stores.
 *
 * @param arr     destination array, 32-byte aligned
 * @param length  number of floats in `arr`
 */
void func7(float * const restrict arr, size_t length) __attribute__ ((noinline));
void func7(float * const restrict arr, size_t length) {
    const __m256 buf = _mm256_set1_ps(5.0f);
    const size_t vec_end = length - (length % 8);
    size_t i = 0;
    for (; i < vec_end; i += 8) {
        _mm256_stream_ps(&arr[i], buf);
    }
    /* leftover 0..7 elements */
    for (; i < length; ++i) {
        arr[i] = 5.0f;
    }
    /* make the streaming stores complete before the caller stops its clock */
    _mm_mfence();
}
/**
 * AVX streaming kernel, unrolled to 32 floats (four non-temporal 256-bit
 * stores) per iteration, followed by a fence (as the SSE streaming kernels
 * do).
 *
 * `arr` must be 32-byte aligned (required by _mm256_stream_ps). Lengths that
 * are not a multiple of 32 are finished with scalar stores.
 *
 * @param arr     destination array, 32-byte aligned
 * @param length  number of floats in `arr`
 */
void func8(float * const restrict arr, size_t length) __attribute__ ((noinline));
void func8(float * const restrict arr, size_t length) {
    const __m256 buf = _mm256_set1_ps(5.0f);
    const size_t unrolled = length - (length % 32);
    size_t i = 0;
    for (; i < unrolled; i += 32) {
        _mm256_stream_ps(&arr[i + 0], buf);
        _mm256_stream_ps(&arr[i + 8], buf);
        _mm256_stream_ps(&arr[i + 16], buf);
        _mm256_stream_ps(&arr[i + 24], buf);
    }
    /* leftover 0..31 elements */
    for (; i < length; ++i) {
        arr[i] = 5.0f;
    }
    /* make the streaming stores complete before the caller stops its clock */
    _mm_mfence();
}
/**
 * AVX kernel: fills the array with 5.0f using one aligned 256-bit store per
 * iteration.
 *
 * `arr` must be 32-byte aligned (required by _mm256_store_ps). Lengths that
 * are not a multiple of 8 are finished with scalar stores.
 *
 * @param arr     destination array, 32-byte aligned
 * @param length  number of floats in `arr`
 */
void func9(float * const restrict arr, size_t length) __attribute__ ((noinline));
void func9(float * const restrict arr, size_t length) {
    const __m256 buf = _mm256_set1_ps(5.0f);
    const size_t vec_end = length - (length % 8);
    size_t i = 0;
    for (; i < vec_end; i += 8) {
        _mm256_store_ps(&arr[i], buf);
    }
    /* leftover 0..7 elements */
    for (; i < length; ++i) {
        arr[i] = 5.0f;
    }
}
/**
 * AVX kernel, unrolled to 32 floats (four aligned 256-bit stores) per
 * iteration.
 *
 * `arr` must be 32-byte aligned (required by _mm256_store_ps). Lengths that
 * are not a multiple of 32 are finished with scalar stores.
 *
 * @param arr     destination array, 32-byte aligned
 * @param length  number of floats in `arr`
 */
void funcA(float * const restrict arr, size_t length) __attribute__ ((noinline));
void funcA(float * const restrict arr, size_t length) {
    const __m256 buf = _mm256_set1_ps(5.0f);
    const size_t unrolled = length - (length % 32);
    size_t i = 0;
    for (; i < unrolled; i += 32) {
        _mm256_store_ps(&arr[i + 0], buf);
        _mm256_store_ps(&arr[i + 8], buf);
        _mm256_store_ps(&arr[i + 16], buf);
        _mm256_store_ps(&arr[i + 24], buf);
    }
    /* leftover 0..31 elements */
    for (; i < length; ++i) {
        arr[i] = 5.0f;
    }
}
#endif | |
void bench(const char * restrict name, float * restrict arr, size_t length) { | |
fprintf(stderr, "bench %s, array %zu bytes (%zu floats, %zu remainder, %p pointer)\n", name, length, length / sizeof(float), length % sizeof(float), arr); | |
size_t nfloats = length / sizeof(float); | |
fprintf(stderr, "warm up round..."); | |
func1(arr, nfloats); | |
fprintf(stderr, "done\n"); | |
double baseline = tim("func1: NAIVE ", -2.0, NULL, func1, arr, nfloats); | |
tim("MEMSET CHEAT ", baseline, NULL, func0, arr, nfloats); | |
#ifdef __MACH__ | |
tim("MEMSET PATTER", baseline, NULL, funcB, arr, nfloats); | |
#endif | |
tim("NAIVE NORMAL", -1.0, NULL, func1, arr, nfloats); | |
tim("NAIVE UNROLL", baseline, NULL, func2, arr, nfloats); | |
tim("STREAM NORMAL", baseline, NULL, func3, arr, nfloats); | |
tim("STREAM UNROLL", baseline, NULL, func4, arr, nfloats); | |
tim("STORE NORMAL", baseline, NULL, func5, arr, nfloats); | |
tim("STORE UNROLL", baseline, NULL, func6, arr, nfloats); | |
tim("STORE INT ", baseline, NULL, fstore_int, arr, nfloats); | |
tim("MEMSET EMULAT", baseline, NULL, fmemset_emulate, arr, nfloats); | |
tim("STORE PREFET", baseline, NULL, fstore_prefetch, arr, nfloats); | |
// tim("STORE FLOAT ", baseline, NULL, fstore_float, arr, nfloats); | |
// for (int i = 0; i < 1; ++i) { | |
// tim("func0: MEMSET (cache polluted)", NULL, func0, arr, nfloats); | |
// tim("func1: NAIVE (cache polluted)", pollute_cache_standalone, func1, arr, nfloats); | |
// tim("func2: UNROLL (cache polluted)", pollute_cache_standalone, func2, arr, nfloats); | |
// tim("func3: STREAM (cache polluted)", pollute_cache_standalone, func3, arr, nfloats); | |
// tim("func4: STRUN (cache polluted)", pollute_cache_standalone, func4, arr, nfloats); | |
// tim("func5: STORE (cache polluted)", pollute_cache_standalone, func5, arr, nfloats); | |
// tim("func6: STOUN (cache polluted)", pollute_cache_standalone, func6, arr, nfloats); | |
// } | |
} | |
int main() { | |
timeinit(); | |
static const struct { | |
const char *name; | |
size_t bytes; | |
} sizes[] = { | |
{ "L1-HALF", L1_CACHE_BYTES / 2 }, | |
{ "L1-FULL", L1_CACHE_BYTES }, | |
{ "L2-HALF", L2_CACHE_BYTES / 2 }, | |
{ "L2-FULL", L2_CACHE_BYTES }, | |
{ "L3-HALF", L3_CACHE_BYTES / 2 }, | |
{ "L3-FULL", L3_CACHE_BYTES }, | |
{ "L3-DOUB", L3_CACHE_BYTES * 2 }, | |
{ "L3-HUGE", L3_CACHE_BYTES * 64 }, | |
{ "L3-MASS", L3_CACHE_BYTES * 256 } | |
}; | |
for (int i = 0; i < ARRAY_SIZE(sizes); ++i) { | |
size_t bytes = sizes[i].bytes; | |
/* align to cache line */ | |
float *arr = aligned_malloc(bytes, CACHE_LINE); | |
bench(sizes[i].name, arr, bytes); | |
aligned_free(arr); | |
} | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment