Skip to content

Instantly share code, notes, and snippets.

@nkurz
Created December 26, 2015 23:32
Show Gist options
  • Save nkurz/9a0ed5a9a6e591019b8e to your computer and use it in GitHub Desktop.
Save nkurz/9a0ed5a9a6e591019b8e to your computer and use it in GitHub Desktop.
Are sustained loads of 64B per cycle possible on Haswell and Skylake?
// gcc -fno-inline -std=gnu99 -Wall -O3 -g -march=native l1d.c -o l1d
#include <sys/types.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <x86intrin.h>
#include <math.h>
/* Number of float elements in each test array. Can be overridden at compile
 * time (e.g. -DSIZE=...). The vector kernels in this file return NAN unless
 * the size is a multiple of 32. */
#ifndef SIZE
#define SIZE 4096
#endif
/* Number of timed repetitions BEST_TIME runs for each test expression; only
 * the fastest repetition is reported. Can be overridden at compile time. */
#ifndef REPEAT
#define REPEAT 100000
#endif
/*
 * RDTSC_START(cycles): read the time-stamp counter at the start of a timed
 * region and store it in the lvalue `cycles` as a 64-bit value.
 *
 * cpuid is executed immediately before rdtsc -- presumably as a serializing
 * barrier so earlier instructions cannot drift into the timed region
 * (NOTE(review): confirm against the Intel SDM / benchmarking white paper).
 * The EDX (high) and EAX (low) halves produced by rdtsc are copied into two
 * compiler-chosen registers and combined as (high << 32) | low.
 * rax, rbx, rcx and rdx are listed as clobbers, so the compiler will not
 * place the outputs or any live values in them across this asm.
 */
#define RDTSC_START(cycles) \
do { \
register unsigned cyc_high, cyc_low; \
__asm volatile("cpuid\n\t" \
"rdtsc\n\t" \
"mov %%edx, %0\n\t" \
"mov %%eax, %1\n\t" \
: "=r" (cyc_high), "=r" (cyc_low) \
:: "%rax", "%rbx", "%rcx", "%rdx"); \
(cycles) = ((uint64_t)cyc_high << 32) | cyc_low; \
} while (0)
/*
 * RDTSC_FINAL(cycles): read the time-stamp counter at the end of a timed
 * region and store it in the lvalue `cycles` as a 64-bit value.
 *
 * Mirror image of RDTSC_START: rdtscp is executed first, its EDX:EAX result
 * is copied out, and only then is cpuid executed -- presumably so that later
 * instructions cannot be hoisted above the counter read (NOTE(review):
 * confirm against the Intel SDM / benchmarking white paper).
 * rax, rbx, rcx and rdx are listed as clobbers, as in RDTSC_START.
 */
#define RDTSC_FINAL(cycles) \
do { \
register unsigned cyc_high, cyc_low; \
__asm volatile("rdtscp\n\t" \
"mov %%edx, %0\n\t" \
"mov %%eax, %1\n\t" \
"cpuid\n\t" \
: "=r" (cyc_high), "=r" (cyc_low) \
:: "%rax", "%rbx", "%rcx", "%rdx"); \
(cycles) = ((uint64_t)cyc_high << 32) | cyc_low; \
} while(0)
/*
 * BEST_TIME(test, answer): evaluate the expression `test` REPEAT times,
 * timing each evaluation with RDTSC_START / RDTSC_FINAL, and print the
 * smallest observed cycle count divided by SIZE.
 *
 *   test   - expression to time; evaluated once per repetition and its value
 *            converted to float.
 *   answer - expected value; compared to the float result with exact `!=`.
 *
 * Output on stdout: the source text of `test`, then ": ", then
 * " <min cycles / SIZE> cycles/input", then " [ERROR]" if any repetition
 * produced a value different from `answer`, then a newline.
 *
 * Notes:
 *  - The divisor is the compile-time SIZE macro, not whatever size argument
 *    appears inside `test`.
 *  - The empty asm with a "memory" clobber before each repetition is a
 *    compiler-level barrier: it stops the compiler from caching memory
 *    values across repetitions.
 *  - `answer` is pasted unparenthesized into `result != answer`, so pass a
 *    simple expression.
 */
#define BEST_TIME(test, answer) \
do { \
printf("%s: ", #test); \
fflush(NULL); \
uint64_t cycles_start, cycles_final, cycles_diff; \
uint64_t min_diff = (uint64_t) -1; \
int wrong_answer = 0; \
for (int i = 0; i < REPEAT; i++) { \
__asm volatile (""::: /* pretend to clobber */ "memory"); \
RDTSC_START(cycles_start); \
float result = test; \
RDTSC_FINAL(cycles_final); \
if (result != answer) wrong_answer = 1; \
cycles_diff = (cycles_final - cycles_start); \
if (cycles_diff < min_diff) min_diff = cycles_diff; \
} \
float cycles_per_input = min_diff / (float) (SIZE); \
printf(" %.2f cycles/input", cycles_per_input); \
if (wrong_answer) printf(" [ERROR]"); \
printf("\n"); \
fflush(NULL); \
} while (0)
/*
 * VEC_LOAD_OFFSET_BASE(load, offset, base): emit exactly one
 * `vmovups offset(base), load` instruction.
 *
 *   load   - vector variable that receives the loaded value (output, "x"
 *            constraint: an xmm/ymm register chosen by the compiler).
 *   offset - compile-time constant byte displacement; the `%c` operand
 *            modifier prints it as a bare number so it can be used as an
 *            address displacement.
 *   base   - pointer held in a general-purpose register.
 *
 * The asm is volatile, so the load is emitted even if `load` is never used.
 * NOTE(review): the memory being read is not described to the compiler (no
 * "m" input and no "memory" clobber); callers rely on an external barrier
 * for ordering against C-level stores.
 */
#define VEC_LOAD_OFFSET_BASE(load, offset, base) \
__asm volatile ("vmovups %c1(%2), %0": \
"=x" (load): /* xmm or ymm destination register */ \
"i" (offset), /* constant array offset in bytes */ \
"r" (base) /* read only memory location */ \
)
/*
 * VEC_FMA_SUM_MULT_OFFSET_BASE(sum, mult, offset, base): emit exactly one
 * `vfmadd231ps offset(base), mult, sum` instruction, i.e.
 * sum = sum + mult * [base + offset], with the second multiplicand read
 * directly from memory.
 *
 *   sum    - accumulator vector, read and written ("+x").
 *   mult   - vector of floats already in a register.
 *   offset - compile-time constant byte displacement (printed bare via `%c`).
 *   base   - pointer held in a general-purpose register.
 *
 * NOTE(review): as with VEC_LOAD_OFFSET_BASE, the memory read is not
 * described to the compiler (no "m" input and no "memory" clobber).
 */
#define VEC_FMA_SUM_MULT_OFFSET_BASE(sum, mult, offset, base) \
__asm volatile ("vfmadd231ps %c2(%3), %1, %0": \
"+x" (sum): /* sum = sum + (mult * [mem]) */ \
"x" (mult), /* xmm or ymm vec of floats */ \
"i" (offset), /* constant array offset in bytes */ \
"r" (base) /* read only memory location */ \
)
/* Shorthand for the 256-bit packed-float vector type used by the kernels in this file. */
typedef __m256 ymm_t;
/*
 * Scalar reference dot product.
 *
 * Multiplies array1[k] by array2[k] for k = 0 .. size-1, in that order, and
 * accumulates the products in a single float. Returns 0 for size == 0.
 * Neither array is modified.
 */
float calc_simple(float *array1, float *array2, size_t size) {
    const float *lhs = array1;
    const float *rhs = array2;
    float accumulated = 0.0;

    for (size_t remaining = size; remaining != 0; remaining--) {
        /* Keep the multiply and the add as separate statements so the
         * rounding behavior is unchanged. */
        float product = *lhs * *rhs;
        accumulated += product;
        lhs++;
        rhs++;
    }

    return accumulated;
}
float calc_fma(float *array1, float *array2, size_t size) {
ymm_t sum1 = {0, 0};
ymm_t sum2 = {0, 0};
ymm_t sum3 = {0, 0};
ymm_t sum4 = {0, 0};
if (size % 32 != 0) return NAN;
for (size_t i = 0; i < size; i += 32) {
ymm_t mult1, mult2, mult3, mult4;
VEC_LOAD_OFFSET_BASE(mult1, 0, array1);
VEC_LOAD_OFFSET_BASE(mult2, 32, array1);
VEC_LOAD_OFFSET_BASE(mult3, 64, array1);
VEC_LOAD_OFFSET_BASE(mult4, 96, array1);
VEC_FMA_SUM_MULT_OFFSET_BASE(sum1, mult1, 0, array2);
VEC_FMA_SUM_MULT_OFFSET_BASE(sum2, mult2, 32, array2);
VEC_FMA_SUM_MULT_OFFSET_BASE(sum3, mult3, 64, array2);
VEC_FMA_SUM_MULT_OFFSET_BASE(sum4, mult4, 96, array2);
array1 += 32;
array2 += 32;
}
sum1 = _mm256_add_ps(sum1, sum2);
sum3 = _mm256_add_ps(sum3, sum4);
sum1 = _mm256_add_ps(sum1, sum3);
ymm_t r2 = _mm256_hadd_ps(sum1, sum1);
ymm_t r3 = _mm256_hadd_ps(r2, r2);
ymm_t r4 = _mm256_hadd_ps(r3, r3);
float total = _mm_cvtss_f32(_mm256_extractf128_ps(r4,0));
return total;
}
float calc_fma_reordered(float *array1, float *array2, size_t size) {
ymm_t sum1 = {0, 0};
ymm_t sum2 = {0, 0};
ymm_t sum3 = {0, 0};
ymm_t sum4 = {0, 0};
if (size % 32 != 0) return NAN;
float *end2 = array2 + size;
while (array2 < end2) {
ymm_t mult1, mult2, mult3, mult4;
VEC_LOAD_OFFSET_BASE(mult4, 96, array1);
VEC_LOAD_OFFSET_BASE(mult2, 32, array1);
VEC_LOAD_OFFSET_BASE(mult3, 64, array1);
VEC_LOAD_OFFSET_BASE(mult1, 0, array1);
VEC_FMA_SUM_MULT_OFFSET_BASE(sum4, mult4, 96, array2);
VEC_FMA_SUM_MULT_OFFSET_BASE(sum2, mult2, 32, array2);
VEC_FMA_SUM_MULT_OFFSET_BASE(sum3, mult3, 64, array2);
VEC_FMA_SUM_MULT_OFFSET_BASE(sum1, mult1, 0, array2);
array1 += 32;
array2 += 32;
}
sum1 = _mm256_add_ps(sum1, sum2);
sum3 = _mm256_add_ps(sum3, sum4);
sum1 = _mm256_add_ps(sum1, sum3);
ymm_t r2 = _mm256_hadd_ps(sum1, sum1);
ymm_t r3 = _mm256_hadd_ps(r2, r2);
ymm_t r4 = _mm256_hadd_ps(r3, r3);
float total = _mm_cvtss_f32(_mm256_extractf128_ps(r4,0));
return total;
}
/*
 * Load-only kernel: performs the same memory reads as calc_fma but no
 * arithmetic.
 *
 * Each iteration issues eight vector loads -- four from array1 and four from
 * array2, at byte offsets 0/32/64/96 in ascending order -- then advances both
 * pointers by 32 floats. Every load targets the same `dummy` variable and
 * the loaded values are never used; the loads are still emitted because the
 * asm in VEC_LOAD_OFFSET_BASE is volatile.
 *
 * Returns 0.0 unconditionally (NOT the dot product), or NAN if size is not
 * a multiple of 32.
 */
float calc_load_only(float *array1, float *array2, size_t size) {
if (size % 32 != 0) return NAN;
for (size_t i = 0; i < size; i += 32) {
ymm_t dummy;
VEC_LOAD_OFFSET_BASE(dummy, 0, array1);
VEC_LOAD_OFFSET_BASE(dummy, 32, array1);
VEC_LOAD_OFFSET_BASE(dummy, 64, array1);
VEC_LOAD_OFFSET_BASE(dummy, 96, array1);
VEC_LOAD_OFFSET_BASE(dummy, 0, array2);
VEC_LOAD_OFFSET_BASE(dummy, 32, array2);
VEC_LOAD_OFFSET_BASE(dummy, 64, array2);
VEC_LOAD_OFFSET_BASE(dummy, 96, array2);
array1 += 32;
array2 += 32;
}
return 0.0;
}
/*
 * Load-only kernel with the access order of calc_fma_reordered.
 *
 * Each iteration issues eight vector loads -- four from array1 and then four
 * from array2, each group at byte offsets 96, 32, 64, 0 in that order -- and
 * advances both pointers by 32 floats. The loop runs until array2 reaches
 * the end pointer computed up front. Loaded values are discarded.
 *
 * Returns 0.0 unconditionally (NOT the dot product), or NAN if size is not
 * a multiple of 32.
 */
float calc_load_only_reordered(float *array1, float *array2, size_t size) {
if (size % 32 != 0) return NAN;
/* One-past-the-end of array2; used as the loop termination condition. */
float *end2 = array2 + size;
while (array2 < end2) {
ymm_t dummy;
VEC_LOAD_OFFSET_BASE(dummy, 96, array1);
VEC_LOAD_OFFSET_BASE(dummy, 32, array1);
VEC_LOAD_OFFSET_BASE(dummy, 64, array1);
VEC_LOAD_OFFSET_BASE(dummy, 0, array1);
VEC_LOAD_OFFSET_BASE(dummy, 96, array2);
VEC_LOAD_OFFSET_BASE(dummy, 32, array2);
VEC_LOAD_OFFSET_BASE(dummy, 64, array2);
VEC_LOAD_OFFSET_BASE(dummy, 0, array2);
array1 += 32;
array2 += 32;
}
return 0.0;
}
/*
 * Benchmark driver.
 *
 * Allocates two arrays of SIZE floats, fills them with 1.0 and 2.0, computes
 * the reference dot product with calc_simple, then times every kernel with
 * BEST_TIME. The FMA kernels are checked against the reference; the
 * load-only kernels are checked against 0.0, which is what they return by
 * design.
 *
 * Command-line arguments are ignored. Returns 0 on success, EXIT_FAILURE if
 * the arrays cannot be allocated.
 */
int main(int argc, char **argv) {
    (void) argc;
    (void) argv;

    printf("Testing with SIZE=%d...\n", SIZE);
    size_t size = SIZE;

    float *array1 = malloc(size * sizeof *array1);
    float *array2 = malloc(size * sizeof *array2);
    if (array1 == NULL || array2 == NULL) {
        fprintf(stderr, "Failed to allocate two arrays of %zu floats\n", size);
        free(array1);
        free(array2);
        return EXIT_FAILURE;
    }

    for (size_t i = 0; i < size; i++) {
        array1[i] = 1.0;
        array2[i] = 2.0;
    }

    float answer = calc_simple(array1, array2, size);
    BEST_TIME(calc_simple(array1, array2, size), answer);
    BEST_TIME(calc_fma(array1, array2, size), answer);
    BEST_TIME(calc_fma_reordered(array1, array2, size), answer);
    /* The load-only kernels return 0.0 rather than the dot product, so
     * comparing them with `answer` would always print a spurious [ERROR]. */
    BEST_TIME(calc_load_only(array1, array2, size), 0.0f);
    BEST_TIME(calc_load_only_reordered(array1, array2, size), 0.0f);

    free(array1);
    free(array2);
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment