Skip to content

Instantly share code, notes, and snippets.

@nkurz
Created December 27, 2015 23:41
Show Gist options
  • Save nkurz/439ca1044e11181c1089 to your computer and use it in GitHub Desktop.
Save nkurz/439ca1044e11181c1089 to your computer and use it in GitHub Desktop.
Alignment strongly affects vector load bandwidth
// gcc -fno-inline -std=gnu99 -Wall -O3 -g -march=native avx.c -o avx
#include <sys/types.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <x86intrin.h>
#include <math.h>
#include <malloc.h>
// Benchmark parameters. Both are wrapped in #ifndef so a value supplied on the
// compiler command line (-DSIZE=..., -DRETRY=...) takes precedence.
// SIZE is passed to the load_* functions as the element count; they exit(1)
// unless that count is a multiple of 32.
#ifndef SIZE // number of floats to load per test
#define SIZE 4096
#endif
// RETRY is the number of timed repetitions BEST_TIME runs for each function.
#ifndef RETRY // retries of test to find minimum time
#define RETRY 100000
#endif
/*
 * RDTSC_START(cycles): read the time-stamp counter at the start of a timed
 * region and store it in `cycles` (an lvalue able to hold a uint64_t).
 *
 * The asm executes cpuid followed by rdtsc, then copies edx and eax into two
 * 32-bit temporaries that are combined as (high << 32) | low.
 * rax, rbx, rcx and rdx are listed as clobbers, so the compiler keeps the two
 * output temporaries out of those registers.
 *
 * NOTE(review): cpuid is presumably used as a serializing instruction so that
 * earlier work completes before the counter is read -- confirm against Intel's
 * benchmarking guidance. The clobber list names 64-bit registers, so this is
 * x86-64 only.
 */
#define RDTSC_START(cycles) \
do { \
register unsigned cyc_high, cyc_low; \
__asm volatile("cpuid\n\t" \
"rdtsc\n\t" \
"mov %%edx, %0\n\t" \
"mov %%eax, %1\n\t" \
: "=r" (cyc_high), "=r" (cyc_low) \
:: "%rax", "%rbx", "%rcx", "%rdx"); \
(cycles) = ((uint64_t)cyc_high << 32) | cyc_low; \
} while (0)
/*
 * RDTSC_FINAL(cycles): read the time-stamp counter at the end of a timed
 * region and store it in `cycles` (an lvalue able to hold a uint64_t).
 *
 * Counterpart of RDTSC_START with the order reversed: rdtscp is executed
 * first, edx and eax are copied into two 32-bit temporaries, and only then is
 * cpuid executed. The temporaries are combined as (high << 32) | low.
 * rax, rbx, rcx and rdx are listed as clobbers.
 *
 * NOTE(review): the trailing cpuid is presumably a barrier that keeps later
 * instructions from starting before the counter has been read -- confirm
 * against Intel's benchmarking guidance. x86-64 only (64-bit clobber names).
 */
#define RDTSC_FINAL(cycles) \
do { \
register unsigned cyc_high, cyc_low; \
__asm volatile("rdtscp\n\t" \
"mov %%edx, %0\n\t" \
"mov %%eax, %1\n\t" \
"cpuid\n\t" \
: "=r" (cyc_high), "=r" (cyc_low) \
:: "%rax", "%rbx", "%rcx", "%rdx"); \
(cycles) = ((uint64_t)cyc_high << 32) | cyc_low; \
} while(0)
/*
 * BEST_TIME(func, ...): call func(...) RETRY times, timing every call with
 * RDTSC_START / RDTSC_FINAL, and print the throughput of the FASTEST call.
 *
 * Output is one line: the function name left-aligned in a 30-column field,
 * then "<x.xx> bytes/cycle", where bytes is SIZE * sizeof(float) and cycles is
 * the minimum cycle count observed over all repetitions.
 *
 * The previous version tracked the minimum but divided by the cycle count of
 * the last repetition, so the reported figure was a single noisy sample.
 *
 * Expands to a single statement; relies on SIZE, RETRY, RDTSC_START and
 * RDTSC_FINAL being defined at the point of use.
 */
#define BEST_TIME(func, ...) \
do { \
    printf("%-30s: ", #func); \
    fflush(NULL); \
    uint64_t cycles_start, cycles_final, cycles_diff; \
    uint64_t min_diff = (uint64_t) -1; /* start at UINT64_MAX */ \
    for (int i = 0; i < RETRY; i++) { \
        /* compiler barrier: keep memory accesses from moving across the timer */ \
        __asm volatile (""::: /* pretend to clobber */ "memory"); \
        RDTSC_START(cycles_start); \
        func(__VA_ARGS__); \
        RDTSC_FINAL(cycles_final); \
        cycles_diff = (cycles_final - cycles_start); \
        if (cycles_diff < min_diff) min_diff = cycles_diff; \
    } \
    /* report the best (minimum) run, not whichever run happened to be last */ \
    float bytes_per_cycle = SIZE*sizeof(float)/(float)min_diff; \
    printf("%.2f bytes/cycle\n", bytes_per_cycle); \
    fflush(NULL); \
} while (0)
/*
 * VEC_LOAD_OFFSET_BASE(load, offset, base): emit one unaligned vector load
 * (vmovups) from the address `base + offset` bytes into `load`.
 *
 *   load   - destination variable; it is placed in a vector register, and its
 *            type decides whether an xmm or ymm register name is emitted
 *   offset - compile-time constant byte displacement
 *   base   - pointer held in a general-purpose register
 *
 * The asm is volatile, so the load is emitted even when `load` is never read.
 */
#define VEC_LOAD_OFFSET_BASE(load, offset, base) \
    __asm volatile ( \
        "vmovups %c[disp](%[addr]), %[dst]" \
        : [dst] "=x" (load)      /* xmm or ymm destination register */ \
        : [disp] "i" (offset),   /* constant array offset in bytes */ \
          [addr] "r" (base)      /* read only memory location */ \
    )
// Destination types for the vector loads below: ymm_t variables are used for
// the 32-byte loads (load_ymm*), xmm_t variables for the 16-byte loads
// (load_xmm*).
typedef __m256 ymm_t;
typedef __m128 xmm_t;
// Read the whole array with 32-byte loads, in ascending address order.
// Each pass covers 32 floats using four loads at byte offsets 0, 32, 64, 96.
// `size` is the element count and must be a multiple of 32; otherwise the
// process exits with status 1. The loaded values are discarded.
void load_ymm(float *array, size_t size) {
    if (size % 32 != 0) {
        exit(1);
    }
    float *const stop = array + size;
    for (float *chunk = array; chunk < stop; chunk += 32) {
        ymm_t sink;
        VEC_LOAD_OFFSET_BASE(sink, 0, chunk);
        VEC_LOAD_OFFSET_BASE(sink, 32, chunk);
        VEC_LOAD_OFFSET_BASE(sink, 64, chunk);
        VEC_LOAD_OFFSET_BASE(sink, 96, chunk);
    }
}
// Read the whole array with 32-byte loads, but within each 32-float chunk
// issue them out of address order: byte offsets 96, 32, 64, 0.
// `size` is the element count and must be a multiple of 32; otherwise the
// process exits with status 1. The loaded values are discarded.
void load_ymm_nonsequential(float *array, size_t size) {
    if (size % 32 != 0) {
        exit(1);
    }
    float *const stop = array + size;
    for (float *chunk = array; chunk < stop; chunk += 32) {
        ymm_t sink;
        VEC_LOAD_OFFSET_BASE(sink, 96, chunk);
        VEC_LOAD_OFFSET_BASE(sink, 32, chunk);
        VEC_LOAD_OFFSET_BASE(sink, 64, chunk);
        VEC_LOAD_OFFSET_BASE(sink, 0, chunk);
    }
}
// Read the whole array with 16-byte loads, in ascending address order.
// Each pass covers 32 floats using eight loads at byte offsets 0, 16, ..., 112.
// `size` is the element count and must be a multiple of 32; otherwise the
// process exits with status 1. The loaded values are discarded.
void load_xmm(float *array, size_t size) {
    if (size % 32 != 0) {
        exit(1);
    }
    float *const stop = array + size;
    for (float *chunk = array; chunk < stop; chunk += 32) {
        xmm_t sink;
        VEC_LOAD_OFFSET_BASE(sink, 0, chunk);
        VEC_LOAD_OFFSET_BASE(sink, 16, chunk);
        VEC_LOAD_OFFSET_BASE(sink, 32, chunk);
        VEC_LOAD_OFFSET_BASE(sink, 48, chunk);
        VEC_LOAD_OFFSET_BASE(sink, 64, chunk);
        VEC_LOAD_OFFSET_BASE(sink, 80, chunk);
        VEC_LOAD_OFFSET_BASE(sink, 96, chunk);
        VEC_LOAD_OFFSET_BASE(sink, 112, chunk);
    }
}
// Read the whole array with 16-byte loads, but within each 32-float chunk
// issue them out of address order: byte offsets 0, 64, 32, 96, 16, 80, 48, 112.
// `size` is the element count and must be a multiple of 32; otherwise the
// process exits with status 1. The loaded values are discarded.
void load_xmm_nonsequential(float *array, size_t size) {
    if (size % 32 != 0) {
        exit(1);
    }
    float *const stop = array + size;
    for (float *chunk = array; chunk < stop; chunk += 32) {
        xmm_t sink;
        VEC_LOAD_OFFSET_BASE(sink, 0, chunk);
        VEC_LOAD_OFFSET_BASE(sink, 64, chunk);
        VEC_LOAD_OFFSET_BASE(sink, 32, chunk);
        VEC_LOAD_OFFSET_BASE(sink, 96, chunk);
        VEC_LOAD_OFFSET_BASE(sink, 16, chunk);
        VEC_LOAD_OFFSET_BASE(sink, 80, chunk);
        VEC_LOAD_OFFSET_BASE(sink, 48, chunk);
        VEC_LOAD_OFFSET_BASE(sink, 112, chunk);
    }
}
/*
 * Benchmark driver.
 *
 * Allocates a zeroed, 64-byte-aligned buffer with room for SIZE floats plus 64
 * spare bytes, then for each byte offset 8, 16, 24 and 32 into that buffer
 * times the four load_* functions on the SIZE floats starting at that offset
 * and prints their throughput via BEST_TIME.
 *
 * Returns 0 on success, 1 if the buffer cannot be allocated.
 */
int main(int argc, char **argv) {
    (void)argc;
    (void)argv;

    size_t size = SIZE;
    size_t raw_align = 64;
    /* the extra raw_align bytes leave room to shift the array start by `offset` */
    size_t raw_size = size * sizeof(float) + raw_align;

    void *raw_ptr = memalign(raw_align, raw_size);
    if (raw_ptr == NULL) {
        fprintf(stderr, "memalign(%zu, %zu) failed\n", raw_align, raw_size);
        return 1;
    }
    memset(raw_ptr, 0, raw_size);

    printf("Loading %zu floats with %zu byte raw alignment\n", size, raw_align);
    for (size_t offset = 8; offset <= 32; offset += 8) {
        /* byte arithmetic goes through char *: void * arithmetic is a GNU extension */
        float *array = (float *)((char *)raw_ptr + offset);
        printf("Vector alignment %zu:\n", offset);
        BEST_TIME(load_xmm, array, size);
        BEST_TIME(load_xmm_nonsequential, array, size);
        BEST_TIME(load_ymm, array, size);
        BEST_TIME(load_ymm_nonsequential, array, size);
        printf("\n");
    }

    free(raw_ptr);
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment