@kbarbary
Created October 8, 2016 18:29
Vector-matrix-vector multiplication with SIMD (AVX) intrinsics
// Doing the operation:
//
//                             | a a a a |   | y |
//   x * A * y = [ x x x x ] * | a a a a | * | y |
//                             | a a a a |   | y |
//                             | a a a a |   | y |
//
// with SIMD intrinsics (specifically AVX).
//
// adapted from https://gist.github.com/rygorous/4172889
//
// compile with cc -march=native -O3 simd-vmv.c -o simd-vmv
//
// On my Haswell Core i7-4500U CPU @ 1.80GHz, the reference implementation
// is actually slightly faster than, or about the same as, the AVX version.
// Results:
//
// all ok.
// ref: 10.46 cycles
// avx: 11.34 cycles
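//
// (Usage note, not in the original gist: the "# avx code begin/end" marker
// comments in vecmatvec_avx below are emitted verbatim into the assembly
// listing, so the kernel is easy to find in the output of, for example,
//
//     cc -march=native -O3 -S simd-vmv.c
// )
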
#include <immintrin.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
// reference implementation
void vecmatvec_ref(const double* restrict x, const double* restrict A,
                   const double* restrict y, double* restrict out)
{
    double sum = 0.0;
    for (int i=0; i < 4; i++)
        for (int j=0; j < 4; j++)
            sum += x[i] * A[4*i+j] * y[j];
    *out = sum;
}

// maybe faster with aligned mem?
void vecmatvec_avx(const double* restrict x, const double* restrict A,
                   const double* restrict y, double* restrict out)
{
    asm volatile ("# avx code begin"); // looking at assembly with gcc -S

    __m256d yrow = _mm256_loadu_pd(y);
    __m256d xrow = _mm256_loadu_pd(x);

    __m256d a = _mm256_mul_pd(_mm256_loadu_pd(A), yrow);
    __m256d b = _mm256_mul_pd(_mm256_loadu_pd(A+4), yrow);
    __m256d c = _mm256_mul_pd(_mm256_loadu_pd(A+8), yrow);
    __m256d d = _mm256_mul_pd(_mm256_loadu_pd(A+12), yrow);

    // our task now is to get {sum(a), sum(b), sum(c), sum(d)}
    // This is tricky: _mm256_hadd_pd only adds pairs within each 128-bit
    // lane, so AVX has no single instruction that sums a whole 256-bit
    // register.
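    // Worked example (added illustration, not part of the original gist):
    // with a = {1, 2, 3, 4} and b = {10, 20, 30, 40},
    //     _mm256_hadd_pd(a, b) = {1+2, 10+20, 3+4, 30+40} = {3, 30, 7, 70}
    // i.e. results from a and b are interleaved per 128-bit lane, which is
    // why the blend + permute2f128 shuffle below is needed to finish the sums.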
    // {a[0]+a[1], b[0]+b[1], a[2]+a[3], b[2]+b[3]}
    __m256d sumab = _mm256_hadd_pd(a, b);

    // {c[0]+c[1], d[0]+d[1], c[2]+c[3], d[2]+d[3]}
    __m256d sumcd = _mm256_hadd_pd(c, d);

    // {a[0]+a[1], b[0]+b[1], c[2]+c[3], d[2]+d[3]}
    __m256d blend = _mm256_blend_pd(sumab, sumcd, 0b1100);

    // {a[2]+a[3], b[2]+b[3], c[0]+c[1], d[0]+d[1]}
    __m256d perm = _mm256_permute2f128_pd(sumab, sumcd, 0x21);

    // {sum(a), sum(b), sum(c), sum(d)}
    __m256d Ay = _mm256_add_pd(perm, blend);

    // Now we want to take the dot product: dot(x, Ay)
    __m256d e = _mm256_mul_pd(Ay, xrow);

    // horizontal sum of e: tricky again.
    // {e[0]+e[1], e[0]+e[1], e[2]+e[3], e[2]+e[3]}
    __m256d tmp = _mm256_hadd_pd(e, e);

    // {e[2]+e[3], e[2]+e[3]}
    __m128d e23 = _mm256_extractf128_pd(tmp, 1);

    __m128d result = _mm_add_pd(_mm256_castpd256_pd128(tmp), e23);
    _mm_storel_pd(out, result);

    asm volatile ("# avx code end");
}
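
// Re: the "maybe faster with aligned mem?" question above -- a sketch (not
// part of the original gist, and untested here) of the same kernel with
// aligned loads. It assumes the caller guarantees 32-byte alignment of x, A
// and y, e.g. buffers from aligned_alloc(32, ...). On recent x86 cores an
// unaligned load of data that happens to be aligned costs about the same as
// an aligned load, so any speedup is likely to be small.
void vecmatvec_avx_aligned(const double* restrict x, const double* restrict A,
                           const double* restrict y, double* restrict out)
{
    __m256d yrow = _mm256_load_pd(y);
    __m256d xrow = _mm256_load_pd(x);

    __m256d a = _mm256_mul_pd(_mm256_load_pd(A), yrow);
    __m256d b = _mm256_mul_pd(_mm256_load_pd(A+4), yrow);
    __m256d c = _mm256_mul_pd(_mm256_load_pd(A+8), yrow);
    __m256d d = _mm256_mul_pd(_mm256_load_pd(A+12), yrow);

    // same lane shuffle as vecmatvec_avx to get {sum(a), sum(b), sum(c), sum(d)}
    __m256d sumab = _mm256_hadd_pd(a, b);
    __m256d sumcd = _mm256_hadd_pd(c, d);
    __m256d blend = _mm256_blend_pd(sumab, sumcd, 0b1100);
    __m256d perm  = _mm256_permute2f128_pd(sumab, sumcd, 0x21);
    __m256d Ay    = _mm256_add_pd(perm, blend);

    // dot(x, Ay), then horizontal sum of the four products
    __m256d e   = _mm256_mul_pd(Ay, xrow);
    __m256d tmp = _mm256_hadd_pd(e, e);
    __m128d e23 = _mm256_extractf128_pd(tmp, 1);
    _mm_storel_pd(out, _mm_add_pd(_mm256_castpd256_pd128(tmp), e23));
}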
//-----------------------------------------------------------------------------
// testing stuff
// x86_64 only: "read timestamp counter"
// from http://stackoverflow.com/questions/9887839/clock-cycle-count-wth-gcc
static __inline__ unsigned long long rdtsc(void)
{
    unsigned hi, lo;
    __asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi));
    return ( (unsigned long long)lo)|( ((unsigned long long)hi)<<32 );
}
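
// An equivalent alternative (not used here): GCC and Clang provide a
// __rdtsc() builtin in <x86intrin.h>, which avoids the inline asm:
//
//     #include <x86intrin.h>
//     unsigned long long t = __rdtsc();
//
// Note that rdtsc is not a serializing instruction, so it can be reordered
// around very short code regions; the best-of-N timing loop below mitigates
// the resulting noise somewhat.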
static double frand()
{
    return (double)rand() / (double)RAND_MAX;
}

static void randmat(double *A)
{
    for (int i=0; i < 4; i++)
        for (int j=0; j < 4; j++)
            A[4*i+j] = frand();
}

static void randvec(double *x)
{
    for (int i=0; i < 4; i++)
        x[i] = frand();
}

int the_mask = 0; // global so the compiler can't assume its value when optimizing
static double run_ref(double *x, double *A, double *y, int count)
{
    double result = 0.0;
    for (int i=0; i < count; i++)
    {
        int j = i & the_mask;
        vecmatvec_ref(x+j, A+j, y+j, &result);
    }
    return result;
}

static double run_avx(double *x, double *A, double *y, int count)
{
    double result = 0.0;
    for (int i=0; i < count; i++)
    {
        int j = i & the_mask;
        vecmatvec_avx(x+j, A+j, y+j, &result);
    }
    return result;
}

int main(int argc, char **argv)
{
    static const struct {
        const char *name;
        void (*vecmatvec)(const double *x, const double *A, const double *y,
                          double *out);
    } variants[] = {
        { "ref", vecmatvec_ref },
        { "avx", vecmatvec_avx }
    };
    static const int nvars = (int) (sizeof(variants) / sizeof(*variants));
    srand(1);

    // correctness tests
    for (int i=0; i < 1000; i++)
    {
        double A[16];
        double x[4], y[4];
        randmat(A);
        randvec(x);
        randvec(y);

        double ref_out;
        vecmatvec_ref(x, A, y, &ref_out);

        double out;
        for (int j=0; j < nvars; j++)
        {
            variants[j].vecmatvec(x, A, y, &out);

            // not perfectly equal because we reordered floating point ops
            if (fabs(out - ref_out) > 1e-14) {
                fprintf(stderr, "%s fails test\n", variants[j].name);
                exit(1);
            }
        }
    }
    printf("all ok.\n");
    // perf tests
    // as usual with such microbenchmarks, this isn't measuring anything
    // terribly useful, but here goes.
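    // (A caveat worth noting, not in the original gist: on this class of CPU
    // rdtsc counts constant-rate reference cycles at the nominal 1.80GHz, not
    // actual core clocks, so with Turbo Boost active the "cycles" printed
    // below are only a rough proxy for core cycles per call.)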
    static const struct {
        const char *name;
        double (*run)(double *x, double *A, double *y, int count);
    } perf_variants[] = {
        { "ref", run_ref },
        { "avx", run_avx },
    };
    static const int nperfvars = (int) (sizeof(perf_variants) / sizeof(*perf_variants));
    double Ap[16];
    double xp[4], yp[4];
    randmat(Ap);
    randvec(xp);
    randvec(yp);

    for (int i=0; i < nperfvars; i++)
    {
        static const int nruns = 4096;
        static const int muls_per_run = 4096;
        unsigned long long best_time = ~0ull;
        for (int run=0; run < nruns; run++)
        {
            unsigned long long time = rdtsc();
            perf_variants[i].run(xp, Ap, yp, muls_per_run);
            time = rdtsc() - time;
            if (time < best_time)
                best_time = time;
        }
        double cycles_per_run = (double) best_time / (double) muls_per_run;
        printf("%12s: %.2f cycles\n", perf_variants[i].name, cycles_per_run);
    }

    return 0;
}