Skip to content

Instantly share code, notes, and snippets.

@rygorous
Created November 30, 2012 00:28
Show Gist options
  • Save rygorous/4172889 to your computer and use it in GitHub Desktop.
Save rygorous/4172889 to your computer and use it in GitHub Desktop.
SSE/AVX matrix multiply
#include <immintrin.h>
#include <intrin.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
union Mat44 {
float m[4][4];
__m128 row[4];
};
// reference implementation
void matmult_ref(Mat44 &out, const Mat44 &A, const Mat44 &B)
{
Mat44 t; // write to temp
for (int i=0; i < 4; i++)
for (int j=0; j < 4; j++)
t.m[i][j] = A.m[i][0]*B.m[0][j] + A.m[i][1]*B.m[1][j] + A.m[i][2]*B.m[2][j] + A.m[i][3]*B.m[3][j];
out = t;
}
// linear combination:
// a[0] * B.row[0] + a[1] * B.row[1] + a[2] * B.row[2] + a[3] * B.row[3]
static inline __m128 lincomb_SSE(const __m128 &a, const Mat44 &B)
{
__m128 result;
result = _mm_mul_ps(_mm_shuffle_ps(a, a, 0x00), B.row[0]);
result = _mm_add_ps(result, _mm_mul_ps(_mm_shuffle_ps(a, a, 0x55), B.row[1]));
result = _mm_add_ps(result, _mm_mul_ps(_mm_shuffle_ps(a, a, 0xaa), B.row[2]));
result = _mm_add_ps(result, _mm_mul_ps(_mm_shuffle_ps(a, a, 0xff), B.row[3]));
return result;
}
// this is the right approach for SSE ... SSE4.2
void matmult_SSE(Mat44 &out, const Mat44 &A, const Mat44 &B)
{
// out_ij = sum_k a_ik b_kj
// => out_0j = a_00 * b_0j + a_01 * b_1j + a_02 * b_2j + a_03 * b_3j
__m128 out0x = lincomb_SSE(A.row[0], B);
__m128 out1x = lincomb_SSE(A.row[1], B);
__m128 out2x = lincomb_SSE(A.row[2], B);
__m128 out3x = lincomb_SSE(A.row[3], B);
out.row[0] = out0x;
out.row[1] = out1x;
out.row[2] = out2x;
out.row[3] = out3x;
}
// another linear combination, using AVX instructions on XMM regs
static inline __m128 lincomb_AVX_4mem(const float *a, const Mat44 &B)
{
__m128 result;
result = _mm_mul_ps(_mm_broadcast_ss(&a[0]), B.row[0]);
result = _mm_add_ps(result, _mm_mul_ps(_mm_broadcast_ss(&a[1]), B.row[1]));
result = _mm_add_ps(result, _mm_mul_ps(_mm_broadcast_ss(&a[2]), B.row[2]));
result = _mm_add_ps(result, _mm_mul_ps(_mm_broadcast_ss(&a[3]), B.row[3]));
return result;
}
// using AVX instructions, 4-wide
// this can be better if A is in memory.
void matmult_AVX_4mem(Mat44 &out, const Mat44 &A, const Mat44 &B)
{
_mm256_zeroupper();
__m128 out0x = lincomb_AVX_4mem(A.m[0], B);
__m128 out1x = lincomb_AVX_4mem(A.m[1], B);
__m128 out2x = lincomb_AVX_4mem(A.m[2], B);
__m128 out3x = lincomb_AVX_4mem(A.m[3], B);
out.row[0] = out0x;
out.row[1] = out1x;
out.row[2] = out2x;
out.row[3] = out3x;
}
// dual linear combination using AVX instructions on YMM regs
static inline __m256 twolincomb_AVX_8(__m256 A01, const Mat44 &B)
{
__m256 result;
result = _mm256_mul_ps(_mm256_shuffle_ps(A01, A01, 0x00), _mm256_broadcast_ps(&B.row[0]));
result = _mm256_add_ps(result, _mm256_mul_ps(_mm256_shuffle_ps(A01, A01, 0x55), _mm256_broadcast_ps(&B.row[1])));
result = _mm256_add_ps(result, _mm256_mul_ps(_mm256_shuffle_ps(A01, A01, 0xaa), _mm256_broadcast_ps(&B.row[2])));
result = _mm256_add_ps(result, _mm256_mul_ps(_mm256_shuffle_ps(A01, A01, 0xff), _mm256_broadcast_ps(&B.row[3])));
return result;
}
// this should be noticeably faster with actual 256-bit wide vector units (Intel);
// not sure about double-pumped 128-bit (AMD), would need to check.
void matmult_AVX_8(Mat44 &out, const Mat44 &A, const Mat44 &B)
{
_mm256_zeroupper();
__m256 A01 = _mm256_loadu_ps(&A.m[0][0]);
__m256 A23 = _mm256_loadu_ps(&A.m[2][0]);
__m256 out01x = twolincomb_AVX_8(A01, B);
__m256 out23x = twolincomb_AVX_8(A23, B);
_mm256_storeu_ps(&out.m[0][0], out01x);
_mm256_storeu_ps(&out.m[2][0], out23x);
}
// ---- testing stuff
static float randf()
{
// assumes VC++ rand()
return (rand() - 16384.0f) / 1024.0f;
}
static void randmat(Mat44 &M)
{
for (int i=0; i < 4; i++)
for (int j=0; j < 4; j++)
M.m[i][j] = randf();
}
int the_mask = 0; // global so the compiler can't be sure what its value is for opt.
static void run_ref(Mat44 *out, const Mat44 *A, const Mat44 *B, int count)
{
for (int i=0; i < count; i++)
{
int j = i & the_mask;
matmult_ref(out[j], A[j], B[j]);
}
}
static void run_SSE(Mat44 *out, const Mat44 *A, const Mat44 *B, int count)
{
for (int i=0; i < count; i++)
{
int j = i & the_mask;
matmult_SSE(out[j], A[j], B[j]);
}
}
static void run_AVX_4mem(Mat44 *out, const Mat44 *A, const Mat44 *B, int count)
{
for (int i=0; i < count; i++)
{
int j = i & the_mask;
matmult_AVX_4mem(out[j], A[j], B[j]);
}
}
static void run_AVX_8(Mat44 *out, const Mat44 *A, const Mat44 *B, int count)
{
for (int i=0; i < count; i++)
{
int j = i & the_mask;
matmult_AVX_8(out[j], A[j], B[j]);
}
}
int main(int argc, char **argv)
{
static const struct {
const char *name;
void (*matmult)(Mat44 &out, const Mat44 &A, const Mat44 &B);
} variants[] = {
{ "ref", matmult_ref },
{ "SSE", matmult_SSE },
{ "AVX_4mem", matmult_AVX_4mem },
{ "AVX_8", matmult_AVX_8 },
};
static const int nvars = (int) (sizeof(variants) / sizeof(*variants));
srand(1234); // deterministic random tests(TM)
// correctness tests
// when compiled with /arch:SSE (or SSE2/AVX), all functions are
// supposed to return the exact same results!
for (int i=0; i < 1000000; i++)
{
Mat44 A, B, out, ref_out;
randmat(A);
randmat(B);
matmult_ref(ref_out, A, B);
for (int j=0; j < nvars; j++)
{
variants[j].matmult(out, A, B);
if (memcmp(&out, &ref_out, sizeof(out)) != 0)
{
fprintf(stderr, "%s fails test\n", variants[j].name);
exit(1);
}
}
}
printf("all ok.\n");
// perf tests
// as usual with such microbenchmarks, this isn't measuring anything
// terribly useful, but here goes.
static const struct {
const char *name;
void (*run)(Mat44 *out, const Mat44 *A, const Mat44 *B, int count);
} perf_variants[] = {
{ "ref", run_ref },
{ "SSE", run_SSE },
{ "AVX_4mem", run_AVX_4mem },
{ "AVX_8", run_AVX_8 },
};
static const int nperfvars = (int) (sizeof(perf_variants) / sizeof(*perf_variants));
/*
results on my sandy bridge laptop when compiling the code in x64
mode with VC2010 using /arch:AVX:
all ok.
ref: 59.00 cycles
SSE: 20.52 cycles
AVX_4mem: 15.64 cycles
AVX_8: 14.13 cycles
*/
Mat44 Aperf, Bperf, out;
randmat(Aperf);
randmat(Bperf);
for (int i=0; i < nvars; i++)
{
static const int nruns = 4096;
static const int muls_per_run = 4096;
unsigned long long best_time = ~0ull;
for (int run=0; run < nruns; run++)
{
unsigned long long time = __rdtsc();
perf_variants[i].run(&out, &Aperf, &Bperf, muls_per_run);
time = __rdtsc() - time;
if (time < best_time)
best_time = time;
}
double cycles_per_run = (double) best_time / (double) muls_per_run;
printf("%12s: %.2f cycles\n", perf_variants[i].name, cycles_per_run);
}
return 0;
}
@jsimmons
Copy link

For somewhat worthless comparison here are the numbers for GCC 4.7.2 and Clang 3.1 respectively on an i7 2600k.

     ref: 80.94 cycles
     SSE: 27.26 cycles
AVX_4mem: 23.92 cycles
   AVX_8: 17.68 cycles

and

     ref: 73.73 cycles
     SSE: 26.88 cycles
AVX_4mem: 16.11 cycles
   AVX_8: 15.56 cycles

Considering the presumably? more powerful CPU the slightly crappier performance is interesting.

@rygorous
Copy link
Author

It's not very useful to compare the cycle counts across machines with a test like this; I make no effort to compensate for thread switches and things like that for example. This test is useful to compare relative performance on one machine given whatever the background load on that machine is, but for actual benchmarking you'd do it differently. (Making sure the cores are actually running at their nominal clock rate for example - RDTSC counts ticks of the nominal clock rate, not actual CPU cycles elapsed at whatever current clock it's running at)

@etale-cohomology
Copy link

More cycles is good or bad? Ie. is 100 cycles better than 10 cycles?

@pedrocr
Copy link

pedrocr commented Jun 5, 2017

For the same thing to compile on linux #include <intrin.h> needs to be replaced with #include <x86intrin.h>. With that single change it compiles with g++ -O3 -march=native -o test_matrix test_matrix.cpp. Using g++ 5.4.0 20160609 and on an i5-6200U I get the following results:

all ok.
         ref: 44.61 cycles
         SSE: 14.28 cycles
    AVX_4mem: 16.16 cycles
       AVX_8: 9.09 cycles

@Imroy
Copy link

Imroy commented Jun 6, 2019

Experimenting with this using GCC 8.3 on a Ryzen (using -march=znver1). It looks like the optimisation level (-Ox) makes a big difference.

$ g++ -march=znver1 -o matmult matmult.cc
$ ./matmult
all ok.
         ref: 244.61 cycles
         SSE: 174.76 cycles
    AVX_4mem: 190.32 cycles
       AVX_8: 169.63 cycles

Wow, really bad!

$ g++ -Os -march=znver1 -o matmult-Os matmult.cc
$ ./matmult-Os
all ok.
         ref: 76.11 cycles
         SSE: 22.48 cycles
    AVX_4mem: 24.22 cycles
       AVX_8: 16.44 cycles

Better.

$ g++ -O1 -march=znver1 -o matmult-O1 matmult.cc
$ ./matmult-O1
all ok.
         ref: 52.59 cycles
         SSE: 15.61 cycles
    AVX_4mem: 19.03 cycles
       AVX_8: 19.90 cycles
$ g++ -O2 -march=znver1 -o matmult-O2 matmult.cc
$ ./matmult-O2
all ok.
         ref: 55.53 cycles
         SSE: 15.72 cycles
    AVX_4mem: 19.31 cycles
       AVX_8: 24.48 cycles

Better still, but -O1 and -O2 are about the same.

$ g++ -O3 -march=znver1 -o matmult-O3 matmult.cc
$ ./matmult-O3
all ok.
         ref: 14.60 cycles
         SSE: 14.50 cycles
    AVX_4mem: 18.84 cycles
       AVX_8: 17.89 cycles

Great! But now every variant is about the same. Almost any difference is optimised out, even for the simple reference implementation. So what's the point of making heavily optimised (platform specific) versions if the compiler can do a better job from simple straight-forward code? Just more jobs being taken by machines :P
And yes, I checked the assembly output from 'gcc -S'. In the -O3 code there's lots of v* instructions and xmm/ymm registers. Even FMA instructions too!
Thanks for writing this code though. It was interesting to play around with.

@LondNoir
Copy link

LondNoir commented Sep 2, 2019

londnoir@pc-londnoir:$ g++ -O3 -march=native -o test_matrix test_matrix.cpp
londnoir@pc-londnoir:
$ ./test_matrix
all ok.
ref: 18.52 cycles
SSE: 25.33 cycles
AVX_4mem: 22.08 cycles
AVX_8: 20.33 cycles

on a "Intel(R) Core(TM) i7-4930K CPU @ 3.40GHz" with "gcc version 9.2.1 20190821 (Debian 9.2.1-4)"

@jameskeane
Copy link

I must be doing something wrong on "Intel(R) Core(TM) i5-6360U CPU @ 2.00GHz" (Skylake)
Just look at Clang's SSE + AVX4 times...

// Apple LLVM version 9.1.0 (clang-902.0.39.2)
>g++ -O3 -march=native perf.cpp -o bench
>./bench
all ok.
         ref: 16.16089 cycles
         SSE: 0.00317 cycles
    AVX_4mem: 0.65723 cycles
       AVX_8: 6.98901 cycles

btw on GCC 9.2.0, ref is only slightly slower than AVX8:

>g++-9 -O3 -march=native perf.cpp -o bench
>./bench 
all ok.
         ref: 7.72705 cycles
         SSE: 10.37695 cycles
    AVX_4mem: 10.27856 cycles
       AVX_8: 6.47656 cycles

@rygorous
Copy link
Author

rygorous commented Nov 4, 2019

Looks like the first test gets constant-propagated on newer Clangs. You need to check the disassembly.

But either way this code was posted 7 years ago as a simple AVX example for someone else. If your 2019-vintage compiler autovectorizes it, great. Compilers 7 years ago did not.

@IGBC
Copy link

IGBC commented Apr 20, 2020

g++ (Arch Linux 9.3.0-1) 9.3.0
Ryzen 9 3900X (Zen 2)

g++ avxmult.cpp -o avxmult -march=native -O3

         ref: 9.77 cycles
         SSE: 14.96 cycles
    AVX_4mem: 15.31 cycles
       AVX_8: 8.28 cycles

Had to patch RDTSC to make it run using this patch: https://stackoverflow.com/a/9887899

@rygorous
Copy link
Author

I am not sure who all you people are, where you're coming from, or why you think these comments have any relevance to a gist posted 8 years ago before any of the compiler versions you're using existed.

@IGBC
Copy link

IGBC commented Apr 21, 2020 via email

@dpotop
Copy link

dpotop commented Apr 21, 2021

Thanks for this piece of code (it really helped me). You might consider adding __attribute__((noinline)) on the run_* functions, to avoid code unrolling and constant propagation. In my setting it is mandatory. If you are interested I can also contribute a C version of this C++ code.

@DBJDBJ
Copy link

DBJDBJ commented Jul 12, 2021

@dpotop I am interested in the C version of this code

@Lovejane-N
Copy link

void matmult_ref(Mat44 &out, const Mat44 &A, const Mat44 &B)
{
Mat44 t; // write to temp
t.m[0][0] = A.m[0][0]*B.m[0][0] + A.m[0][1]*B.m[1][0] + A.m[0][2]*B.m[2][0] + A.m[0][3]*B.m[3][0];
t.m[0][1] = A.m[0][0]*B.m[0][1] + A.m[0][1]*B.m[1][1] + A.m[0][2]*B.m[2][1] + A.m[0][3]*B.m[3][1];
t.m[0][2] = A.m[0][0]*B.m[0][2] + A.m[0][1]*B.m[1][2] + A.m[0][2]*B.m[2][2] + A.m[0][3]*B.m[3][2];
t.m[0][3] = A.m[0][0]*B.m[0][3] + A.m[0][1]*B.m[1][3] + A.m[0][2]*B.m[2][3] + A.m[0][3]*B.m[3][3];

t.m[1][0] = A.m[1][0]*B.m[0][0] + A.m[1][1]*B.m[1][0] + A.m[1][2]*B.m[2][0] + A.m[1][3]*B.m[3][0];
t.m[1][1] = A.m[1][0]*B.m[0][1] + A.m[1][1]*B.m[1][1] + A.m[1][2]*B.m[2][1] + A.m[1][3]*B.m[3][1];
t.m[1][2] = A.m[1][0]*B.m[0][2] + A.m[1][1]*B.m[1][2] + A.m[1][2]*B.m[2][2] + A.m[1][3]*B.m[3][2];
t.m[1][3] = A.m[1][0]*B.m[0][3] + A.m[1][1]*B.m[1][3] + A.m[1][2]*B.m[2][3] + A.m[1][3]*B.m[3][3];

t.m[2][0] = A.m[2][0]*B.m[0][0] + A.m[2][1]*B.m[1][0] + A.m[2][2]*B.m[2][0] + A.m[2][3]*B.m[3][0];
t.m[2][1] = A.m[2][0]*B.m[0][1] + A.m[2][1]*B.m[1][1] + A.m[2][2]*B.m[2][1] + A.m[2][3]*B.m[3][1];
t.m[2][2] = A.m[2][0]*B.m[0][2] + A.m[2][1]*B.m[1][2] + A.m[2][2]*B.m[2][2] + A.m[2][3]*B.m[3][2];
t.m[2][3] = A.m[2][0]*B.m[0][3] + A.m[2][1]*B.m[1][3] + A.m[2][2]*B.m[2][3] + A.m[2][3]*B.m[3][3];

t.m[3][0] = A.m[3][0]*B.m[0][0] + A.m[3][1]*B.m[1][0] + A.m[3][2]*B.m[2][0] + A.m[3][3]*B.m[3][0];
t.m[3][1] = A.m[3][0]*B.m[0][1] + A.m[3][1]*B.m[1][1] + A.m[3][2]*B.m[2][1] + A.m[3][3]*B.m[3][1];
t.m[3][2] = A.m[3][0]*B.m[0][2] + A.m[3][1]*B.m[1][2] + A.m[3][2]*B.m[2][2] + A.m[3][3]*B.m[3][2];
t.m[3][3] = A.m[3][0]*B.m[0][3] + A.m[3][1]*B.m[1][3] + A.m[3][2]*B.m[2][3] + A.m[3][3]*B.m[3][3];

out = t;

}

Using for-loop will slow down the code significantly, that's why O(1,2,3) options can make the code faster.

@Catoverflow
Copy link

Catoverflow commented Jun 2, 2022

g++ (GCC) 12.1.0, Arch linux 5.18.1, Intel i7-9750H
g++ -mavx -O2

all ok.
         ref: 185.01 cycles
         SSE: 120.06 cycles
    AVX_4mem: 118.66 cycles
       AVX_8: 72.41 cycles

g++ -mavx -O3

all ok.
         ref: 7.39 cycles
         SSE: 12.07 cycles
    AVX_4mem: 12.44 cycles
       AVX_8: 8.17 cycles

Also, intrin.h is not included


I checked asembly code in compiler explorer, with -O2 option the ref function is compiled into AVX, and the inner loop (line 17) is unfolded. By -O3 the outer loop (line 16) is also unfolded, exactly as @Lovejane-N 's comment above. Interestingly, with -O3 both native ref and AVX-8 have no loop, buf the former one runs faster.


And for @rygorous 's questions, I'm doing Computer Architecture experiment, and this code is a great help!

@ignabelitzky
Copy link

I compile and exeecute like this and replacing the library <intrin.h> for <x86intrin.h>

g++ -O3 -march=native main.cpp
./a.out

I got this output on i9 9900KF - GCC 13.1.1 - Fedora Linux

all ok.
         ref: 8.65 cycles
         SSE: 9.02 cycles
    AVX_4mem: 11.53 cycles
       AVX_8: 7.71 cycles

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment