Skip to content

Instantly share code, notes, and snippets.

@nandor
Created January 13, 2014 20:15
Show Gist options
  • Save nandor/8407239 to your computer and use it in GitHub Desktop.
Save nandor/8407239 to your computer and use it in GitHub Desktop.
Fast SSE matrix multiplication. It might be three times faster.
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <xmmintrin.h>
typedef float __attribute__((aligned(16))) mat4[16];
void
mat4_mul_sse(mat4 d, mat4 a, mat4 b)
{
__m128 acc, al, bl;
/* Column 0 */
al = _mm_load_ps(a);
bl = _mm_set_ps1(b[0]);
acc = _mm_mul_ps(al, bl);
al = _mm_load_ps(a + 4);
bl = _mm_set_ps1(b[1]);
acc = _mm_add_ps(acc, _mm_mul_ps(al, bl));
al = _mm_load_ps(a + 8);
bl = _mm_set_ps1(b[2]);
acc = _mm_add_ps(acc, _mm_mul_ps(al, bl));
al = _mm_load_ps(a + 12);
bl = _mm_set_ps1(b[3]);
acc = _mm_add_ps(acc, _mm_mul_ps(al, bl));
_mm_store_ps(d, acc);
/* Column 1 */
al = _mm_load_ps(a);
bl = _mm_set_ps1(b[4]);
acc = _mm_mul_ps(al, bl);
al = _mm_load_ps(a + 4);
bl = _mm_set_ps1(b[5]);
acc = _mm_add_ps(acc, _mm_mul_ps(al, bl));
al = _mm_load_ps(a + 8);
bl = _mm_set_ps1(b[6]);
acc = _mm_add_ps(acc, _mm_mul_ps(al, bl));
al = _mm_load_ps(a + 12);
bl = _mm_set_ps1(b[7]);
acc = _mm_add_ps(acc, _mm_mul_ps(al, bl));
_mm_store_ps(d + 4, acc);
/* Column 2 */
al = _mm_load_ps(a);
bl = _mm_set_ps1(b[8]);
acc = _mm_mul_ps(al, bl);
al = _mm_load_ps(a + 4);
bl = _mm_set_ps1(b[9]);
acc = _mm_add_ps(acc, _mm_mul_ps(al, bl));
al = _mm_load_ps(a + 8);
bl = _mm_set_ps1(b[10]);
acc = _mm_add_ps(acc, _mm_mul_ps(al, bl));
al = _mm_load_ps(a + 12);
bl = _mm_set_ps1(b[11]);
acc = _mm_add_ps(acc, _mm_mul_ps(al, bl));
_mm_store_ps(d + 8, acc);
/* Column 3 */
al = _mm_load_ps(a);
bl = _mm_set_ps1(b[12]);
acc = _mm_mul_ps(al, bl);
al = _mm_load_ps(a + 4);
bl = _mm_set_ps1(b[13]);
acc = _mm_add_ps(acc, _mm_mul_ps(al, bl));
al = _mm_load_ps(a + 8);
bl = _mm_set_ps1(b[14]);
acc = _mm_add_ps(acc, _mm_mul_ps(al, bl));
al = _mm_load_ps(a + 12);
bl = _mm_set_ps1(b[15]);
acc = _mm_add_ps(acc, _mm_mul_ps(al, bl));
_mm_store_ps(d + 12, acc);
}
void
mat4_mul(mat4 x, mat4 y, mat4 z)
{
x[ 0] = y[0] * z[ 0] + y[4] * z[ 1] + y[ 8] * z[ 2] + y[12] * z[ 3];
x[ 1] = y[0] * z[ 4] + y[4] * z[ 5] + y[ 8] * z[ 6] + y[12] * z[ 7];
x[ 2] = y[0] * z[ 8] + y[4] * z[ 9] + y[ 8] * z[10] + y[12] * z[11];
x[ 3] = y[0] * z[12] + y[4] * z[13] + y[ 8] * z[14] + y[12] * z[15];
x[ 4] = y[1] * z[ 0] + y[5] * z[ 1] + y[ 9] * z[ 2] + y[13] * z[ 3];
x[ 5] = y[1] * z[ 4] + y[5] * z[ 5] + y[ 9] * z[ 6] + y[13] * z[ 7];
x[ 6] = y[1] * z[ 8] + y[5] * z[ 9] + y[ 9] * z[10] + y[13] * z[11];
x[ 7] = y[1] * z[12] + y[5] * z[13] + y[ 9] * z[14] + y[13] * z[15];
x[ 8] = y[2] * z[ 0] + y[6] * z[ 1] + y[10] * z[ 2] + y[14] * z[ 3];
x[ 9] = y[2] * z[ 4] + y[6] * z[ 5] + y[10] * z[ 6] + y[14] * z[ 7];
x[10] = y[2] * z[ 8] + y[6] * z[ 9] + y[10] * z[10] + y[14] * z[11];
x[11] = y[2] * z[12] + y[6] * z[13] + y[10] * z[14] + y[14] * z[15];
x[12] = y[3] * z[ 0] + y[7] * z[ 1] + y[11] * z[ 2] + y[15] * z[ 3];
x[13] = y[3] * z[ 4] + y[7] * z[ 5] + y[11] * z[ 6] + y[15] * z[ 7];
x[14] = y[3] * z[ 8] + y[7] * z[ 9] + y[11] * z[10] + y[15] * z[11];
x[15] = y[3] * z[12] + y[7] * z[13] + y[11] * z[14] + y[15] * z[15];
}
int main(int argc, int argv)
{
mat4 a, b, d;
clock_t beg, end;
register int i;
a[0] = 1.0f; a[4] = 0.0f; a[ 8] = 0.0f; a[12] = 0.0f;
a[1] = 0.0f; a[5] = 1.0f; a[ 9] = 0.0f; a[13] = 0.0f;
a[2] = 0.0f; a[6] = 0.0f; a[10] = 1.0f; a[14] = 0.0f;
a[3] = 0.0f; a[7] = 0.0f; a[11] = 0.0f; a[15] = 1.0f;
b[0] = 5.0f; b[4] = 0.0f; b[ 8] = 0.0f; b[12] = 0.0f;
b[1] = 0.0f; b[5] = 5.0f; b[ 9] = 0.0f; b[13] = 0.0f;
b[2] = 0.0f; b[6] = 0.0f; b[10] = 5.0f; b[14] = 0.0f;
b[3] = 0.0f; b[7] = 0.0f; b[11] = 0.0f; b[15] = 5.0f;
beg = clock();
for (i = 0; i < 100000000; ++i)
{
mat4_mul(d, a, b);
}
printf("%f\n", (float)(clock() - beg) / CLOCKS_PER_SEC);
beg = clock();
for (i = 0; i < 100000000; ++i)
{
mat4_mul_sse(d, a, b);
}
printf("%f\n", (float)(clock() - beg) / CLOCKS_PER_SEC);
mat4_mul(d, a, b);
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment