Skip to content

Instantly share code, notes, and snippets.

Last active October 23, 2019 21:10
Show Gist options
  • Save pendingchaos/0fe82d6d264cb68cb5f4 to your computer and use it in GitHub Desktop.
Save pendingchaos/0fe82d6d264cb68cb5f4 to your computer and use it in GitHub Desktop.
SSE 4x4 matrix multiplication
#include <xmmintrin.h>
//Computes one row of a 4x4 product: the sum of the four rows of the right-hand
//matrix (row0..row3), each scaled by the matching entry of `coeffs`.
//Terms are accumulated left to right: ((c0*row0 + c1*row1) + c2*row2) + c3*row3.
static inline __m128 mul_row_sse(float coeffs[4],
                                 __m128 row0, __m128 row1,
                                 __m128 row2, __m128 row3)
{
    __m128 acc = _mm_mul_ps(row0, _mm_set1_ps(coeffs[0]));
    acc = _mm_add_ps(acc, _mm_mul_ps(row1, _mm_set1_ps(coeffs[1])));
    acc = _mm_add_ps(acc, _mm_mul_ps(row2, _mm_set1_ps(coeffs[2])));
    acc = _mm_add_ps(acc, _mm_mul_ps(row3, _mm_set1_ps(coeffs[3])));
    return acc;
}

//SSE 4x4 single-precision matrix multiplication: result = a * b.
//Author's note: ~2.4x faster than non-SSE unrolled version.
//Uses row-major order (D3D or non-OpenGL layout).
//
//result: receives the 4x4 product.
//a:      left-hand matrix; each entry a[i][k] is broadcast as a scalar weight.
//b:      right-hand matrix; its rows are loaded as 4-float vectors.
//
//Unaligned loads/stores are used, so the arrays need no 16-byte alignment.
//Every input is read before the first store, so `result` may be the same
//array as `a` or `b`.
void mul(float result[4][4], float a[4][4], float b[4][4])
{
    //Load the right-hand matrix once; every output row reuses all four rows.
    __m128 otherRow0 = _mm_loadu_ps(b[0]);
    __m128 otherRow1 = _mm_loadu_ps(b[1]);
    __m128 otherRow2 = _mm_loadu_ps(b[2]);
    __m128 otherRow3 = _mm_loadu_ps(b[3]);

    //Output row i is the combination of b's rows weighted by row i of a.
    __m128 newRow0 = mul_row_sse(a[0], otherRow0, otherRow1, otherRow2, otherRow3);
    __m128 newRow1 = mul_row_sse(a[1], otherRow0, otherRow1, otherRow2, otherRow3);
    __m128 newRow2 = mul_row_sse(a[2], otherRow0, otherRow1, otherRow2, otherRow3);
    __m128 newRow3 = mul_row_sse(a[3], otherRow0, otherRow1, otherRow2, otherRow3);

    //Store only after all rows are computed so in-place use stays correct.
    _mm_storeu_ps(result[0], newRow0);
    _mm_storeu_ps(result[1], newRow1);
    _mm_storeu_ps(result[2], newRow2);
    _mm_storeu_ps(result[3], newRow3);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment