Last active
October 23, 2019 21:10
-
-
Save pendingchaos/0fe82d6d264cb68cb5f4 to your computer and use it in GitHub Desktop.
SSE 4x4 matrix multiplication
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <xmmintrin.h>
// ~2.4x faster than the non-SSE unrolled version.
// Uses row-major order (the D3D-style layout, i.e. not the OpenGL column-major layout).
// Computes the 4x4 matrix product result = a * b using SSE.
//
// Matrices are indexed [row][column]; each output row i is the sum over k of
// a[i][k] * (row k of b).
//
// result: receives the product.
// a:      left-hand operand.
// b:      right-hand operand.
//
// Unaligned loads and stores are used, so the arrays need no 16-byte
// alignment. Every input element is read before anything is written, so
// result may be the same array as a or b.
void mul(float result[4][4], float a[4][4], float b[4][4])
{
    // Load b once; each of its rows is reused for all four output rows.
    const __m128 b_row0 = _mm_loadu_ps(b[0]);
    const __m128 b_row1 = _mm_loadu_ps(b[1]);
    const __m128 b_row2 = _mm_loadu_ps(b[2]);
    const __m128 b_row3 = _mm_loadu_ps(b[3]);

    // Products are held in registers/locals until all of a has been read,
    // which keeps in-place multiplication safe.
    __m128 out[4];
    for (int i = 0; i < 4; i++) {
        // Broadcast each scalar of row i of a and scale the matching row of b.
        __m128 acc = _mm_mul_ps(b_row0, _mm_set1_ps(a[i][0]));
        acc = _mm_add_ps(acc, _mm_mul_ps(b_row1, _mm_set1_ps(a[i][1])));
        acc = _mm_add_ps(acc, _mm_mul_ps(b_row2, _mm_set1_ps(a[i][2])));
        acc = _mm_add_ps(acc, _mm_mul_ps(b_row3, _mm_set1_ps(a[i][3])));
        out[i] = acc;
    }

    for (int i = 0; i < 4; i++) {
        _mm_storeu_ps(result[i], out[i]);
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment