Skip to content

Instantly share code, notes, and snippets.

@IJzerbaard
Created August 6, 2019 13:13
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save IJzerbaard/290e51c8df9f1e509609229a9541af67 to your computer and use it in GitHub Desktop.
Save IJzerbaard/290e51c8df9f1e509609229a9541af67 to your computer and use it in GitHub Desktop.
/*
 * Build the three lane-permuted copies of one 4-float group that the
 * cofactor expansion needs.  For output lane j, out[0], out[1] and out[2]
 * hold the first, second and third element of the group that remain once
 * element j has been deleted:
 *
 *   out[0] = { g[1], g[0], g[0], g[0] }
 *   out[1] = { g[2], g[2], g[1], g[1] }
 *   out[2] = { g[3], g[3], g[3], g[2] }
 */
static inline void cofactor_sse_spread(__m128 group, __m128 out[3])
{
    out[0] = _mm_shuffle_ps(group, group, _MM_SHUFFLE(0, 0, 0, 1));
    out[1] = _mm_shuffle_ps(group, group, _MM_SHUFFLE(1, 1, 2, 2));
    out[2] = _mm_shuffle_ps(group, group, _MM_SHUFFLE(2, 3, 3, 3));
}

/*
 * Evaluate, in all four lanes at once, the 3x3 determinant whose rows are
 * a, b and c (each given as the three permuted copies produced by
 * cofactor_sse_spread).  Lane j therefore receives the unsigned minor for
 * deleted element j.  The three positive permutation terms are summed
 * first and the three negative ones subtracted afterwards.
 */
static inline __m128 cofactor_sse_minors(const __m128 a[3], const __m128 b[3],
                                         const __m128 c[3])
{
    __m128 acc = _mm_mul_ps(_mm_mul_ps(a[0], b[1]), c[2]);
    acc = _mm_add_ps(acc, _mm_mul_ps(_mm_mul_ps(a[1], b[2]), c[0]));
    acc = _mm_add_ps(acc, _mm_mul_ps(_mm_mul_ps(a[2], b[0]), c[1]));
    acc = _mm_sub_ps(acc, _mm_mul_ps(_mm_mul_ps(a[2], b[1]), c[0]));
    acc = _mm_sub_ps(acc, _mm_mul_ps(_mm_mul_ps(a[1], b[0]), c[2]));
    acc = _mm_sub_ps(acc, _mm_mul_ps(_mm_mul_ps(a[0], b[2]), c[1]));
    return acc;
}

/*
 * Compute the cofactor matrix of a 4x4 float matrix with SSE.
 *
 * src: 16 floats, read as four consecutive groups of four.
 * dst: 16 floats; dst[4*i + j] receives (-1)^(i+j) times the determinant of
 *      the 3x3 matrix left after deleting group i and lane j of src.
 *
 * The output uses the same storage layout as the input.  Since the cofactor
 * matrix of a transpose is the transpose of the cofactor matrix, the result
 * is correct whether the caller treats the groups as rows or as columns.
 *
 * Neither pointer needs to be 16-byte aligned: unaligned load/store
 * intrinsics are used because the signature gives no alignment guarantee.
 * All of src is read before anything is written, so dst may be the same
 * array as src.
 *
 * Requires the SSE intrinsics header <xmmintrin.h>.
 */
void cofactorSSE(const float src[16], float dst[16]) {
    __m128 spread[4][3];

    for (int i = 0; i < 4; i++) {
        cofactor_sse_spread(_mm_loadu_ps(&src[4 * i]), spread[i]);
    }

    /* XOR with -0.0f flips the sign bit of a lane; +0.0f leaves it alone.
     * _mm_set_ps lists lanes from highest to lowest, so groups 0 and 2 get
     * the sign pattern + - + - and groups 1 and 3 get - + - +. */
    const __m128 flip_odd_lanes = _mm_set_ps(-0.0f, 0.0f, -0.0f, 0.0f);
    const __m128 flip_even_lanes = _mm_set_ps(0.0f, -0.0f, 0.0f, -0.0f);

    /* Each output group uses the three input groups other than itself. */
    const __m128 minors0 = cofactor_sse_minors(spread[1], spread[2], spread[3]);
    const __m128 minors1 = cofactor_sse_minors(spread[0], spread[2], spread[3]);
    const __m128 minors2 = cofactor_sse_minors(spread[0], spread[1], spread[3]);
    const __m128 minors3 = cofactor_sse_minors(spread[0], spread[1], spread[2]);

    _mm_storeu_ps(&dst[0], _mm_xor_ps(minors0, flip_odd_lanes));
    _mm_storeu_ps(&dst[4], _mm_xor_ps(minors1, flip_even_lanes));
    _mm_storeu_ps(&dst[8], _mm_xor_ps(minors2, flip_odd_lanes));
    _mm_storeu_ps(&dst[12], _mm_xor_ps(minors3, flip_even_lanes));
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment