extern crate nalgebra as na;
use na::{Matrix4};
/// Multiply two 4×4 `f32` matrices, returning the product `m0 * m1`.
pub fn mul(m0: &Matrix4<f32>, m1: &Matrix4<f32>) -> Matrix4<f32> {
    // Explicit UFCS spelling of the `*` operator on matrix references;
    // desugars to exactly the same `Mul` impl call as `m0 * m1`.
    std::ops::Mul::mul(m0, m1)
}
-> output
pub fn mul(m0: &Matrix4<f32>, m1: &Matrix4<f32>) -> Matrix4<f32> {
movups xmm3, xmmword ptr [rsi]
movups xmm2, xmmword ptr [rsi + 16]
movups xmm1, xmmword ptr [rsi + 32]
movups xmm8, xmmword ptr [rsi + 48]
movss xmm4, dword ptr [rdx]
movss xmm5, dword ptr [rdx + 4]
shufps xmm4, xmm4, 0
mulps xmm4, xmm3
shufps xmm5, xmm5, 0
mulps xmm5, xmm2
addps xmm5, xmm4
movss xmm6, dword ptr [rdx + 8]
shufps xmm6, xmm6, 0
mulps xmm6, xmm1
addps xmm6, xmm5
movss xmm4, dword ptr [rdx + 12]
shufps xmm4, xmm4, 0
mulps xmm4, xmm8
addps xmm4, xmm6
movss xmm5, dword ptr [rdx + 16]
shufps xmm5, xmm5, 0
mulps xmm5, xmm3
movss xmm6, dword ptr [rdx + 20]
shufps xmm6, xmm6, 0
mulps xmm6, xmm2
addps xmm6, xmm5
movss xmm7, dword ptr [rdx + 24]
shufps xmm7, xmm7, 0
mulps xmm7, xmm1
addps xmm7, xmm6
movss xmm5, dword ptr [rdx + 28]
shufps xmm5, xmm5, 0
mulps xmm5, xmm8
addps xmm5, xmm7
movss xmm6, dword ptr [rdx + 32]
shufps xmm6, xmm6, 0
mulps xmm6, xmm3
movss xmm7, dword ptr [rdx + 36]
shufps xmm7, xmm7, 0
mulps xmm7, xmm2
addps xmm7, xmm6
movss xmm0, dword ptr [rdx + 40]
shufps xmm0, xmm0, 0
mulps xmm0, xmm1
addps xmm0, xmm7
movss xmm6, dword ptr [rdx + 44]
shufps xmm6, xmm6, 0
mulps xmm6, xmm8
addps xmm6, xmm0
movss xmm0, dword ptr [rdx + 48]
shufps xmm0, xmm0, 0
mulps xmm0, xmm3
movss xmm3, dword ptr [rdx + 52]
shufps xmm3, xmm3, 0
mulps xmm3, xmm2
addps xmm3, xmm0
movss xmm0, dword ptr [rdx + 56]
shufps xmm0, xmm0, 0
mulps xmm0, xmm1
addps xmm0, xmm3
movss xmm1, dword ptr [rdx + 60]
shufps xmm1, xmm1, 0
mulps xmm1, xmm8
addps xmm1, xmm0
movups xmmword ptr [rdi], xmm4
movups xmmword ptr [rdi + 16], xmm5
movups xmmword ptr [rdi + 32], xmm6
movups xmmword ptr [rdi + 48], xmm1
mov rax, rdi
ret
Manual C version
/* 4x4 single-precision matrix multiply: dest = A * B (row-major).
 *
 * Result row i is computed as sum over j of A[i*4+j] * (row j of B):
 * each scalar of A's row is broadcast across a vector (shuffle imm 0x00,
 * 0x55, 0xAA, 0xFF) and multiplied by a whole row of B, then summed.
 *
 * NOTE(review): _mm_load_ps / _mm_store_ps require dest, A and B to be
 * 16-byte aligned; passing unaligned pointers is undefined behavior
 * (this is also why the output below uses movaps, while the compiler's
 * version of the Rust code used movups). Use _mm_loadu_ps/_mm_storeu_ps
 * if alignment cannot be guaranteed. */
extern void M4x4_SSE(float* dest, const float* A, const float* B) {
/* The four rows of B stay resident in registers for all four result rows. */
__m128 row1 = _mm_load_ps(&B[0]);
__m128 row2 = _mm_load_ps(&B[4]);
__m128 row3 = _mm_load_ps(&B[8]);
__m128 row4 = _mm_load_ps(&B[12]);
/* The four rows of A; one scalar of a row is broadcast at a time. */
__m128 t0 = _mm_load_ps(&A[0]);
__m128 t1 = _mm_load_ps(&A[4]);
__m128 t2 = _mm_load_ps(&A[8]);
__m128 t3 = _mm_load_ps(&A[12]);
__m128 row;
__m128 brod1;
__m128 brod2;
__m128 brod3;
__m128 brod4;
/* dest row 0: broadcast A[0][0..3] and accumulate against B's rows. */
brod1 = _mm_shuffle_ps(t0, t0, _MM_SHUFFLE(0, 0, 0, 0));
brod2 = _mm_shuffle_ps(t0, t0, _MM_SHUFFLE(1, 1, 1, 1));
brod3 = _mm_shuffle_ps(t0, t0, _MM_SHUFFLE(2, 2, 2, 2));
brod4 = _mm_shuffle_ps(t0, t0, _MM_SHUFFLE(3, 3, 3, 3));
row = _mm_add_ps(_mm_add_ps(_mm_mul_ps(brod1, row1), _mm_mul_ps(brod2, row2)), _mm_add_ps( _mm_mul_ps(brod3, row3), _mm_mul_ps(brod4, row4)));
_mm_store_ps(&dest[0], row);
/* dest row 1 (same pattern with A's second row). */
brod1 = _mm_shuffle_ps(t1, t1, _MM_SHUFFLE(0, 0, 0, 0));
brod2 = _mm_shuffle_ps(t1, t1, _MM_SHUFFLE(1, 1, 1, 1));
brod3 = _mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2, 2, 2, 2));
brod4 = _mm_shuffle_ps(t1, t1, _MM_SHUFFLE(3, 3, 3, 3));
row = _mm_add_ps(_mm_add_ps(_mm_mul_ps(brod1, row1), _mm_mul_ps(brod2, row2)), _mm_add_ps( _mm_mul_ps(brod3, row3), _mm_mul_ps(brod4, row4)));
_mm_store_ps(&dest[4], row);
/* dest row 2. */
brod1 = _mm_shuffle_ps(t2, t2, _MM_SHUFFLE(0, 0, 0, 0));
brod2 = _mm_shuffle_ps(t2, t2, _MM_SHUFFLE(1, 1, 1, 1));
brod3 = _mm_shuffle_ps(t2, t2, _MM_SHUFFLE(2, 2, 2, 2));
brod4 = _mm_shuffle_ps(t2, t2, _MM_SHUFFLE(3, 3, 3, 3));
row = _mm_add_ps(_mm_add_ps(_mm_mul_ps(brod1, row1), _mm_mul_ps(brod2, row2)), _mm_add_ps( _mm_mul_ps(brod3, row3), _mm_mul_ps(brod4, row4)));
_mm_store_ps(&dest[8], row);
/* dest row 3. */
brod1 = _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(0, 0, 0, 0));
brod2 = _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(1, 1, 1, 1));
brod3 = _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(2, 2, 2, 2));
brod4 = _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(3, 3, 3, 3));
row = _mm_add_ps(_mm_add_ps(_mm_mul_ps(brod1, row1), _mm_mul_ps(brod2, row2)), _mm_add_ps( _mm_mul_ps(brod3, row3), _mm_mul_ps(brod4, row4)));
_mm_store_ps(&dest[12], row);
}
Output
M4x4_SSE: # @M4x4_SSE
.cfi_startproc
# %bb.0:
movaps xmm4, xmmword ptr [rdx]
movaps xmm10, xmmword ptr [rdx + 16]
movaps xmm9, xmmword ptr [rdx + 32]
movaps xmm8, xmmword ptr [rdx + 48]
movaps xmm7, xmmword ptr [rsi]
movaps xmm6, xmmword ptr [rsi + 16]
movaps xmm5, xmmword ptr [rsi + 32]
movaps xmm0, xmmword ptr [rsi + 48]
movaps xmm1, xmm7
shufps xmm1, xmm7, 0 # xmm1 = xmm1[0,0],xmm7[0,0]
movaps xmm2, xmm7
shufps xmm2, xmm7, 85 # xmm2 = xmm2[1,1],xmm7[1,1]
movaps xmm3, xmm7
shufps xmm3, xmm7, 170 # xmm3 = xmm3[2,2],xmm7[2,2]
shufps xmm7, xmm7, 255 # xmm7 = xmm7[3,3,3,3]
mulps xmm1, xmm4
mulps xmm2, xmm10
addps xmm2, xmm1
mulps xmm3, xmm9
mulps xmm7, xmm8
addps xmm7, xmm3
addps xmm7, xmm2
movaps xmmword ptr [rdi], xmm7
movaps xmm1, xmm6
shufps xmm1, xmm6, 0 # xmm1 = xmm1[0,0],xmm6[0,0]
movaps xmm2, xmm6
shufps xmm2, xmm6, 85 # xmm2 = xmm2[1,1],xmm6[1,1]
movaps xmm3, xmm6
shufps xmm3, xmm6, 170 # xmm3 = xmm3[2,2],xmm6[2,2]
shufps xmm6, xmm6, 255 # xmm6 = xmm6[3,3,3,3]
mulps xmm1, xmm4
mulps xmm2, xmm10
addps xmm2, xmm1
mulps xmm3, xmm9
mulps xmm6, xmm8
addps xmm6, xmm3
addps xmm6, xmm2
movaps xmmword ptr [rdi + 16], xmm6
movaps xmm1, xmm5
shufps xmm1, xmm5, 0 # xmm1 = xmm1[0,0],xmm5[0,0]
movaps xmm2, xmm5
shufps xmm2, xmm5, 85 # xmm2 = xmm2[1,1],xmm5[1,1]
movaps xmm3, xmm5
shufps xmm3, xmm5, 170 # xmm3 = xmm3[2,2],xmm5[2,2]
shufps xmm5, xmm5, 255 # xmm5 = xmm5[3,3,3,3]
mulps xmm1, xmm4
mulps xmm2, xmm10
addps xmm2, xmm1
mulps xmm3, xmm9
mulps xmm5, xmm8
addps xmm5, xmm3
addps xmm5, xmm2
movaps xmmword ptr [rdi + 32], xmm5
movaps xmm1, xmm0
shufps xmm1, xmm0, 0 # xmm1 = xmm1[0,0],xmm0[0,0]
movaps xmm2, xmm0
shufps xmm2, xmm0, 85 # xmm2 = xmm2[1,1],xmm0[1,1]
movaps xmm3, xmm0
shufps xmm3, xmm0, 170 # xmm3 = xmm3[2,2],xmm0[2,2]
shufps xmm0, xmm0, 255 # xmm0 = xmm0[3,3,3,3]
mulps xmm1, xmm4
mulps xmm2, xmm10
addps xmm2, xmm1
mulps xmm3, xmm9
mulps xmm0, xmm8
addps xmm0, xmm3
addps xmm0, xmm2
movaps xmmword ptr [rdi + 48], xmm0
ret