Assembly generated from code implemented here
# Unlabeled routine, VEX/AVX encoding.
# Reads two 4-float vectors at [rsi] and [rsi + 16] and one 4-float vector at
# [rdx], and stores a 4-float result at [rdi].
#
# In:    rdi = destination (16 bytes written)
#        rsi = first operand (32 bytes read)
#        rdx = second operand (16 bytes read)
#        Every memory access is a vmovaps, so all three addresses must be
#        16-byte aligned.
# Out:   rax = rdi, [rdi] = result
# Clobb: xmm0-xmm9
# NOTE(review): register usage is consistent with the System V AMD64 ABI with
# the result returned through a hidden pointer - confirm against the original
# source, which is not part of this listing.
#
# Notation used below: a = [rsi], b = [rsi + 16], p = [rdx], s = a*a lane-wise.
# x[i,j,k,l] is the vector (x_i, x_j, x_k, x_l). All arithmetic is lane-wise.
# Result = p[0,2,3,1]*T1 + p[0,3,1,2]*T2 + p*T0 + p[0,0,0,0]*T3.
mov rax, rdi
vmovaps xmm0, xmmword ptr [rsi]
vmovaps xmm1, xmmword ptr [rsi + 16]
# --- T1 = (a*a[0,2,3,1] - a[0,0,0,0]*a[0,3,1,2]) * [0,2,2,2]  -> xmm8
vpermilps xmm2, xmm0, 120 # xmm2 = xmm0[0,2,3,1]
vmulps xmm3, xmm0, xmm2
vpermilps xmm4, xmm0, 0 # xmm4 = xmm0[0,0,0,0]
vpermilps xmm5, xmm0, 156 # xmm5 = xmm0[0,3,1,2]
vmulps xmm6, xmm4, xmm5
vsubps xmm3, xmm3, xmm6
# xmm6 is reused from here on as the [0,2,2,2] scale constant.
vmovaps xmm6, xmmword ptr [rip + .LCPI1_0] # xmm6 = [0.0E+0,2.0E+0,2.0E+0,2.0E+0]
vmulps xmm8, xmm3, xmm6
# --- T2 = (a[0,0,0,0]*a[0,2,3,1] + a*a[0,3,1,2]) * [0,2,2,2]  -> xmm9
vmulps xmm7, xmm4, xmm2
vmulps xmm3, xmm0, xmm5
vaddps xmm3, xmm7, xmm3
vmulps xmm9, xmm3, xmm6
# --- T0 = (s[0,0,0,0] + s[1,1,2,3]) + (s[2,2,1,1] + s[3,3,3,2]) * .LCPI1_1  -> xmm0
# xmm0 is overwritten with the squares s; a itself is no longer needed
# (its shuffles live on in xmm2, xmm4 and xmm5).
vmulps xmm0, xmm0, xmm0
vpermilps xmm7, xmm0, 0 # xmm7 = xmm0[0,0,0,0]
vpermilps xmm3, xmm0, 229 # xmm3 = xmm0[1,1,2,3]
vaddps xmm3, xmm7, xmm3
vpermilps xmm7, xmm0, 90 # xmm7 = xmm0[2,2,1,1]
vpermilps xmm0, xmm0, 191 # xmm0 = xmm0[3,3,3,2]
vaddps xmm0, xmm7, xmm0
# The contents of .LCPI1_1 are not shown in this listing.
vmulps xmm0, xmm0, xmmword ptr [rip + .LCPI1_1]
vaddps xmm0, xmm3, xmm0
# --- T3 = (a[0,3,1,2]*b[0,2,1,3] - a[0,0,0,0]*b[0,3,2,1]
#           - a[0,2,3,1]*b[0,1,3,2]) * [0,2,2,2]  -> xmm1
vpermilps xmm3, xmm1, 216 # xmm3 = xmm1[0,2,1,3]
vmulps xmm3, xmm5, xmm3
vpermilps xmm5, xmm1, 108 # xmm5 = xmm1[0,3,2,1]
vmulps xmm4, xmm4, xmm5
vsubps xmm3, xmm3, xmm4
vpermilps xmm1, xmm1, 180 # xmm1 = xmm1[0,1,3,2]
vmulps xmm1, xmm2, xmm1
vsubps xmm1, xmm3, xmm1
vmulps xmm1, xmm1, xmm6
# --- Combine the four terms with p and store the result.
vmovaps xmm2, xmmword ptr [rdx]
vpermilps xmm3, xmm2, 120 # xmm3 = xmm2[0,2,3,1]
vmulps xmm3, xmm3, xmm8
vpermilps xmm4, xmm2, 156 # xmm4 = xmm2[0,3,1,2]
vmulps xmm4, xmm4, xmm9
vaddps xmm3, xmm3, xmm4
vmulps xmm0, xmm2, xmm0
vaddps xmm0, xmm3, xmm0
vpermilps xmm2, xmm2, 0 # xmm2 = xmm2[0,0,0,0]
vmulps xmm1, xmm2, xmm1
vaddps xmm0, xmm0, xmm1
vmovaps xmmword ptr [rdi], xmm0
ret
# Unlabeled routine, VEX/AVX encoding.
# Reads two 4-float vectors at [rsi] and [rsi + 16] and one 4-float vector at
# [rdx], and stores a 4-float result at [rdi].
#
# In:    rdi = destination (16 bytes written)
#        rsi = first operand (32 bytes read)
#        rdx = second operand (16 bytes read)
#        Every memory access is a vmovaps, so all three addresses must be
#        16-byte aligned.
# Out:   rax = rdi, [rdi] = result
# Clobb: xmm0-xmm9
# NOTE(review): register usage is consistent with the System V AMD64 ABI with
# the result returned through a hidden pointer - confirm against the original
# source, which is not part of this listing.
#
# Notation used below: a = [rsi], b = [rsi + 16], p = [rdx], s = a*a lane-wise.
# x[i,j,k,l] is the vector (x_i, x_j, x_k, x_l). All arithmetic is lane-wise.
# Result = D + (p[2,0,1,3]*T1 + p[1,2,0,3]*T2 + p*T0), where D holds the
# 3-lane dot product of T3 and p in lane 3 and zero in lanes 0-2.
mov rax, rdi
vmovaps xmm2, xmmword ptr [rsi]
vmovaps xmm0, xmmword ptr [rsi + 16]
# --- T1 = (a[0,0,0,0]*a[2,3,1,0] + a[1,1,2,1]*a[3,2,3,1]) * [2,2,2,1]  -> xmm8
vpermilps xmm3, xmm2, 0 # xmm3 = xmm2[0,0,0,0]
vpermilps xmm1, xmm2, 30 # xmm1 = xmm2[2,3,1,0]
vmulps xmm4, xmm3, xmm1
vpermilps xmm5, xmm2, 101 # xmm5 = xmm2[1,1,2,1]
vpermilps xmm6, xmm2, 123 # xmm6 = xmm2[3,2,3,1]
vmulps xmm5, xmm5, xmm6
vaddps xmm4, xmm4, xmm5
# xmm9 holds the [2,2,2,1] scale constant for the rest of the routine.
vmovaps xmm9, xmmword ptr [rip + .LCPI0_0] # xmm9 = [2.0E+0,2.0E+0,2.0E+0,1.0E+0]
vmulps xmm8, xmm4, xmm9
# --- T2 = (a[1,2,1,2]*a[2,3,3,2] + a[0,0,0,3]*a[3,1,2,3]*.LCPI0_1) * [2,2,2,1]  -> xmm4
vpermilps xmm6, xmm2, 153 # xmm6 = xmm2[1,2,1,2]
vpermilps xmm7, xmm2, 190 # xmm7 = xmm2[2,3,3,2]
vmulps xmm6, xmm6, xmm7
vpermilps xmm7, xmm2, 192 # xmm7 = xmm2[0,0,0,3]
vpermilps xmm4, xmm2, 231 # xmm4 = xmm2[3,1,2,3]
vmulps xmm4, xmm7, xmm4
# The contents of .LCPI0_1 are not shown in this listing.
vmulps xmm4, xmm4, xmmword ptr [rip + .LCPI0_1]
vaddps xmm4, xmm6, xmm4
vmulps xmm4, xmm4, xmm9
# --- T0 = (s[0,0,0,0] + s[1,2,3,1] - s[2,1,1,2] - s[3,3,2,3]) * .LCPI0_2  -> xmm5
vmulps xmm6, xmm2, xmm2
vpermilps xmm7, xmm6, 0 # xmm7 = xmm6[0,0,0,0]
vpermilps xmm5, xmm6, 121 # xmm5 = xmm6[1,2,3,1]
vaddps xmm5, xmm7, xmm5
vpermilps xmm7, xmm6, 150 # xmm7 = xmm6[2,1,1,2]
vsubps xmm5, xmm5, xmm7
vpermilps xmm6, xmm6, 239 # xmm6 = xmm6[3,3,2,3]
vsubps xmm5, xmm5, xmm6
# The contents of .LCPI0_2 are not shown in this listing.
vmulps xmm5, xmm5, xmmword ptr [rip + .LCPI0_2]
# --- T3 = (a[0,0,0,0]*b[3,2,1,0] + a[3,1,2,0]*b[2,1,3,0]
#           - a[2,3,1,0]*b[1,3,2,0]) * [2,2,2,1]  -> xmm0
vpermilps xmm6, xmm0, 27 # xmm6 = xmm0[3,2,1,0]
vmulps xmm3, xmm3, xmm6
vpermilps xmm2, xmm2, 39 # xmm2 = xmm2[3,1,2,0]
vpermilps xmm6, xmm0, 54 # xmm6 = xmm0[2,1,3,0]
vmulps xmm2, xmm2, xmm6
vaddps xmm2, xmm3, xmm2
vpermilps xmm0, xmm0, 45 # xmm0 = xmm0[1,3,2,0]
vmulps xmm0, xmm1, xmm0
vsubps xmm0, xmm2, xmm0
vmulps xmm0, xmm0, xmm9
# --- Combine the four terms with p and store the result.
vmovaps xmm1, xmmword ptr [rdx]
vpermilps xmm2, xmm1, 210 # xmm2 = xmm1[2,0,1,3]
vmulps xmm2, xmm2, xmm8
vpermilps xmm3, xmm1, 201 # xmm3 = xmm1[1,2,0,3]
vmulps xmm3, xmm3, xmm4
vaddps xmm2, xmm2, xmm3
# imm8 = 120 (0x78): multiply lanes 0-2 only, write the sum to lane 3 and
# zero lanes 0-2 of the destination.
vdpps xmm0, xmm0, xmm1, 120
vmulps xmm1, xmm1, xmm5
vaddps xmm1, xmm2, xmm1
vaddps xmm0, xmm0, xmm1
vmovaps xmmword ptr [rdi], xmm0
# Return to the caller. Without this instruction execution would run past the
# store into whatever code follows this routine.
ret
# Unlabeled routine, legacy SSE encoding (two-operand forms, so values that
# must survive are copied with movaps before being shuffled or multiplied).
# Reads two 4-float vectors at [rsi] and [rsi + 16] and one 4-float vector at
# [rdx], and stores a 4-float result at [rdi].
#
# In:    rdi = destination (16 bytes written)
#        rsi = first operand (32 bytes read)
#        rdx = second operand (16 bytes read)
#        Every memory access is a movaps, so all three addresses must be
#        16-byte aligned.
# Out:   rax = rdi, [rdi] = result
# Clobb: xmm0-xmm9
# NOTE(review): register usage is consistent with the System V AMD64 ABI with
# the result returned through a hidden pointer - confirm against the original
# source, which is not part of this listing.
#
# Notation used below: a = [rsi], b = [rsi + 16], p = [rdx], s = a*a lane-wise.
# x[i,j,k,l] is the vector (x_i, x_j, x_k, x_l). All arithmetic is lane-wise.
# Result = p[0,0,0,0]*T3 + (p*T0 + (p[0,3,1,2]*T2 + p[0,2,3,1]*T1)).
mov rax, rdi
movaps xmm0, xmmword ptr [rsi]
movaps xmm2, xmmword ptr [rsi + 16]
# xmm9 = a[0,2,3,1], kept until the T3 computation.
movaps xmm9, xmm0
shufps xmm9, xmm0, 120 # xmm9 = xmm9[0,2],xmm0[3,1]
# --- T1 = (a*a[0,2,3,1] - a[0,0,0,0]*a[0,3,1,2]) * [0,2,2,2]  -> xmm1
movaps xmm1, xmm0
mulps xmm1, xmm9
movaps xmm5, xmm0
shufps xmm5, xmm0, 0 # xmm5 = xmm5[0,0],xmm0[0,0]
movaps xmm4, xmm0
shufps xmm4, xmm0, 156 # xmm4 = xmm4[0,3],xmm0[1,2]
# Second copy of the a[0,0,0,0] broadcast, used for T2 below.
movaps xmm6, xmm5
# xmm7 = a[0,0,0,0]*b[0,3,2,1]: computed early, while the broadcast in xmm5 is
# still intact; it is subtracted in the T3 computation.
movaps xmm7, xmm2
shufps xmm7, xmm2, 108 # xmm7 = xmm7[0,3],xmm2[2,1]
mulps xmm7, xmm5
mulps xmm5, xmm4
subps xmm1, xmm5
movaps xmm8, xmmword ptr [rip + .LCPI1_0] # xmm8 = [0.0E+0,2.0E+0,2.0E+0,2.0E+0]
mulps xmm1, xmm8
# --- T2 = (a*a[0,3,1,2] + a[0,0,0,0]*a[0,2,3,1]) * [0,2,2,2]  -> xmm5
mulps xmm6, xmm9
movaps xmm5, xmm0
mulps xmm5, xmm4
addps xmm5, xmm6
mulps xmm5, xmm8
# --- T0 = (s[3,3,3,2] + s[2,2,1,1]) * .LCPI1_1 + (s[1,1,2,3] + s[0,0,0,0])  -> xmm0
# xmm0 is overwritten with the squares s; a is no longer needed directly.
mulps xmm0, xmm0
movaps xmm6, xmm0
shufps xmm6, xmm0, 0 # xmm6 = xmm6[0,0],xmm0[0,0]
movaps xmm3, xmm0
shufps xmm3, xmm0, 229 # xmm3 = xmm3[1,1],xmm0[2,3]
addps xmm3, xmm6
movaps xmm6, xmm0
shufps xmm6, xmm0, 90 # xmm6 = xmm6[2,2],xmm0[1,1]
shufps xmm0, xmm0, 191 # xmm0 = xmm0[3,3,3,2]
addps xmm0, xmm6
# The contents of .LCPI1_1 are not shown in this listing.
mulps xmm0, xmmword ptr [rip + .LCPI1_1]
addps xmm0, xmm3
# --- T3 = (b[0,2,1,3]*a[0,3,1,2] - a[0,0,0,0]*b[0,3,2,1]
#           - b[0,1,3,2]*a[0,2,3,1]) * [0,2,2,2]  -> xmm3
movaps xmm3, xmm2
shufps xmm3, xmm2, 216 # xmm3 = xmm3[0,2],xmm2[1,3]
mulps xmm3, xmm4
subps xmm3, xmm7
shufps xmm2, xmm2, 180 # xmm2 = xmm2[0,1,3,2]
mulps xmm2, xmm9
subps xmm3, xmm2
mulps xmm3, xmm8
# --- Combine the four terms with p and store the result.
movaps xmm2, xmmword ptr [rdx]
movaps xmm4, xmm2
shufps xmm4, xmm2, 120 # xmm4 = xmm4[0,2],xmm2[3,1]
mulps xmm4, xmm1
movaps xmm1, xmm2
shufps xmm1, xmm2, 156 # xmm1 = xmm1[0,3],xmm2[1,2]
mulps xmm1, xmm5
addps xmm1, xmm4
mulps xmm0, xmm2
addps xmm0, xmm1
shufps xmm2, xmm2, 0 # xmm2 = xmm2[0,0,0,0]
mulps xmm2, xmm3
addps xmm2, xmm0
movaps xmmword ptr [rdi], xmm2
ret
# Unlabeled routine, legacy SSE encoding (two-operand forms, so values that
# must survive are copied with movaps before being shuffled or multiplied).
# Reads two 4-float vectors at [rsi] and [rsi + 16] and one 4-float vector at
# [rdx], and stores a 4-float result at [rdi].
#
# In:    rdi = destination (16 bytes written)
#        rsi = first operand (32 bytes read)
#        rdx = second operand (16 bytes read)
#        Every memory access is a movaps, so all three addresses must be
#        16-byte aligned.
# Out:   rax = rdi, [rdi] = result
# Clobb: xmm0-xmm9
# NOTE(review): register usage is consistent with the System V AMD64 ABI with
# the result returned through a hidden pointer - confirm against the original
# source, which is not part of this listing.
#
# Notation used below: a = [rsi], b = [rsi + 16], p = [rdx], s = a*a lane-wise.
# x[i,j,k,l] is the vector (x_i, x_j, x_k, x_l). All arithmetic is lane-wise.
# Result = D + (T0*p + (p[1,2,0,3]*T2 + p[2,0,1,3]*T1)), where D holds the
# 3-lane dot product of T3 and p in lane 3 and zero in lanes 0-2.
mov rax, rdi
movaps xmm1, xmmword ptr [rsi]
movaps xmm0, xmmword ptr [rsi + 16]
movaps xmm3, xmm1
shufps xmm3, xmm1, 0 # xmm3 = xmm3[0,0],xmm1[0,0]
# xmm9 = a[2,3,1,0], kept until the T3 computation.
movaps xmm9, xmm1
shufps xmm9, xmm1, 30 # xmm9 = xmm9[2,3],xmm1[1,0]
# xmm4 = b[3,2,1,0]*a[0,0,0,0]: computed early, while the broadcast in xmm3 is
# still intact; it is added in the T3 computation.
movaps xmm4, xmm0
shufps xmm4, xmm0, 27 # xmm4 = xmm4[3,2],xmm0[1,0]
mulps xmm4, xmm3
# --- T1 = (a[3,2,3,1]*a[1,1,2,1] + a[0,0,0,0]*a[2,3,1,0]) * [2,2,2,1]  -> xmm6
mulps xmm3, xmm9
movaps xmm5, xmm1
shufps xmm5, xmm1, 101 # xmm5 = xmm5[1,1],xmm1[2,1]
movaps xmm6, xmm1
shufps xmm6, xmm1, 123 # xmm6 = xmm6[3,2],xmm1[3,1]
mulps xmm6, xmm5
addps xmm6, xmm3
movaps xmm8, xmmword ptr [rip + .LCPI0_0] # xmm8 = [2.0E+0,2.0E+0,2.0E+0,1.0E+0]
mulps xmm6, xmm8
# --- T2 = (a[3,0,0,3]*a*.LCPI0_1 + a[2,3,3,2]*a[1,2,1,2]) * [2,2,2,1]  -> xmm5
movaps xmm5, xmm1
shufps xmm5, xmm1, 153 # xmm5 = xmm5[1,2],xmm1[1,2]
movaps xmm7, xmm1
shufps xmm7, xmm1, 190 # xmm7 = xmm7[2,3],xmm1[3,2]
mulps xmm7, xmm5
movaps xmm5, xmm1
shufps xmm5, xmm1, 195 # xmm5 = xmm5[3,0],xmm1[0,3]
mulps xmm5, xmm1
# The contents of .LCPI0_1 are not shown in this listing.
mulps xmm5, xmmword ptr [rip + .LCPI0_1]
addps xmm5, xmm7
mulps xmm5, xmm8
# --- T0 = (s[1,2,3,1] + s[0,0,0,0] - s[2,1,1,2] - s[3,3,2,3]) * .LCPI0_2  -> xmm7
movaps xmm3, xmm1
mulps xmm3, xmm1
movaps xmm2, xmm3
shufps xmm2, xmm3, 0 # xmm2 = xmm2[0,0],xmm3[0,0]
movaps xmm7, xmm3
shufps xmm7, xmm3, 121 # xmm7 = xmm7[1,2],xmm3[3,1]
addps xmm7, xmm2
movaps xmm2, xmm3
shufps xmm2, xmm3, 150 # xmm2 = xmm2[2,1],xmm3[1,2]
subps xmm7, xmm2
shufps xmm3, xmm3, 239 # xmm3 = xmm3[3,3,2,3]
subps xmm7, xmm3
# The contents of .LCPI0_2 are not shown in this listing.
mulps xmm7, xmmword ptr [rip + .LCPI0_2]
# --- T3 = (b[2,1,3,0]*a[3,1,2,0] + b[3,2,1,0]*a[0,0,0,0]
#           - b[1,3,2,0]*a[2,3,1,0]) * [2,2,2,1]  -> xmm2
shufps xmm1, xmm1, 39 # xmm1 = xmm1[3,1,2,0]
movaps xmm2, xmm0
shufps xmm2, xmm0, 54 # xmm2 = xmm2[2,1],xmm0[3,0]
mulps xmm2, xmm1
addps xmm2, xmm4
shufps xmm0, xmm0, 45 # xmm0 = xmm0[1,3,2,0]
mulps xmm0, xmm9
subps xmm2, xmm0
mulps xmm2, xmm8
# --- Combine the four terms with p and store the result.
movaps xmm0, xmmword ptr [rdx]
# Keep a second copy of p: xmm0 and xmm1 are each shuffled in place below.
movaps xmm1, xmm0
mulps xmm7, xmm0
# imm8 = 120 (0x78): multiply lanes 0-2 only, write the sum to lane 3 and
# zero lanes 0-2 of the destination.
dpps xmm2, xmm0, 120
shufps xmm0, xmm0, 210 # xmm0 = xmm0[2,0,1,3]
mulps xmm0, xmm6
shufps xmm1, xmm1, 201 # xmm1 = xmm1[1,2,0,3]
mulps xmm1, xmm5
addps xmm1, xmm0
addps xmm7, xmm1
addps xmm2, xmm7
movaps xmmword ptr [rdi], xmm2
ret