; Benchmark of an SSE2 sine calculation (based on Intel's Approximate Math Library) vs. the FPU fsin instruction
; Target: 32-bit Windows console executable, flat assembler (FASM) syntax.
; The PE output format has to be selected before `entry` and the PE-style
; `section` flags used in this file are accepted by the assembler.
format PE console

; NOTE(review): the include path was empty in the original. win32ax.inc is
; FASM's extended Win32 header set, which supplies proc/endp, cinvoke,
; library/import and `double [mem]` call arguments - confirm that this is
; the intended header and that FASM's INCLUDE directory is on the search path.
include 'win32ax.inc'

entry main

pow = 20                                ; log2 of the number of timed rounds per benchmark pass
;-----------------------------------------------------------------------
; main - program entry point (32-bit Windows, FASM proc macros, Intel syntax)
;
; Times two ways of computing sin([src]) with the CPU time-stamp counter:
;   pass 1: scalar SSE/SSE2 polynomial approximation
;   pass 2: x87 FSIN
; Each pass runs (1 shl pow) timed rounds of 50 unrolled evaluations,
; accumulates the per-round tick deltas in [total_time], then prints the
; average ticks per round and the last value stored in [dest].
;
; In:    nothing
; Out:   eax = 0
; Clobb: ecx, edx, xmm0-xmm7, x87 stack top, flags (esi saved by `uses`)
; Stack: one dword local used to move bit patterns between xmm and GPRs
;        (kept above esp; 32-bit Windows has no red zone below it)
;-----------------------------------------------------------------------
section '.code' code readable executable

proc main uses esi
        local   scratch:DWORD           ; float <-> integer bit transfer slot

        ;---------------- pass 1: SSE2 polynomial sine -----------------
        mov     esi, 1 shl pow          ; esi = timed rounds remaining
.loop:
        rdtsc                           ; edx:eax = time stamp at round start
        mov     dword[time], eax
        mov     dword[time + 4], edx

        ; Instruction order inside the unrolled body is kept exactly as
        ; scheduled in the original; only the scratch slot address changed.
        repeat 50
        movss   xmm0, [src]                     ; xmm0 = x
        movss   [scratch], xmm0                 ; spill x so its raw bits can be read
        movss   xmm1, [_ps_am_inv_sign_mask]
        mov     eax, [scratch]                  ; eax = raw bits of x
        mulss   xmm0, [_ps_am_2_o_pi]           ; xmm0 = x * (2/pi)
        andps   xmm0, xmm1                      ; xmm0 = |x * (2/pi)|
        and     eax, 0x80000000                 ; eax = sign bit of x
        cvttss2si ecx, xmm0                     ; ecx = integer part (quadrant count)
        movss   xmm1, [_ps_am_1]                ; xmm1 = 1.0
        mov     edx, ecx
        shl     edx, (31 - 1)                   ; quadrant bit 1 -> bit 31
        cvtsi2ss xmm2, ecx
        and     ecx, 0x1                        ; ecx = 1 for odd quadrants
        and     edx, 0x80000000                 ; edx = sign flip from quadrant bit 1
        subss   xmm0, xmm2                      ; xmm0 = fractional part f
        movss   xmm6, [_sincos_masks + ecx * 4] ; 0 (even) or all-ones (odd)
        minss   xmm0, xmm1                      ; clamp f to at most 1.0
        movss   xmm5, [_ps_sincos_p3]
        subss   xmm1, xmm0                      ; xmm1 = 1 - f
        andps   xmm1, xmm6                      ; keep 1 - f for odd quadrants
        andnps  xmm6, xmm0                      ; keep f for even quadrants
        orps    xmm1, xmm6                      ; t = odd ? 1 - f : f
        movss   xmm4, [_ps_sincos_p2]
        movss   xmm0, xmm1                      ; xmm0 = t
        mulss   xmm1, xmm1                      ; xmm1 = t^2
        movss   xmm7, [_ps_sincos_p1]
        xor     eax, edx                        ; eax = sign bit of the result
        movss   xmm2, xmm1                      ; xmm2 = t^2
        mulss   xmm1, xmm5                      ; p3*t^2
        movss   xmm5, [_ps_sincos_p0]
        mov     [scratch], eax                  ; sign bits back to memory
        addss   xmm1, xmm4                      ; p3*t^2 + p2
        mulss   xmm1, xmm2
        movss   xmm3, [scratch]                 ; xmm3 = sign bit pattern
        addss   xmm1, xmm7                      ; (...)*t^2 + p1
        mulss   xmm1, xmm2
        orps    xmm0, xmm3                      ; apply the sign to t
        addss   xmm1, xmm5                      ; (...)*t^2 + p0
        mulss   xmm0, xmm1                      ; result = signed t * polynomial(t^2)
        cvtss2sd xmm0, xmm0                     ; widen so printf can take it as double
        movlpd  [dest], xmm0
        end repeat

        rdtsc                           ; edx:eax = time stamp at round end
        sub     eax, dword[time]        ; 64-bit delta = end - start
        sbb     edx, dword[time + 4]
        add     dword[total_time], eax  ; 64-bit running total
        adc     dword[total_time + 4], edx
        dec     esi
        jnz     .loop

        fild    [total_time]            ; integer total -> st0
        fidiv   [divv]                  ; average ticks per round
        fstp    [total_time]            ; same slot now holds the average as a double
        cinvoke printf, fmt1, double[total_time], double[dest]

        ;---------------- pass 2: x87 FSIN -----------------------------
        xorps   xmm0, xmm0
        movlpd  [total_time], xmm0      ; reset the 64-bit accumulator to zero
        mov     esi, 1 shl pow          ; esi = timed rounds remaining
.loop2:
        rdtsc                           ; edx:eax = time stamp at round start
        mov     dword[time], eax
        mov     dword[time + 4], edx

        repeat 50
        fld     [src]
        fsin                            ; the instruction this pass is meant to time
        fstp    [dest]
        end repeat

        rdtsc                           ; edx:eax = time stamp at round end
        sub     eax, dword[time]
        sbb     edx, dword[time + 4]
        add     dword[total_time], eax
        adc     dword[total_time + 4], edx
        dec     esi
        jnz     .loop2

        fild    [total_time]
        fidiv   [divv]
        fstp    [total_time]
        cinvoke printf, fmt2, double[total_time], double[dest]

        cinvoke getch                   ; wait for a key so the console stays open

        xor     eax, eax                ; return 0
        ret
endp
section '.data' data readable writeable

; --- benchmark state ---------------------------------------------------
time            dq 0                    ; 64-bit start value saved at the top of each timed round
total_time      dq 0                    ; 64-bit sum of per-round deltas; main later overwrites
                                        ; it with the average stored as a double
divv            dd 1 shl pow            ; number of timed rounds, used as the divisor for the average

; --- printf formats (average, then last computed value) ----------------
fmt1            db 'SSE2 %lf sin(45 degree): %lf', 10, 0
fmt2            db 'FPU %lf sin(45 degree): %lf', 10, 0

; --- input / output ----------------------------------------------------
src             dd 0.78539816339744830961566084581988   ; input angle in radians (pi/4)
dest            dq ?                                    ; last result, stored as a double

; --- SIMD constant tables ----------------------------------------------
; Every table below is four identical lanes wide (16 bytes). The code in this
; file reads only the first lane with scalar instructions, but 16-byte
; alignment keeps the tables valid for packed or aligned memory operands too.
align 16
_ps_am_inv_sign_mask    dd 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF      ; clears the float sign bit
_ps_am_pi_o_2           dd 1.57079632679, 1.57079632679, 1.57079632679, 1.57079632679  ; pi/2 (not referenced by main)
_ps_am_2_o_pi           dd 0.63661977236, 0.63661977236, 0.63661977236, 0.63661977236  ; 2/pi
_epi32_1                dd 1, 1, 1, 1                                           ; not referenced by main
_ps_am_1                dd 1.0, 1.0, 1.0, 1.0
_epi32_2                dd 2, 2, 2, 2                                           ; not referenced by main

; Polynomial coefficients, highest order first; main evaluates
; t * (p0 + t^2 * (p1 + t^2 * (p2 + t^2 * p3))) on the reduced argument t.
_ps_sincos_p3           dd -0.00468175413, -0.00468175413, -0.00468175413, -0.00468175413
_ps_sincos_p2           dd 0.0796926262, 0.0796926262, 0.0796926262, 0.0796926262
_ps_sincos_p1           dd -0.64596409750621,-0.64596409750621,-0.64596409750621,-0.64596409750621
_ps_sincos_p0           dd 1.570796326794896, 1.570796326794896, 1.570796326794896, 1.570796326794896

; Select mask indexed by (quadrant and 1): entry 0 = all zeros, entry 1 = all ones.
_sincos_masks           dd 0x0, not 0x0
; Import table for the two C runtime routines that main calls through cinvoke.
section '.idata' import data readable writeable
; Bind the library handle name `msvcrt` to msvcrt.dll.
library msvcrt, 'msvcrt.dll'
; Each pair is <name used in this file>, <symbol exported by the DLL>;
; note that `getch` maps to the DLL's `_getch`.
import msvcrt,\
printf, 'printf',\
getch, '_getch'
