Skip to content

Instantly share code, notes, and snippets.

@RIscRIpt
Created June 23, 2014 12:35
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save RIscRIpt/315d331bd44753e0175c to your computer and use it in GitHub Desktop.
Save RIscRIpt/315d331bd44753e0175c to your computer and use it in GitHub Desktop.
Benchmark of an SSE2 sine calculation (based on Intel's Approximate Math Library) vs. the FPU fsin instruction
; Syntax: flat assembler (FASM). Output is a PE executable for the console
; subsystem.
format PE CONSOLE
; Macro package that supplies proc/endp, cinvoke and library/import used below.
include 'win32ax.inc'
; Program entry point is the main procedure.
entry main
; log2 of the number of timed passes: each benchmark loop runs (1 shl pow)
; passes, and the accumulated cycle count is divided by the same value.
pow = 20
; Executable code section.
section '.code' code readable executable
; ---------------------------------------------------------------------------
; Timing helpers shared by both benchmark loops.
;
; tsc_begin  stores the current time-stamp counter (edx:eax) in the qword
;            variable "time".
; tsc_end    reads the counter again, subtracts "time" and adds the 64-bit
;            difference to the qword variable "total_time".
;
; Both clobber eax and edx; tsc_end also clobbers the flags.
; NOTE: rdtsc is not a serializing instruction, so the measured interval is
; approximate; no fence is added here to keep the measurement as it was.
; ---------------------------------------------------------------------------
macro tsc_begin
{
        rdtsc
        mov     dword [time], eax
        mov     dword [time + 4], edx
}

macro tsc_end
{
        rdtsc
        sub     eax, dword [time]               ; 64-bit subtract: low half ...
        sbb     edx, dword [time + 4]           ; ... then high half with borrow
        add     dword [total_time], eax         ; 64-bit accumulate
        adc     dword [total_time + 4], edx
}

; ---------------------------------------------------------------------------
; main - program entry point.
;
; Runs two benchmarks over the same single-precision input [src]:
;   1. a scalar SSE/SSE2 polynomial sine approximation,
;   2. the x87 fsin instruction.
; Each benchmark executes (1 shl pow) timed passes of 50 unrolled evaluations,
; accumulates the rdtsc deltas in [total_time], divides the sum by [divv] and
; prints that average together with the last result stored in [dest].
; Finally waits for a key press.
;
; In:     nothing
; Out:    eax = 0 (process exit code)
; Saves:  esi (pass counter; callee-saved, preserved through "uses")
; Clobb:  eax, ecx, edx, xmm0-xmm7, x87 state, flags
; Stack:  one dword frame local (sign_scratch) used to move bit patterns
;         between the integer and SSE register files. It replaces stores to
;         [esp - 4], which lie below the stack pointer.
; ---------------------------------------------------------------------------
proc main uses esi
        local   sign_scratch:DWORD

; ---- benchmark 1: SSE approximation ---------------------------------------
        mov     esi, 1 shl pow                  ; esi = passes remaining
.loop:
        tsc_begin
        repeat 50
        ; Instruction order below is kept exactly as in the original routine
        ; so the benchmark still measures the same instruction schedule.
        movss   xmm0, [src]                     ; xmm0 = x
        movss   [sign_scratch], xmm0            ; spill x to read its bit pattern
        movss   xmm1, [_ps_am_inv_sign_mask]    ; xmm1 = 0x7FFFFFFF (abs mask)
        mov     eax, [sign_scratch]             ; eax = raw bits of x
        mulss   xmm0, [_ps_am_2_o_pi]           ; xmm0 = x * (2/pi)
        andps   xmm0, xmm1                      ; xmm0 = |x * 2/pi|
        and     eax, 0x80000000                 ; eax = sign bit of x
        cvttss2si ecx, xmm0                     ; ecx = q = trunc(|x * 2/pi|)
        movss   xmm1, [_ps_am_1]                ; xmm1 = 1.0
        mov     edx, ecx
        shl     edx, (31 - 1)                   ; move bit 1 of q to bit 31
        cvtsi2ss xmm2, ecx                      ; xmm2 = (float)q
        and     ecx, 0x1                        ; ecx = q & 1 (mask selector)
        and     edx, 0x80000000                 ; edx = sign flip from bit 1 of q
        subss   xmm0, xmm2                      ; xmm0 = f = fractional part
        movss   xmm6, [_sincos_masks + ecx * 4] ; xmm6 = 0 (q even) / all ones (q odd)
        minss   xmm0, xmm1                      ; clamp f to at most 1.0
        movss   xmm5, [_ps_sincos_p3]
        subss   xmm1, xmm0                      ; xmm1 = 1 - f
        andps   xmm1, xmm6                      ; keep (1 - f) when q is odd
        andnps  xmm6, xmm0                      ; keep f when q is even
        orps    xmm1, xmm6                      ; xmm1 = y = odd ? 1 - f : f
        movss   xmm4, [_ps_sincos_p2]
        movss   xmm0, xmm1                      ; xmm0 = y
        mulss   xmm1, xmm1                      ; xmm1 = y^2
        movss   xmm7, [_ps_sincos_p1]
        xor     eax, edx                        ; eax = final sign bit
        movss   xmm2, xmm1                      ; xmm2 = y^2
        mulss   xmm1, xmm5                      ; Horner: p3 * y^2
        movss   xmm5, [_ps_sincos_p0]
        mov     [sign_scratch], eax             ; pass the sign bit to the SSE side
        addss   xmm1, xmm4                      ; + p2
        mulss   xmm1, xmm2                      ; * y^2
        movss   xmm3, [sign_scratch]            ; xmm3 = sign bit pattern
        addss   xmm1, xmm7                      ; + p1
        mulss   xmm1, xmm2                      ; * y^2
        orps    xmm0, xmm3                      ; xmm0 = y with the final sign
        addss   xmm1, xmm5                      ; + p0
        mulss   xmm0, xmm1                      ; xmm0 = signed y * polynomial
        cvtss2sd xmm0, xmm0                     ; widen to double for printf
        movlpd  [dest], xmm0
        end repeat
        tsc_end
        dec     esi
        jnz     .loop

        ; total_time = (double)total_time / divv; the integer qword is
        ; overwritten in place with the double that printf receives.
        fild    [total_time]
        fidiv   [divv]
        fstp    [total_time]
        cinvoke printf, fmt1, double[total_time], double[dest]

; ---- benchmark 2: x87 fsin ------------------------------------------------
        xorps   xmm0, xmm0
        movlpd  [total_time], xmm0              ; reset the 64-bit accumulator
        finit                                   ; reinitialize the FPU first
        mov     esi, 1 shl pow                  ; esi = passes remaining
.loop2:
        tsc_begin
        repeat 50
        fld     [src]                           ; st0 = x
        fsin                                    ; st0 = sin(x)
        fstp    [dest]                          ; store as double and pop
        end repeat
        tsc_end
        dec     esi
        jnz     .loop2

        fild    [total_time]
        fidiv   [divv]
        fstp    [total_time]
        cinvoke printf, fmt2, double[total_time], double[dest]

        cinvoke getch                           ; wait for a key press
        xor     eax, eax                        ; exit code 0, not the key code
        ret
endp
section '.data' data readable writeable

; ---- benchmark state ------------------------------------------------------
align 8
time dq 0                       ; time-stamp counter value at the start of a pass
total_time dq 0                 ; summed cycle deltas; later reused to hold the
                                ; average as a double for printf
divv dd 1 shl pow               ; number of timed passes, used as the divisor

; printf format strings (LF-terminated, NUL-terminated).
fmt1 db 'SSE2 %lf sin(45 degree): %lf', 10, 0
fmt2 db 'FPU %lf sin(45 degree): %lf', 10, 0

; The strings above have odd lengths, so realign the operands explicitly.
align 4
src dd 0.78539816339744830961566084581988       ; input angle (single precision)
align 8
dest dq ?                       ; last computed sine, stored as a double

; ---- constants for the SSE sine approximation ------------------------------
; Each table holds four identical lanes; the scalar code in main reads only
; the first dword. They are aligned to 16 bytes so they are also usable as
; aligned 128-bit operands.
align 16
_ps_am_inv_sign_mask dd 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF   ; clears the float sign bit
_ps_am_pi_o_2 dd 1.57079632679, 1.57079632679, 1.57079632679, 1.57079632679   ; pi/2 (not referenced by the code visible here)
_ps_am_2_o_pi dd 0.63661977236, 0.63661977236, 0.63661977236, 0.63661977236   ; 2/pi
_epi32_1 dd 1, 1, 1, 1          ; not referenced by the code visible here
_ps_am_1 dd 1.0, 1.0, 1.0, 1.0
_epi32_2 dd 2, 2, 2, 2          ; not referenced by the code visible here
; Polynomial coefficients; main evaluates p0 + y^2*(p1 + y^2*(p2 + y^2*p3)).
_ps_sincos_p3 dd -0.00468175413, -0.00468175413, -0.00468175413, -0.00468175413
_ps_sincos_p2 dd 0.0796926262, 0.0796926262, 0.0796926262, 0.0796926262
_ps_sincos_p1 dd -0.64596409750621,-0.64596409750621,-0.64596409750621,-0.64596409750621
_ps_sincos_p0 dd 1.570796326794896, 1.570796326794896, 1.570796326794896, 1.570796326794896
; Select masks indexed by (quadrant & 1): all zeros, then all ones.
_sincos_masks dd 0x0, not 0x0
; Import table: the two C runtime functions called from main.
section '.idata' import data readable writeable
; Bind the name msvcrt to msvcrt.dll.
library msvcrt, 'msvcrt.dll'
; Local names printf and getch map to the exports 'printf' and '_getch'.
import msvcrt,\
printf, 'printf',\
getch, '_getch'
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment