Created
June 23, 2014 12:35
-
-
Save RIscRIpt/315d331bd44753e0175c to your computer and use it in GitHub Desktop.
Benchmark of SSE2 sine calculation (based on Intel's Approximate Math Library) vs. the FPU fsin instruction
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
; Target: 32-bit Windows console executable, flat assembler (fasm) syntax.
format PE CONSOLE
include 'win32ax.inc'                   ; proc/endp, cinvoke, library and import macros

entry main

; log2 of the number of timed samples each benchmark pass takes
; (used as `1 shl pow` for both the loop counter and the averaging divisor).
pow = 20

section '.code' code readable executable
; Number of back-to-back sine evaluations enclosed by each rdtsc pair.
calls_per_sample = 50

;-----------------------------------------------------------------------
; main - program entry point.
;
; Runs two benchmark passes over the same input ([src]):
;   1. an SSE2 scalar polynomial sine approximation,
;   2. the x87 `fsin` instruction.
; Each pass takes (1 shl pow) samples; one sample times calls_per_sample
; consecutive evaluations with rdtsc. After each pass the average number
; of TSC ticks per sample and the last result in [dest] are printed.
; Finally waits for a key press.
;
; ABI:    32-bit Windows; esi is callee-saved and is preserved via `uses`.
; Out:    eax = value returned by getch
; Clobb:  eax, ecx, edx, xmm0-xmm7, x87 stack, flags
; Stack:  one dword local (sign_tmp). The scratch slot is a real local
;         rather than [esp - 4] because 32-bit Windows has no red zone:
;         memory below esp is not guaranteed to survive.
; NOTE(review): rdtsc is not a serializing instruction, so the measured
; interval is approximate; no fences are added here to keep the timed
; code identical to the original measurement.
;-----------------------------------------------------------------------
proc main uses esi
        local   sign_tmp:DWORD

        ;--- Pass 1: SSE2 polynomial approximation --------------------
        mov     esi, 1 shl pow                  ; esi = samples remaining
.sse_sample:
        rdtsc
        mov     dword [time], eax               ; time = TSC at sample start
        mov     dword [time + 4], edx

    repeat calls_per_sample
        ; Instruction order follows the original listing; only the
        ; scratch slot ([sign_tmp] instead of [esp - 4]) differs.
        movss   xmm0, [src]                     ; xmm0 = x
        movss   [sign_tmp], xmm0
        movss   xmm1, [_ps_am_inv_sign_mask]
        mov     eax, [sign_tmp]                 ; eax = raw bits of x
        mulss   xmm0, [_ps_am_2_o_pi]           ; xmm0 = x * 2/pi
        andps   xmm0, xmm1                      ; xmm0 = |x * 2/pi|
        and     eax, 0x80000000                 ; eax = sign bit of x
        cvttss2si ecx, xmm0                     ; ecx = quadrant index (truncated)
        movss   xmm1, [_ps_am_1]                ; xmm1 = 1.0
        mov     edx, ecx
        shl     edx, (31 - 1)                   ; move quadrant bit 1 up to bit 31
        cvtsi2ss xmm2, ecx
        and     ecx, 0x1                        ; ecx = quadrant parity (mask index)
        and     edx, 0x80000000                 ; edx = sign flip from quadrant bit 1
        subss   xmm0, xmm2                      ; xmm0 = fractional part f
        movss   xmm6, [_sincos_masks + ecx * 4] ; 0 for even quadrant, all ones for odd
        minss   xmm0, xmm1                      ; clamp f to at most 1.0
        movss   xmm5, [_ps_sincos_p3]
        subss   xmm1, xmm0                      ; xmm1 = 1.0 - f
        andps   xmm1, xmm6                      ; keep (1 - f) when mask is set
        andnps  xmm6, xmm0                      ; keep f when mask is clear
        orps    xmm1, xmm6                      ; xmm1 = y = mask ? 1 - f : f
        movss   xmm4, [_ps_sincos_p2]
        movss   xmm0, xmm1                      ; xmm0 = y
        mulss   xmm1, xmm1                      ; xmm1 = y^2
        movss   xmm7, [_ps_sincos_p1]
        xor     eax, edx                        ; eax = final sign bit
        movss   xmm2, xmm1                      ; xmm2 = y^2
        mulss   xmm1, xmm5                      ; p3*y^2
        movss   xmm5, [_ps_sincos_p0]
        mov     [sign_tmp], eax
        addss   xmm1, xmm4                      ; p3*y^2 + p2
        mulss   xmm1, xmm2
        movss   xmm3, [sign_tmp]                ; xmm3 = sign bit as a float pattern
        addss   xmm1, xmm7                      ; (...)*y^2 + p1
        mulss   xmm1, xmm2
        orps    xmm0, xmm3                      ; xmm0 = signed y
        addss   xmm1, xmm5                      ; (...)*y^2 + p0
        mulss   xmm0, xmm1                      ; result = signed y * polynomial(y^2)
        cvtss2sd xmm0, xmm0
        movlpd  [dest], xmm0                    ; dest = result widened to double
    end repeat

        rdtsc
        sub     eax, dword [time]               ; edx:eax = ticks spent in this sample
        sbb     edx, dword [time + 4]
        add     dword [total_time], eax         ; 64-bit accumulate
        adc     dword [total_time + 4], edx
        dec     esi
        jnz     .sse_sample

        fild    [total_time]                    ; average = total ticks / sample count,
        fidiv   [divv]                          ; stored back in place as a double
        fstp    [total_time]
        cinvoke printf, fmt1, double[total_time], double[dest]

        ;--- Pass 2: x87 fsin ------------------------------------------
        xorps   xmm0, xmm0
        movlpd  [total_time], xmm0              ; reset the accumulator to 0
        finit

        mov     esi, 1 shl pow                  ; esi = samples remaining
.fpu_sample:
        rdtsc
        mov     dword [time], eax
        mov     dword [time + 4], edx

    repeat calls_per_sample
        fld     [src]
        fsin
        fstp    [dest]
    end repeat

        rdtsc
        sub     eax, dword [time]
        sbb     edx, dword [time + 4]
        add     dword [total_time], eax
        adc     dword [total_time + 4], edx
        dec     esi
        jnz     .fpu_sample

        fild    [total_time]
        fidiv   [divv]
        fstp    [total_time]
        cinvoke printf, fmt2, double[total_time], double[dest]

        cinvoke getch                           ; hold the console open until a key is pressed
        ret
endp
section '.data' data readable writeable

; --- Benchmark bookkeeping (accessed by main) ---
time            dq 0                    ; TSC reading taken at the start of the current sample
total_time      dq 0                    ; sum of per-sample TSC deltas; main overwrites it with
                                        ; the average (as a double) just before printing
divv            dd 1 shl pow            ; sample count, used as the divisor for that average

; printf formats: first %lf is the averaged tick count, second is the result in dest.
fmt1            db 'SSE2 %lf sin(45 degree): %lf', 10, 0
fmt2            db 'FPU %lf sin(45 degree): %lf', 10, 0

src             dd 0.78539816339744830961566084581988   ; input angle: pi/4 rad, single precision
dest            dq ?                                    ; result slot, written as a double by both passes

; --- Constants for the SSE2 sine approximation ---
; Every table is four lanes wide. main only reads the first lane with
; scalar loads, which have no alignment requirement; the 16-byte
; alignment keeps each table usable as an aligned packed operand too.
align 16
_ps_am_inv_sign_mask dd 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF  ; clears the float sign bit
_ps_am_pi_o_2   dd 1.57079632679, 1.57079632679, 1.57079632679, 1.57079632679
_ps_am_2_o_pi   dd 0.63661977236, 0.63661977236, 0.63661977236, 0.63661977236
_epi32_1        dd 1, 1, 1, 1
_ps_am_1        dd 1.0, 1.0, 1.0, 1.0
_epi32_2        dd 2, 2, 2, 2

; Polynomial coefficients, applied by main as
; ((p3*y^2 + p2)*y^2 + p1)*y^2 + p0, then multiplied by y.
_ps_sincos_p3   dd -0.00468175413, -0.00468175413, -0.00468175413, -0.00468175413
_ps_sincos_p2   dd 0.0796926262, 0.0796926262, 0.0796926262, 0.0796926262
_ps_sincos_p1   dd -0.64596409750621,-0.64596409750621,-0.64596409750621,-0.64596409750621
_ps_sincos_p0   dd 1.570796326794896, 1.570796326794896, 1.570796326794896, 1.570796326794896

; Select masks indexed by quadrant parity: [0] = all zeros, [1] = all ones.
_sincos_masks   dd 0x0, not 0x0
; Import table: the two C runtime functions main calls through cinvoke.
section '.idata' import data readable writeable

  library msvcrt, 'msvcrt.dll'

  import msvcrt,\
         printf, 'printf',\
         getch, '_getch'
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment