Skip to content

Instantly share code, notes, and snippets.

@jin-x
Created July 24, 2021 17:52
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jin-x/88358e9bb1d58d01b7a318d0873208e3 to your computer and use it in GitHub Desktop.
Save jin-x/88358e9bb1d58d01b7a318d0873208e3 to your computer and use it in GitHub Desktop.
Code Speed Measurement Tool
; Speed Test for Windows x64, v1.01 / fasm 1
; (c) 2020 Jin X (jin_x@list.ru)
format PE64 Console 5.0
include 'win64axp.inc'
define REQUIRE_ADMIN_RIGHTS 1 ; 1 - run with the highest (realtime) priority, 0 - run with just high priority
;-- CODE SECTION -------------------------------------------------------------------------------------------------------
.code
; Program entry point.
; Pins the process to one CPU, raises scheduling priority, calibrates the
; measurement overhead, times the four sample procedures, then restores normal
; priority and waits for a key press before exiting.
entry:
        ; Pseudo-handles: the values GetCurrentProcess / GetCurrentThread return.
        HPROC_SELF   = -1
        HTHREAD_SELF = -2

        frame
        ; --- reduce scheduling noise before measuring ---
        invoke  SetProcessAffinityMask, HPROC_SELF, 1           ; CPU 0 only, to avoid CPU migration
        invoke  SetPriorityClass, HPROC_SELF, REALTIME_PRIORITY_CLASS ; realtime needs administrator rights
        invoke  SetThreadPriority, HTHREAD_SELF, THREAD_PRIORITY_TIME_CRITICAL

        ; --- calibrate, printing every diagnostic message (-1 = all flags) ---
        stdcall SpeedTestInit, -1

        ; --- timed runs: procedure, start message, result format, migration note ---
        stdcall SpeedTestMsg, test1, 'Testing 1...', <' %llu ticks%s',10>, ' (CPU migration is detected)'
        stdcall SpeedTestMsg, test2, 'Testing 2...', <' %llu ticks%s',10>, ' (CPU migration is detected)'
        stdcall SpeedTestMsg, test3, 'Testing 3...', <' %llu ticks%s',10>, ' (CPU migration is detected)'
        stdcall SpeedTestMsg, test4, 'Testing 4...', <' %llu ticks%s',10>, ' (CPU migration is detected)'

        ; --- drop back to normal priority before blocking on the keyboard ---
        invoke  SetThreadPriority, HTHREAD_SELF, THREAD_PRIORITY_NORMAL
        invoke  SetPriorityClass, HPROC_SELF, NORMAL_PRIORITY_CLASS

        cinvoke printf, 'Press a key to exit...'
        cinvoke getch
        invoke  ExitProcess, 0
        endf
align 16
; test1 - empty payload: returns immediately, so the measured time covers only
; the call into the procedure and its return.
; Takes no parameters and returns nothing; changes no registers.
test1: ; Procedure under test 1
ret
align 16
; test2 - payload that executes a single CPUID with eax = 0.
; Takes no parameters and returns nothing meaningful.
; NOTE: cpuid overwrites eax, ebx, ecx and edx, and rbx is not saved here.
; SpeedTest lists rbx in its own `uses` clause, so the clobber is harmless
; when this procedure is invoked through SpeedTest.
test2: ; Procedure under test 2
xor eax,eax
cpuid
ret
align 16
; test3 - payload that spins a dec/jnz countdown loop for 65536 iterations.
; Takes no parameters and returns nothing; changes ecx (0 on exit) and flags.
test3: ; Procedure under test 3
mov ecx,65536
@@: dec ecx
jnz @B
ret
align 16
; test4 - same 65536-iteration countdown as test3, but using the LOOP
; instruction so the two loop forms can be compared.
; `loop $` branches to itself, decrementing the counter until it reaches zero.
; Takes no parameters and returns nothing; changes the count register (0 on exit).
test4: ; Procedure under test 4
mov ecx,65536
loop $
ret
;-- SPEED TEST PROCEDURES ----------------------------------------------------------------------------------------------
; Assembly-time tunables for SpeedTest.
; SPEEDTEST_REPEATS sizes the SpeedTestResults array and the measurement loop.
; It must be a power of two because SpeedTest averages with an arithmetic
; shift by `bsr` of the sample count instead of dividing.
SPEEDTEST_REPEATS = 4096 ; number of code execution repeats (must be power of two!!!)
; Warm-up count is derived from the repeat count: 2 to the power of
; (log2(SPEEDTEST_REPEATS) / 2), which is 64 for 4096 repeats.
; NOTE(review): this relies on fasm giving `bsr` higher priority than `/`.
SPEEDTEST_WARMUPS = 1 shl (bsr SPEEDTEST_REPEATS / 2) ; number of warming-up executions
; Build-time sanity check: warm-ups not negative, repeats positive, and
; repeats has exactly one bit set (lowest set bit index = highest set bit index).
assert SPEEDTEST_WARMUPS >= 0 & SPEEDTEST_REPEATS > 0 & bsf SPEEDTEST_REPEATS = bsr SPEEDTEST_REPEATS
; Initialize the speed test: detect TSC features, select the TSC reader,
; optionally report the findings via printf, and measure the call overhead.
; ABI: Microsoft x64 (fasm `proc` macros).
; Parameters:
;   ecx = message flags: bit 0 - report when RDTSCP is NOT supported,
;                        bit 1 - report when invariant TSC is NOT supported,
;                        bit 2 - report when both are supported
;         (ecx = -1 enables all messages)
; Returns:
;   rax = unsupported feature flags: bit 0 - RDTSCP is NOT supported,
;         bit 1 - invariant TSC is NOT supported (0 = both are supported)
; Side effects:
;   [SpeedTestGetTSC] is switched to SpeedTestRDTSCP when RDTSCP is available;
;   [SpeedTestOverhead] receives the measured cost of timing an empty procedure.
; Register roles: r8b = feature mask being built, r9d = message flags,
;   r10d = highest extended CPUID leaf (cpuid only overwrites eax/ebx/ecx/edx),
;   bl = messages still to print (live across the printf calls, hence `uses rbx`).
proc SpeedTestInit uses rbx, MsgFlags
        frame
        mov     r8b,3                   ; assume both features are missing
        mov     r9d,ecx                 ; show message flags
        mov     eax,0x80000000
        cpuid
        mov     r10d,eax                ; max extended cpuid leaf level

        ; RDTSCP instruction support check
        mov     eax,0x80000001
        cmp     r10d,eax
        jb      .no_inv                 ; leaf missing: both features are NOT supported
        cpuid
        bt      edx,27                  ; rdtscp support bit
        jnc     .no_rdtscp
        and     r8b,not 1               ; mark as supported
        mov     [SpeedTestGetTSC],SpeedTestRDTSCP ; use the RDTSCP-based reader
.no_rdtscp:

        ; Invariant TSC support check
        mov     eax,0x80000007
        cmp     r10d,eax
        jb      .no_inv
        cpuid
        bt      edx,8                   ; invariant TSC support bit
        jnc     .no_inv
        and     r8b,not 2               ; mark as supported
.no_inv:

        mov     bl,r8b
        ; Park the result mask in this procedure's own parameter home slot, so
        ; the return value does not depend on any register surviving the calls
        ; made below.
        mov     byte [MsgFlags],bl
        test    bl,bl
        setz    cl
        shl     cl,2
        or      bl,cl                   ; set bit 2 in bl if both features are supported
        and     bl,r9b                  ; keep only the messages the caller asked for

        ; Messages
        test    bl,1
        jz      @F
        cinvoke printf, <"Warning: RDTSCP instruction is not supported, RDTSC will be used instead (CPU migration can't be detected)!", 10>
@@:     test    bl,2
        jz      @F
        cinvoke printf, <"Warning: invariant TSC is not supported (results may be inaccurate)!", 10>
@@:     test    bl,4
        jz      @F
        cinvoke printf, <"Success: both RDTSCP instruction and invariant TSC are supported.", 10>
@@:

        ; Measure overhead: time an empty procedure with the overhead set to 0,
        ; then store the result so that SpeedTest subtracts it from later runs.
        xor     eax,eax
        mov     [SpeedTestOverhead],rax
        stdcall SpeedTest, SpeedTestEmptyFunc ; overhead test
        mov     [SpeedTestOverhead],rax

        movzx   eax,byte [MsgFlags]     ; results
        ; Release the frame's call space BEFORE returning: the epilogue pops
        ; rbx from the current rsp, so rsp must be back where the prologue
        ; left it or rbx would be reloaded from the shadow area.
        endf
        ret
endp ; SpeedTestInit
; Time a procedure with SpeedTest and report progress/result through printf.
; ABI: Microsoft x64 (fasm `proc` macros).
; Parameters:
;   rcx = address of the procedure to measure
;   rdx = starting message (0 - print nothing, -1 - print 'Testing...')
;   r8  = result format (0 - print nothing, -1 - just the tick count and a
;         new line); it must contain '%llu' for the tick count, optionally
;         followed by '%s' for the CPU migration note
;   r9  = CPU migration note; only meaningful if the r8 format contains '%s'
; Returns:
;   rax = TSC count as returned by SpeedTest
;   zf  = 1 if no CPU migration occurred
; The parameter home slots double as scratch storage: [ProcAddr] later holds
; the tick count and the low byte of [PreMsg] holds the saved zero flag.
proc SpeedTestMsg ProcAddr, PreMsg, ResultMsg, MigMsg
SpeedTestMsg% = 0 ; turn off parameter count check
        frame
        ; Home the register arguments that must survive the printf calls.
        mov     [MigMsg],r9
        mov     [ResultMsg],r8
        mov     [ProcAddr],rcx

        ; --- starting message ---
        test    rdx,rdx
        jz      .measure                ; 0: no starting message
        mov     rax,.testing_msg
        cmp     rdx,-1
        cmove   rdx,rax                 ; -1: use the default text
        cinvoke printf, '%s', rdx
.measure:

        ; --- run the measurement ---
        stdcall SpeedTest, [ProcAddr]
        mov     [ProcAddr],rax          ; keep the tick count (mov leaves flags intact)
        setz    byte [PreMsg]           ; 1 = no migration, 0 = migration detected

        ; --- result message ---
        mov     r8,.no_message          ; empty note unless a migration happened
        cmovnz  r8,[MigMsg]
        mov     rcx,[ResultMsg]
        test    rcx,rcx
        jz      .finish                 ; 0: no result message
        mov     rdx,.just_ticks
        cmp     rcx,-1
        cmove   rcx,rdx                 ; -1: use the default format
        cinvoke printf, rcx, rax, r8
.finish:

        ; --- return values ---
        mov     rax,[ProcAddr]
        dec     byte [PreMsg]           ; 1 -> 0 gives zf = 1, 0 -> 0FFh gives zf = 0
        ret
        endf
; String data. .just_ticks has no terminator of its own: it runs into the
; zero byte of .no_message, so these two lines must stay adjacent.
.testing_msg db 'Testing...',0
.just_ticks db ' %llu',10
.no_message db 0
endp ; SpeedTestMsg
; Measure how many TSC ticks one call of a procedure takes.
; ABI: Microsoft x64 (fasm `proc` macros).
; Parameters:
;   rcx = address of the procedure to measure; it is called with no arguments
;         and must preserve rsi, rdi and r12-r15
; Returns:
;   rax = average TSC count per call, clamped to 0 if the average is negative
;         (full 64-bit value)
;   zf  = 1 if no CPU migration occurred during the measured calls
; Method: SPEEDTEST_WARMUPS unmeasured calls, then SPEEDTEST_REPEATS measured
; calls. Each sample is (TSC after - TSC before - [SpeedTestOverhead]). With
; more than two samples they are sorted and only the middle half is averaged.
; Register roles: r12 = procedure address, r13 = start TSC, r14d = start CPU id,
;   r15d = accumulated CPU id differences, esi = loop counter, rdi = store
;   pointer. rbx is saved because the RDTSC-based reader executes cpuid.
proc SpeedTest uses rbx rsi rdi r12 r13 r14 r15, ProcAddr
        frame
        mov     r12,rcx

        ; Warming-up calls
if SPEEDTEST_WARMUPS > 0
        mov     esi,SPEEDTEST_WARMUPS
.warmup:
        stdcall r12
        dec     esi
        jnz     .warmup
end if

        ; Main tests
        cld                             ; stosq must advance upwards
        mov     rdi,SpeedTestResults
        xor     r15d,r15d
        mov     esi,SPEEDTEST_REPEATS
        align 16
.measure:
;       invoke  SwitchToThread          ; try to update thread time slice
        invoke  SpeedTestGetTSC         ; get ticks in rax, CPU id in ecx
        mov     r13,rax
        mov     r14d,ecx
        stdcall r12                     ; main call
        invoke  SpeedTestGetTSC         ; get ticks in rax, CPU id in ecx
        sub     rax,r13
        sub     rax,[SpeedTestOverhead] ; result TSC count
        stosq                           ; store to SpeedTestResults
        sub     ecx,r14d                ; non-zero if the CPU id changed
        or      r15d,ecx                ; migration flag for all tests
        dec     esi
        jnz     .measure

if SPEEDTEST_REPEATS > 2
        ; Sort results
        mov     rcx,SpeedTestResults
        mov     rdx,SPEEDTEST_REPEATS
        stdcall InsertionSort64
        ; Use only the 50% of results from the middle of the array (assuming
        ; that the 25% at each end are outliers)
        mov     ecx,SPEEDTEST_REPEATS/2
else
        mov     ecx,SPEEDTEST_REPEATS
end if

        ; Sum of all relevant results
        xor     eax,eax
        xor     edx,edx
.sum:   add     rax,[SpeedTestResults+(SPEEDTEST_REPEATS/4)*8 + rdx*8]
        inc     edx
        dec     ecx
        jnz     .sum                    ; leaves rcx = 0

        ; Average value (sample counts are powers of two, so shift)
if SPEEDTEST_REPEATS > 2
        sar     rax,bsr (SPEEDTEST_REPEATS/2)
else if SPEEDTEST_REPEATS = 2
        sar     rax,bsr SPEEDTEST_REPEATS
end if
        ; Set SF from the final value for every SPEEDTEST_REPEATS setting; with
        ; a single repeat no shift is emitted, so flags would otherwise be stale.
        test    rax,rax
        ; 64-bit cmov on purpose: a 32-bit cmov clears bits 63:32 of rax even
        ; when the condition is false, which would truncate large tick counts.
        cmovs   rax,rcx                 ; zero result if negative (rcx = 0 here)

        ; Release the frame's call space BEFORE the epilogue: `ret` pops the
        ; saved registers from the current rsp. This emits `add rsp`, which
        ; changes flags, so it must precede the final test.
        endf
        test    r15d,r15d               ; zf = 1 if no CPU migration occurred
        ret
endp ; SpeedTest
; Read TSC via RDTSC [for internal use]
; Default target of the SpeedTestGetTSC pointer; SpeedTestInit replaces it with
; SpeedTestRDTSCP when the CPU reports RDTSCP support.
; Parameters: none
; Returns: rax = current TSC counter value, ecx = 0 (processor id detection is not supported)
; Changes ebx !!!
; (cpuid overwrites eax, ebx, ecx and edx; rbx is not saved here, so the caller
; must save it if it needs it - SpeedTest does so through its `uses` clause.)
if used SpeedTestRDTSC
SpeedTestRDTSC:
xor eax,eax ; cpuid execution time may vary depending on eax value
cpuid ; serialization
xor ecx,ecx ; processor id (not supported)
; rdtsc returns the counter split across edx (high half) and eax (low half)
rdtsc
; merge the two halves into one 64-bit value in rax
shl rdx,32
or rax,rdx
mfence
ret
end if ; used SpeedTestRDTSC
; Read TSC via RDTSCP [for internal use]
; Installed into the SpeedTestGetTSC pointer by SpeedTestInit when the CPU
; reports RDTSCP support.
; Parameters: none
; Returns: rax = current TSC counter value, ecx = processor id
; Changes rdx.
;
; SpeedTestEmptyFunc labels the final `ret` of this routine, giving a procedure
; that does nothing but return; SpeedTestInit times it to measure the overhead.
; Note that SpeedTestEmptyFunc is only assembled when SpeedTestRDTSCP is used.
if used SpeedTestRDTSCP
SpeedTestRDTSCP:
; rdtscp returns the counter in edx:eax and the processor id value in ecx
rdtscp
; merge the two halves into one 64-bit value in rax
shl rdx,32
or rax,rdx
mfence
SpeedTestEmptyFunc:
ret
end if ; used SpeedTestRDTSCP
if used InsertionSort64
; In-place ascending insertion sort of signed 64-bit integers.
; Parameters:
;   rcx = address of the array
;   rdx = number of elements (nothing is done when it is <= 1)
; Returns: nothing
; Changes: rax, r8, r9, r10, flags
; Register roles: r8 = index of the element being inserted, rax = its value,
;   r9 = candidate slot, r10 = value of the element just below that slot.
InsertionSort64:
        mov     r8d,1                   ; first element to insert is index 1
        cmp     rdx,r8
        jle     .done                   ; zero or one element: already sorted

.next_key:
        mov     rax,[rcx+r8*8]          ; value to insert
        mov     r9,r8                   ; start looking at its own slot

.shift:
        mov     r10,[rcx+r9*8-8]        ; element below the candidate slot
        cmp     r10,rax
        jle     .place                  ; below <= value: slot found (signed)
        mov     [rcx+r9*8],r10          ; otherwise move that element up
        dec     r9
        jnz     .shift                  ; stop when slot 0 is reached

.place:
        mov     [rcx+r9*8],rax          ; drop the value into its slot
        inc     r8
        cmp     r8,rdx
        jb      .next_key               ; more elements left to insert

.done:
        ret
end if ; used InsertionSort64
;-- DATA SECTION -------------------------------------------------------------------------------------------------------
.data
; Speed-test state. It is emitted whenever SpeedTest is used, because SpeedTest
; is the procedure that reads all three variables; SpeedTestInit and
; SpeedTestMsg both call SpeedTest, so using either of them pulls this in too.
if used SpeedTest
align 16
; TSC read procedure; defaults to the RDTSC reader until SpeedTestInit selects another
SpeedTestGetTSC dq SpeedTestRDTSC
; TSC read overhead tick count; explicitly 0 so that SpeedTest subtracts
; nothing until SpeedTestInit has measured the real value
SpeedTestOverhead dq 0
; Temporary result array, one 64-bit sample per measured call
SpeedTestResults rq SPEEDTEST_REPEATS
end if ; used SpeedTest
;-- REQUIRE ADMIN RIGHTS -----------------------------------------------------------------------------------------------
; Optional resource section holding an application manifest.
; The block below is assembled only when REQUIRE_ADMIN_RIGHTS is defined as 1.
; The manifest is stored as RT_MANIFEST resource number 1 (language neutral) and
; declares requestedExecutionLevel "requireAdministrator".
; The XML is emitted byte for byte from the db lines, so keep them unchanged.
match =1, REQUIRE_ADMIN_RIGHTS
{
section '.rsrc' data readable resource
directory RT_MANIFEST, manifest
resource manifest, 1, LANG_NEUTRAL, require_admin_rights
resdata require_admin_rights
db '<?xml version="1.0" encoding="UTF-8" standalone="yes"?>'
db '<assembly xmlns="urn:schemas-microsoft-com:asm.v1" manifestVersion="1.0">'
db '<assemblyIdentity version="1.0.0.0" name="." type="win32"/>'
db '<trustInfo xmlns="urn:schemas-microsoft-com:asm.v2"><security><requestedPrivileges>'
db '<requestedExecutionLevel level="requireAdministrator" uiAccess="false"/>'
db '</requestedPrivileges></security></trustInfo></assembly>'
endres
}
;-- IMPORT SECTION -----------------------------------------------------------------------------------------------------
section '.idata' import data readable
library kernel32, 'kernel32.dll',\
msvcrt, 'msvcrt.dll'
import_kernel32
all_api
import msvcrt,\
printf, 'printf',\
getch, '_getch'
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment