Skip to content

Instantly share code, notes, and snippets.

@dfyz
Last active May 6, 2022 06:20
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dfyz/357eafd2c91a1f15b11761656fd2206a to your computer and use it in GitHub Desktop.
Save dfyz/357eafd2c91a1f15b11761656fd2206a to your computer and use it in GitHub Desktop.

Example AMD output:

$ lscpu | grep 'Model name'
Model name:                      AMD Ryzen Threadripper 3970X 32-Core Processor
$ gcc -DFAST puzzle.S -o puzzle && ./puzzle
1000000000 iterations, 2758479060 cycles, 2.76 cycles/iteration
1651816810 106000000000
$ gcc -DSLOW puzzle.S -o puzzle && ./puzzle
1000000000 iterations, 6551777517 cycles, 6.55 cycles/iteration
1651816814 110000000000

Example Intel output:

$ lscpu | grep 'Model name'
Model name:                      Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz
$ gcc -DFAST puzzle.S -o puzzle && ./puzzle
1000000000 iterations, 6354813654 cycles, 6.35 cycles/iteration
1651817850 122000000000
$ gcc -DSLOW puzzle.S -o puzzle && ./puzzle
1000000000 iterations, 6317220244 cycles, 6.32 cycles/iteration
1651817864 136000000000
# To get the "fast" output:
# $ gcc -DFAST puzzle.S -o puzzle && ./puzzle
# To get the "slow" output:
# $ gcc -DSLOW puzzle.S -o puzzle && ./puzzle
# Frame offsets (relative to %rbp) of the two 32-bit locals used by the
# timed loop in main. OFFSET1 is the slot the loop reads as its addend;
# OFFSET2 is the slot the loop increments and compares as its counter.
# The FAST and SLOW builds use the same two slots and only swap the roles.
#ifdef FAST
.equ OFFSET1, -44                       # addend slot in the FAST build
.equ OFFSET2, -48                       # counter slot in the FAST build
#elif defined(SLOW)
.equ OFFSET1, -48                       # addend slot in the SLOW build
.equ OFFSET2, -44                       # counter slot in the SLOW build
#else
#error "Use either -DFAST or -DSLOW"
#endif
.text
# ----------------------------------------------------------------------
# _Z9GetCyclesv -- the symbol is the Itanium C++ mangling of GetCycles().
# Reads the time-stamp counter and returns it as one 64-bit value.
#   In:       nothing
#   Out:      %rax = (high half from %edx << 32) | low half from %eax
#   Clobbers: %rdx, flags
# The frame-pointer push/pop and the nop have no effect on the result.
# ----------------------------------------------------------------------
_Z9GetCyclesv:
        pushq   %rbp
        movq    %rsp, %rbp
        rdtsc                           # counter arrives split in %edx:%eax
        shlq    $32, %rdx               # move the high half into bits 63..32
        orq     %rdx, %rax              # merge the halves into %rax
        nop
        popq    %rbp
        ret
# Format strings for the two fprintf calls in main. No section directive
# precedes them, so they are emitted in the same section as the code,
# between the two functions.
# .LC1 formats the result line: main passes the iteration count, the elapsed
# cycle count and the cycles-per-iteration quotient, in that order.
.LC1:
.string "%d iterations, %lu cycles, %4.2f cycles/iteration\n"
# .LC2 formats the second line: main passes the saved time result and the
# running sum accumulated by the loop, in that order.
.LC2:
.string "%lu %lu\n"
# main -- times one billion iterations of a loop that works purely on stack
# slots, then prints the cycle count and the loop results to stdout.
# Every local lives in the frame at a fixed offset from %rbp:
#   elapsed cycles as a double  : -8(%rbp)
#   elapsed cycles              : -16(%rbp)
#   start cycle count           : -24(%rbp)
#   time result                 : -32(%rbp)
#   running sum (64-bit)        : -40(%rbp)
#   addend (32-bit)             : OFFSET1(%rbp)
#   loop counter (32-bit)       : OFFSET2(%rbp)
#   incoming %edi and %rsi      : -52(%rbp) and -64(%rbp), never read again
# OFFSET1 and OFFSET2 are -44 and -48 in one order or the other, chosen by
# -DFAST or -DSLOW; the instruction stream is otherwise the same.
# Returns 0 in %eax.
.globl main
main:
pushq %rbp
movq %rsp, %rbp
# Reserve 64 bytes of locals; together with the pushed %rbp this keeps
# %rsp 16-byte aligned at every call below.
subq $64, %rsp
movl %edi, -52(%rbp)
movq %rsi, -64(%rbp)
# running sum = 0
movq $0, -40(%rbp)
# First integer argument = 0 for the time call further down; the
# _Z9GetCyclesv call in between does not touch %rdi.
movl $0, %edi
# Everything above is irrelevant to performance
# Save the cycle counter before the loop
call _Z9GetCyclesv
movq %rax, -24(%rbp)
# Save the lowest 8 bits of the time() result to OFFSET1(%rbp)
call time@PLT
movq %rax, -32(%rbp)
movq -32(%rbp), %rax
# Zero-extend the low byte, so the addend is in the range 0..255
movzbl %al, %eax
movl %eax, OFFSET1(%rbp)
# Zero out OFFSET2(%rbp)
movl $0, OFFSET2(%rbp)
# Main loop starts here
# Enter at the condition check, so the body runs only while the counter
# is at most 999999999 (one billion iterations in total).
jmp .L5
.L6:
# Reload the addend from its stack slot on every iteration
movl OFFSET1(%rbp), %eax
# Sign-extend %eax into %rax for the 64-bit add
cltq
# running sum += addend, done directly in memory
addq %rax, -40(%rbp)
# counter += 1, done directly in memory
addl $1, OFFSET2(%rbp)
.L5:
cmpl $999999999, OFFSET2(%rbp)
# Signed compare: keep looping while counter <= 999999999
jle .L6
# Main loop ends here
# Get the number of cycles elapsed
call _Z9GetCyclesv
subq -24(%rbp), %rax
# Everything below is irrelevant to performance
movq %rax, -16(%rbp)
# Clear %xmm0 before the convert, which writes only its low 64 bits
pxor %xmm0, %xmm0
# Convert the elapsed cycle count (as a signed 64-bit integer) to a double
cvtsi2sdq -16(%rbp), %xmm0
movsd %xmm0, -8(%rbp)
movsd -8(%rbp), %xmm0
# Divide by the double stored at .LC0 to get cycles per iteration
movsd .LC0(%rip), %xmm1
divsd %xmm1, %xmm0
# The quotient is parked in %rcx and moved back to %xmm0 before %rcx is
# reused for the cycle count argument.
movq %xmm0, %rcx
# First fprintf: stream = stdout, format = .LC1, then the iteration count
# in %edx, the elapsed cycles in %rcx and the quotient in %xmm0.
movq stdout(%rip), %rax
movq -16(%rbp), %rdx
movq %rcx, %xmm0
movq %rdx, %rcx
movl $1000000000, %edx
leaq .LC1(%rip), %rsi
movq %rax, %rdi
# Variadic call: %al = number of vector registers carrying arguments (one)
movl $1, %eax
call fprintf@PLT
# Second fprintf: stream = stdout, format = .LC2, then the saved time
# result in %rdx and the running sum in %rcx.
movq stdout(%rip), %rax
movq -40(%rbp), %rcx
movq -32(%rbp), %rdx
leaq .LC2(%rip), %rsi
movq %rax, %rdi
# Variadic call with no vector register arguments: %al = 0
movl $0, %eax
call fprintf@PLT
# Return value 0
movl $0, %eax
leave
ret
# Divisor for the cycles-per-iteration figure printed by main: the two
# 32-bit words below form the IEEE-754 double 1e9 (little-endian, low word
# first), which matches the iteration count of the timed loop.
# The constant is read-only data, so it goes in .rodata instead of the
# executable code section, aligned to its 8-byte access size. main reaches
# it RIP-relative, which works across sections.
.section .rodata
.p2align 3
.LC0:
.long 0                                 # low word of the mantissa
.long 1104006501                        # 0x41CDCD65: sign, exponent, high mantissa
# Declare that this object does not need an executable stack. Without this
# note the GNU linker assumes a hand-written assembly object may need one.
.section .note.GNU-stack,"",@progbits
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment