public
Last active

  • Download Gist
triad.s
GAS
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90
;; With prefetch and NT stores:
 
0000000000401608 <main+0xac8> add esi,0x4
000000000040160b <main+0xacb> add rcx,0x20
000000000040160f <main+0xacf> cmp esi,0x7a1200
0000000000401615 <main+0xad5> je 00000000004016a9 <main+0xb69>
000000000040161b <main+0xadb> mov rax,QWORD PTR [rsp+0x748]
0000000000401623 <main+0xae3> movapd xmm0,xmm3
0000000000401627 <main+0xae7> mulpd xmm0,XMMWORD PTR [rax+rcx*1]
000000000040162c <main+0xaec> movapd xmm1,XMMWORD PTR [rax+rcx*1+0x10]
0000000000401632 <main+0xaf2> mov rax,QWORD PTR [rsp+0x750]
000000000040163a <main+0xafa> mulpd xmm1,xmm3
000000000040163e <main+0xafe> addpd xmm0,XMMWORD PTR [rax+rcx*1]
0000000000401643 <main+0xb03> mov rax,rcx
0000000000401646 <main+0xb06> add rax,QWORD PTR [rsp+0x758]
000000000040164e <main+0xb0e> movntpd XMMWORD PTR [rax],xmm0
0000000000401652 <main+0xb12> mov rax,QWORD PTR [rsp+0x750]
000000000040165a <main+0xb1a> addpd xmm1,XMMWORD PTR [rax+rcx*1+0x10]
0000000000401660 <main+0xb20> mov rax,QWORD PTR [rsp+0x758]
0000000000401668 <main+0xb28> add rax,rcx
000000000040166b <main+0xb2b> test sil,0x7
000000000040166f <main+0xb2f> movntpd XMMWORD PTR [rax+0x10],xmm1
0000000000401674 <main+0xb34> jne 0000000000401608 <main+0xac8>
0000000000401676 <main+0xb36> lea rax,[rcx+0x200]
000000000040167d <main+0xb3d> add esi,0x4
0000000000401680 <main+0xb40> add rcx,0x20
0000000000401684 <main+0xb44> mov rdx,rax
0000000000401687 <main+0xb47> add rax,QWORD PTR [rsp+0x748]
000000000040168f <main+0xb4f> add rdx,QWORD PTR [rsp+0x750]
0000000000401697 <main+0xb57> cmp esi,0x7a1200
000000000040169d <main+0xb5d> prefetchnta BYTE PTR [rdx]
00000000004016a0 <main+0xb60> prefetchnta BYTE PTR [rax]
00000000004016a3 <main+0xb63> jne 000000000040161b <main+0xadb>
 
;; Inner loop for no prefetch and standard stores:
 
0000000000401260 <main+0x840> movapd xmm0,xmm3
0000000000401264 <main+0x844> movapd xmm1,XMMWORD PTR [rax+0x8732f0]
000000000040126c <main+0x84c> mulpd xmm0,XMMWORD PTR [rax+0x8732e0]
0000000000401274 <main+0x854> mulpd xmm1,xmm3
0000000000401278 <main+0x858> addpd xmm0,XMMWORD PTR [rax+0x73aae0]
0000000000401280 <main+0x860> addpd xmm1,XMMWORD PTR [rax+0x73aaf0]
0000000000401288 <main+0x868> movapd XMMWORD PTR [rax+0x6022e0],xmm0
0000000000401290 <main+0x870> movapd XMMWORD PTR [rax+0x6022f0],xmm1
0000000000401298 <main+0x878> add rax,0x20
000000000040129c <main+0x87c> cmp rax,0x138800
00000000004012a2 <main+0x882> jne 0000000000401260 <main+0x840>
 
;; Prefetch on every loop iteration:
 
0000000000400ea0 <main+0x790> lea rcx,[rax-0x2]
0000000000400ea4 <main+0x794> movapd xmm0,XMMWORD PTR [rax*8+0x8732c0]
0000000000400ead <main+0x79d> prefetchnta BYTE PTR [rdx+0x73acc0]
0000000000400eb4 <main+0x7a4> prefetchnta BYTE PTR [rdx+0x8734c0]
0000000000400ebb <main+0x7ab> add rdx,0x20
0000000000400ebf <main+0x7af> movapd xmm1,XMMWORD PTR [rcx*8+0x8732c0]
0000000000400ec8 <main+0x7b8> mulpd xmm0,xmm2
0000000000400ecc <main+0x7bc> mulpd xmm1,xmm2
0000000000400ed0 <main+0x7c0> addpd xmm0,XMMWORD PTR [rax*8+0x73aac0]
0000000000400ed9 <main+0x7c9> addpd xmm1,XMMWORD PTR [rcx*8+0x73aac0]
0000000000400ee2 <main+0x7d2> movntpd XMMWORD PTR [rcx*8+0x6022c0],xmm1
0000000000400eeb <main+0x7db> movntpd XMMWORD PTR [rax*8+0x6022c0],xmm0
0000000000400ef4 <main+0x7e4> add rax,0x4
0000000000400ef8 <main+0x7e8> cmp rax,0x27102
0000000000400efe <main+0x7ee> jne 0000000000400ea0 <main+0x790>
 
;; Unrolled 4 times (for Aron):
 
0000000000401270 <main+0x850> movapd xmm0,xmm4
0000000000401274 <main+0x854> lea rax,[rdx+0x200]
000000000040127b <main+0x85b> movapd xmm1,XMMWORD PTR [rdx+0x8013e70]
0000000000401283 <main+0x863> mulpd xmm0,XMMWORD PTR [rdx+0x8013e60]
000000000040128b <main+0x86b> prefetchnta BYTE PTR [rax+0x430ae60]
0000000000401292 <main+0x872> prefetchnta BYTE PTR [rax+0x8013e60]
0000000000401299 <main+0x879> movapd xmm2,XMMWORD PTR [rdx+0x8013e80]
00000000004012a1 <main+0x881> mulpd xmm1,xmm4
00000000004012a5 <main+0x885> movapd xmm3,XMMWORD PTR [rdx+0x8013e90]
00000000004012ad <main+0x88d> mulpd xmm2,xmm4
00000000004012b1 <main+0x891> mulpd xmm3,xmm4
00000000004012b5 <main+0x895> addpd xmm0,XMMWORD PTR [rdx+0x430ae60]
00000000004012bd <main+0x89d> addpd xmm1,XMMWORD PTR [rdx+0x430ae70]
00000000004012c5 <main+0x8a5> addpd xmm2,XMMWORD PTR [rdx+0x430ae80]
00000000004012cd <main+0x8ad> addpd xmm3,XMMWORD PTR [rdx+0x430ae90]
00000000004012d5 <main+0x8b5> movntpd XMMWORD PTR [rdx+0x601e60],xmm0
00000000004012dd <main+0x8bd> movntpd XMMWORD PTR [rdx+0x601e70],xmm1
00000000004012e5 <main+0x8c5> movntpd XMMWORD PTR [rdx+0x601e80],xmm2
00000000004012ed <main+0x8cd> movntpd XMMWORD PTR [rdx+0x601e90],xmm3
00000000004012f5 <main+0x8d5> add rdx,0x40
00000000004012f9 <main+0x8d9> cmp rdx,0x3d09000
0000000000401300 <main+0x8e0> jne 0000000000401270 <main+0x850>

; Prefetch on every loop iteration:

0000000000400ea0 lea rcx,[rax-0x2]
0000000000400ea4 movapd xmm0,XMMWORD PTR [rax*8+0x8732c0]
0000000000400ead prefetchnta BYTE PTR [rdx+0x73acc0]
0000000000400eb4 prefetchnta BYTE PTR [rdx+0x8734c0]
0000000000400ebb add rdx,0x20
0000000000400ebf movapd xmm1,XMMWORD PTR [rcx*8+0x8732c0]
0000000000400ec8 mulpd xmm0,xmm2
0000000000400ecc mulpd xmm1,xmm2
0000000000400ed0 addpd xmm0,XMMWORD PTR [rax*8+0x73aac0]
0000000000400ed9 addpd xmm1,XMMWORD PTR [rcx*8+0x73aac0]
0000000000400ee2 movntpd XMMWORD PTR [rcx*8+0x6022c0],xmm1
0000000000400eeb movntpd XMMWORD PTR [rax*8+0x6022c0],xmm0
0000000000400ef4 add rax,0x4
0000000000400ef8 cmp rax,0x27102
0000000000400efe jne 0000000000400ea0

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.