Skip to content

Instantly share code, notes, and snippets.

@jedbrown
Created July 15, 2010 13:36
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jedbrown/476942 to your computer and use it in GitHub Desktop.
Save jedbrown/476942 to your computer and use it in GitHub Desktop.
;; With prefetch and NT stores:
;;
;; Loop body (40161b..401674): for the current byte offset rcx it computes
;;     dst[rcx .. rcx+0x1f] = xmm3 * srcA[rcx ..] + srcB[rcx ..]
;; as two 16-byte packed-double operations, written with non-temporal stores.
;; The base pointers are not kept in registers; they are reloaded from the
;; stack inside the loop: srcA from [rsp+0x748], srcB from [rsp+0x750] (twice)
;; and dst from [rsp+0x758] (twice).
;;
;; There are two back edges. Both advance esi by 4 and rcx by 0x20 and compare
;; esi against 0x7a1200 (8,000,000):
;;   * 401608..401615 is taken when (esi & 7) != 0 and issues no prefetch.
;;   * 401676..4016a3 is taken when (esi & 7) == 0 and prefetches srcA and srcB
;;     at the just-processed offset + 0x200 before looping.
;; NOTE(review): esi presumably counts doubles (4 per 0x20 bytes) and starts at
;; a multiple of 4, which would make the prefetch fire on every second
;; iteration; the loop setup is outside this listing -- confirm.
;;
;; Back edge without prefetch: bump counters, leave the loop when esi reaches the limit.
0000000000401608 <main+0xac8> add esi,0x4
000000000040160b <main+0xacb> add rcx,0x20
000000000040160f <main+0xacf> cmp esi,0x7a1200
0000000000401615 <main+0xad5> je 00000000004016a9 <main+0xb69>
;; Loop body: first 16 bytes go through xmm0, second 16 bytes through xmm1.
000000000040161b <main+0xadb> mov rax,QWORD PTR [rsp+0x748]
0000000000401623 <main+0xae3> movapd xmm0,xmm3
0000000000401627 <main+0xae7> mulpd xmm0,XMMWORD PTR [rax+rcx*1]
000000000040162c <main+0xaec> movapd xmm1,XMMWORD PTR [rax+rcx*1+0x10]
0000000000401632 <main+0xaf2> mov rax,QWORD PTR [rsp+0x750]
000000000040163a <main+0xafa> mulpd xmm1,xmm3
000000000040163e <main+0xafe> addpd xmm0,XMMWORD PTR [rax+rcx*1]
0000000000401643 <main+0xb03> mov rax,rcx
0000000000401646 <main+0xb06> add rax,QWORD PTR [rsp+0x758]
000000000040164e <main+0xb0e> movntpd XMMWORD PTR [rax],xmm0
0000000000401652 <main+0xb12> mov rax,QWORD PTR [rsp+0x750]
000000000040165a <main+0xb1a> addpd xmm1,XMMWORD PTR [rax+rcx*1+0x10]
0000000000401660 <main+0xb20> mov rax,QWORD PTR [rsp+0x758]
0000000000401668 <main+0xb28> add rax,rcx
;; The flags set by this test are consumed by the jne at 401674; the movntpd
;; in between does not modify flags.
000000000040166b <main+0xb2b> test sil,0x7
000000000040166f <main+0xb2f> movntpd XMMWORD PTR [rax+0x10],xmm1
0000000000401674 <main+0xb34> jne 0000000000401608 <main+0xac8>
;; Back edge with prefetch: rax = old rcx + 0x200, then rebased onto srcA (rax)
;; and srcB (rdx). The cmp result survives to the jne because prefetchnta does
;; not modify flags.
0000000000401676 <main+0xb36> lea rax,[rcx+0x200]
000000000040167d <main+0xb3d> add esi,0x4
0000000000401680 <main+0xb40> add rcx,0x20
0000000000401684 <main+0xb44> mov rdx,rax
0000000000401687 <main+0xb47> add rax,QWORD PTR [rsp+0x748]
000000000040168f <main+0xb4f> add rdx,QWORD PTR [rsp+0x750]
0000000000401697 <main+0xb57> cmp esi,0x7a1200
000000000040169d <main+0xb5d> prefetchnta BYTE PTR [rdx]
00000000004016a0 <main+0xb60> prefetchnta BYTE PTR [rax]
00000000004016a3 <main+0xb63> jne 000000000040161b <main+0xadb>
;; Inner loop for no prefetch and standard stores:
;;
;; rax is a byte offset shared by three arrays addressed with absolute
;; displacements; each iteration handles 0x20 bytes (two 16-byte packed-double
;; vectors):
;;     [rax+0x6022e0 ..] = xmm3 * [rax+0x8732e0 ..] + [rax+0x73aae0 ..]
;; The three displacements are 0x138800 bytes apart, the same value as the loop
;; bound, so the loop ends when rax reaches one array-spacing (1,280,000 bytes).
;; Results are written with ordinary aligned stores (movapd).
;; NOTE(review): the initial value of rax is set outside this listing
;; (presumably 0) -- confirm.
0000000000401260 <main+0x840> movapd xmm0,xmm3
0000000000401264 <main+0x844> movapd xmm1,XMMWORD PTR [rax+0x8732f0]
000000000040126c <main+0x84c> mulpd xmm0,XMMWORD PTR [rax+0x8732e0]
0000000000401274 <main+0x854> mulpd xmm1,xmm3
0000000000401278 <main+0x858> addpd xmm0,XMMWORD PTR [rax+0x73aae0]
0000000000401280 <main+0x860> addpd xmm1,XMMWORD PTR [rax+0x73aaf0]
0000000000401288 <main+0x868> movapd XMMWORD PTR [rax+0x6022e0],xmm0
0000000000401290 <main+0x870> movapd XMMWORD PTR [rax+0x6022f0],xmm1
;; Advance by the 0x20 bytes just processed and loop until the bound is hit.
0000000000401298 <main+0x878> add rax,0x20
000000000040129c <main+0x87c> cmp rax,0x138800
00000000004012a2 <main+0x882> jne 0000000000401260 <main+0x840>
;; Prefetch every loop:
;;
;; rax is an element index scaled by 8 in the addressing mode; rcx = rax - 2
;; addresses the 16-byte vector immediately before the one addressed by rax.
;; Each iteration therefore handles two packed-double vectors:
;;     [idx*8+0x6022c0] = xmm2 * [idx*8+0x8732c0] + [idx*8+0x73aac0]
;; for idx = rcx and idx = rax, written with non-temporal stores.
;; rdx is a separate byte offset used only for prefetching. It advances by
;; 0x20 per iteration, matching rax += 4 elements of 8 bytes. The prefetch
;; displacements are the two source displacements + 0x200.
;; NOTE(review): if rdx equals the byte offset of the current iteration the
;; prefetch runs 0x200 bytes ahead; rdx is initialised outside this listing --
;; confirm.
0000000000400ea0 <main+0x790> lea rcx,[rax-0x2]
0000000000400ea4 <main+0x794> movapd xmm0,XMMWORD PTR [rax*8+0x8732c0]
;; Issued on every iteration, in contrast to the conditional prefetch variant.
0000000000400ead <main+0x79d> prefetchnta BYTE PTR [rdx+0x73acc0]
0000000000400eb4 <main+0x7a4> prefetchnta BYTE PTR [rdx+0x8734c0]
0000000000400ebb <main+0x7ab> add rdx,0x20
0000000000400ebf <main+0x7af> movapd xmm1,XMMWORD PTR [rcx*8+0x8732c0]
0000000000400ec8 <main+0x7b8> mulpd xmm0,xmm2
0000000000400ecc <main+0x7bc> mulpd xmm1,xmm2
0000000000400ed0 <main+0x7c0> addpd xmm0,XMMWORD PTR [rax*8+0x73aac0]
0000000000400ed9 <main+0x7c9> addpd xmm1,XMMWORD PTR [rcx*8+0x73aac0]
0000000000400ee2 <main+0x7d2> movntpd XMMWORD PTR [rcx*8+0x6022c0],xmm1
0000000000400eeb <main+0x7db> movntpd XMMWORD PTR [rax*8+0x6022c0],xmm0
;; Advance by 4 elements and loop until rax reaches 0x27102.
0000000000400ef4 <main+0x7e4> add rax,0x4
0000000000400ef8 <main+0x7e8> cmp rax,0x27102
0000000000400efe <main+0x7ee> jne 0000000000400ea0 <main+0x790>
;; Unrolled 4 times (for Aron):
;;
;; rdx is a byte offset shared by three arrays addressed with absolute
;; displacements; each iteration handles 0x40 bytes as four 16-byte
;; packed-double vectors (xmm0..xmm3), all scaled by xmm4:
;;     [rdx+0x601e60 ..] = xmm4 * [rdx+0x8013e60 ..] + [rdx+0x430ae60 ..]
;; Results are written with non-temporal stores. One prefetchnta per source
;; array is issued per iteration at the current offset + 0x200 (rax = rdx+0x200).
;; The three displacements are 0x3d09000 bytes apart, the same value as the loop
;; bound (64,000,000 bytes).
;; NOTE(review): the initial value of rdx is set outside this listing
;; (presumably 0) -- confirm.
0000000000401270 <main+0x850> movapd xmm0,xmm4
;; rax = prefetch offset, 0x200 bytes past the data processed this iteration.
0000000000401274 <main+0x854> lea rax,[rdx+0x200]
000000000040127b <main+0x85b> movapd xmm1,XMMWORD PTR [rdx+0x8013e70]
0000000000401283 <main+0x863> mulpd xmm0,XMMWORD PTR [rdx+0x8013e60]
000000000040128b <main+0x86b> prefetchnta BYTE PTR [rax+0x430ae60]
0000000000401292 <main+0x872> prefetchnta BYTE PTR [rax+0x8013e60]
0000000000401299 <main+0x879> movapd xmm2,XMMWORD PTR [rdx+0x8013e80]
00000000004012a1 <main+0x881> mulpd xmm1,xmm4
00000000004012a5 <main+0x885> movapd xmm3,XMMWORD PTR [rdx+0x8013e90]
00000000004012ad <main+0x88d> mulpd xmm2,xmm4
00000000004012b1 <main+0x891> mulpd xmm3,xmm4
00000000004012b5 <main+0x895> addpd xmm0,XMMWORD PTR [rdx+0x430ae60]
00000000004012bd <main+0x89d> addpd xmm1,XMMWORD PTR [rdx+0x430ae70]
00000000004012c5 <main+0x8a5> addpd xmm2,XMMWORD PTR [rdx+0x430ae80]
00000000004012cd <main+0x8ad> addpd xmm3,XMMWORD PTR [rdx+0x430ae90]
00000000004012d5 <main+0x8b5> movntpd XMMWORD PTR [rdx+0x601e60],xmm0
00000000004012dd <main+0x8bd> movntpd XMMWORD PTR [rdx+0x601e70],xmm1
00000000004012e5 <main+0x8c5> movntpd XMMWORD PTR [rdx+0x601e80],xmm2
00000000004012ed <main+0x8cd> movntpd XMMWORD PTR [rdx+0x601e90],xmm3
;; Advance by the 0x40 bytes just processed and loop until the bound is hit.
00000000004012f5 <main+0x8d5> add rdx,0x40
00000000004012f9 <main+0x8d9> cmp rdx,0x3d09000
0000000000401300 <main+0x8e0> jne 0000000000401270 <main+0x850>
@jedbrown
Copy link
Author

; Prefetch every loop:
;
; rax is an element index scaled by 8 in the addressing mode; rcx = rax - 2
; addresses the 16-byte vector immediately before the one addressed by rax.
; Each iteration handles two packed-double vectors:
;     [idx*8+0x6022c0] = xmm2 * [idx*8+0x8732c0] + [idx*8+0x73aac0]
; for idx = rcx and idx = rax, written with non-temporal stores.
; rdx is a separate byte offset used only for the two prefetches; it advances
; by 0x20 per iteration, matching rax += 4 elements of 8 bytes. The prefetch
; displacements are the two source displacements + 0x200.
; NOTE(review): rdx is initialised outside this listing -- confirm that it
; tracks the current byte offset.

0000000000400ea0 <main+0x790> lea rcx,[rax-0x2]
0000000000400ea4 <main+0x794> movapd xmm0,XMMWORD PTR [rax*8+0x8732c0]
0000000000400ead <main+0x79d> prefetchnta BYTE PTR [rdx+0x73acc0]
0000000000400eb4 <main+0x7a4> prefetchnta BYTE PTR [rdx+0x8734c0]
0000000000400ebb <main+0x7ab> add rdx,0x20
0000000000400ebf <main+0x7af> movapd xmm1,XMMWORD PTR [rcx*8+0x8732c0]
0000000000400ec8 <main+0x7b8> mulpd xmm0,xmm2
0000000000400ecc <main+0x7bc> mulpd xmm1,xmm2
0000000000400ed0 <main+0x7c0> addpd xmm0,XMMWORD PTR [rax*8+0x73aac0]
0000000000400ed9 <main+0x7c9> addpd xmm1,XMMWORD PTR [rcx*8+0x73aac0]
0000000000400ee2 <main+0x7d2> movntpd XMMWORD PTR [rcx*8+0x6022c0],xmm1
0000000000400eeb <main+0x7db> movntpd XMMWORD PTR [rax*8+0x6022c0],xmm0
; Advance by 4 elements and loop until rax reaches 0x27102.
0000000000400ef4 <main+0x7e4> add rax,0x4
0000000000400ef8 <main+0x7e8> cmp rax,0x27102
0000000000400efe <main+0x7ee> jne 0000000000400ea0 <main+0x790>

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment