Skip to content

@jedbrown /triad.s
Created

Embed URL

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
;; With prefetch and NT stores:
;; Disassembly (Intel syntax) of a loop inside main. The loop entry and the
;; exit target 0x4016a9 lie outside this excerpt.
;; Each pass handles 0x20 bytes (two packed-double vectors) at byte offset rcx:
;;   result = xmm3 * [P748 + rcx] + [P750 + rcx], stored to [P758 + rcx]
;; where P748 / P750 / P758 are the pointers held in the stack slots
;; [rsp+0x748], [rsp+0x750] and [rsp+0x758]. Those slots are re-read from the
;; stack inside the loop instead of being kept in registers.
;; esi advances by 4 per pass and the loop ends when esi == 0x7a1200
;; (8,000,000). Presumably esi is the element index of the doubles being
;; processed (4 doubles = 0x20 bytes per pass) -- initial values are not shown.
;;
;; Back-edge used when no prefetch is issued: advance the counter and the byte
;; offset, then leave the loop once the counter reaches its limit.
0000000000401608 <main+0xac8> add esi,0x4
000000000040160b <main+0xacb> add rcx,0x20
000000000040160f <main+0xacf> cmp esi,0x7a1200
0000000000401615 <main+0xad5> je 00000000004016a9 <main+0xb69>
;; Loop body: xmm0 handles bytes [rcx, rcx+0x10), xmm1 handles
;; [rcx+0x10, rcx+0x20). rax is reused as scratch for each pointer reload.
000000000040161b <main+0xadb> mov rax,QWORD PTR [rsp+0x748]
0000000000401623 <main+0xae3> movapd xmm0,xmm3
0000000000401627 <main+0xae7> mulpd xmm0,XMMWORD PTR [rax+rcx*1]
000000000040162c <main+0xaec> movapd xmm1,XMMWORD PTR [rax+rcx*1+0x10]
0000000000401632 <main+0xaf2> mov rax,QWORD PTR [rsp+0x750]
000000000040163a <main+0xafa> mulpd xmm1,xmm3
000000000040163e <main+0xafe> addpd xmm0,XMMWORD PTR [rax+rcx*1]
0000000000401643 <main+0xb03> mov rax,rcx
0000000000401646 <main+0xb06> add rax,QWORD PTR [rsp+0x758]
;; Non-temporal store of the first result vector.
000000000040164e <main+0xb0e> movntpd XMMWORD PTR [rax],xmm0
0000000000401652 <main+0xb12> mov rax,QWORD PTR [rsp+0x750]
000000000040165a <main+0xb1a> addpd xmm1,XMMWORD PTR [rax+rcx*1+0x10]
0000000000401660 <main+0xb20> mov rax,QWORD PTR [rsp+0x758]
0000000000401668 <main+0xb28> add rax,rcx
;; The test sets ZF from the low 3 bits of esi; the following movntpd does not
;; modify flags, so the jne still sees this result. Non-zero low bits skip the
;; prefetch block and loop via 0x401608. With esi stepping by 4 this would
;; alternate between the two paths if esi starts as a multiple of 4 (not shown).
000000000040166b <main+0xb2b> test sil,0x7
000000000040166f <main+0xb2f> movntpd XMMWORD PTR [rax+0x10],xmm1
0000000000401674 <main+0xb34> jne 0000000000401608 <main+0xac8>
;; Prefetch path: rax = rcx + 0x200 is taken before rcx is advanced, so the
;; prefetch addresses are 0x200 bytes past the offset just processed, in the
;; two input streams (P748 and P750). The counter update and limit check are
;; duplicated here; the jne re-enters the body directly at 0x40161b, and the
;; prefetchnta instructions between cmp and jne leave the flags untouched.
0000000000401676 <main+0xb36> lea rax,[rcx+0x200]
000000000040167d <main+0xb3d> add esi,0x4
0000000000401680 <main+0xb40> add rcx,0x20
0000000000401684 <main+0xb44> mov rdx,rax
0000000000401687 <main+0xb47> add rax,QWORD PTR [rsp+0x748]
000000000040168f <main+0xb4f> add rdx,QWORD PTR [rsp+0x750]
0000000000401697 <main+0xb57> cmp esi,0x7a1200
000000000040169d <main+0xb5d> prefetchnta BYTE PTR [rdx]
00000000004016a0 <main+0xb60> prefetchnta BYTE PTR [rax]
00000000004016a3 <main+0xb63> jne 000000000040161b <main+0xadb>
;; Inner loop for no prefetch and standard stores:
;; rax is a byte offset applied to three absolute base addresses. Each pass
;; computes two packed-double vectors (0x20 bytes):
;;   [rax+0x6022e0 ..] = xmm3 * [rax+0x8732e0 ..] + [rax+0x73aae0 ..]
;; using ordinary aligned movapd stores (no non-temporal hint, no prefetch).
;; The loop exits when rax == 0x138800 (1,280,000 bytes); the three base
;; addresses are also spaced 0x138800 bytes apart. The initial value of rax is
;; set outside this excerpt (presumably 0 -- not shown).
0000000000401260 <main+0x840> movapd xmm0,xmm3
0000000000401264 <main+0x844> movapd xmm1,XMMWORD PTR [rax+0x8732f0]
000000000040126c <main+0x84c> mulpd xmm0,XMMWORD PTR [rax+0x8732e0]
0000000000401274 <main+0x854> mulpd xmm1,xmm3
0000000000401278 <main+0x858> addpd xmm0,XMMWORD PTR [rax+0x73aae0]
0000000000401280 <main+0x860> addpd xmm1,XMMWORD PTR [rax+0x73aaf0]
0000000000401288 <main+0x868> movapd XMMWORD PTR [rax+0x6022e0],xmm0
0000000000401290 <main+0x870> movapd XMMWORD PTR [rax+0x6022f0],xmm1
;; Advance by the 0x20 bytes just written and loop until the end offset.
0000000000401298 <main+0x878> add rax,0x20
000000000040129c <main+0x87c> cmp rax,0x138800
00000000004012a2 <main+0x882> jne 0000000000401260 <main+0x840>
;; Prefetch every loop:
;; NOTE: the scaled-index operands below are written as [reg*8+disp]. An
;; earlier rendering of this listing had lost the '*' (showing "rax8"/"rcx8",
;; which are not registers). The 9-byte instruction lengths match a
;; scaled-index, no-base, disp32 encoding, and rax steps by 4 while the byte
;; offset rdx steps by 0x20, so the scale factor is 8.
;;
;; rax is an index in 8-byte units and rcx = rax - 2, so each pass processes
;; two packed-double vectors: elements rcx..rcx+1 (xmm1) and rax..rax+1 (xmm0):
;;   [i*8+0x6022c0] = xmm2 * [i*8+0x8732c0] + [i*8+0x73aac0]
;; Results are written with non-temporal movntpd stores.
;; rdx is a separate byte offset used only for prefetching. It advances 0x20
;; per pass, and the prefetch bases 0x73acc0 / 0x8734c0 are the two load bases
;; plus 0x200, so both input streams are prefetched 0x200 bytes ahead of rdx
;; on every pass.
;; The loop exits when rax == 0x27102 (160,002). Initial rax and rdx are set
;; outside this excerpt (presumably rax = 2, rdx = 0 -- not shown).
0000000000400ea0 <main+0x790> lea rcx,[rax-0x2]
0000000000400ea4 <main+0x794> movapd xmm0,XMMWORD PTR [rax*8+0x8732c0]
0000000000400ead <main+0x79d> prefetchnta BYTE PTR [rdx+0x73acc0]
0000000000400eb4 <main+0x7a4> prefetchnta BYTE PTR [rdx+0x8734c0]
0000000000400ebb <main+0x7ab> add rdx,0x20
0000000000400ebf <main+0x7af> movapd xmm1,XMMWORD PTR [rcx*8+0x8732c0]
0000000000400ec8 <main+0x7b8> mulpd xmm0,xmm2
0000000000400ecc <main+0x7bc> mulpd xmm1,xmm2
0000000000400ed0 <main+0x7c0> addpd xmm0,XMMWORD PTR [rax*8+0x73aac0]
0000000000400ed9 <main+0x7c9> addpd xmm1,XMMWORD PTR [rcx*8+0x73aac0]
0000000000400ee2 <main+0x7d2> movntpd XMMWORD PTR [rcx*8+0x6022c0],xmm1
0000000000400eeb <main+0x7db> movntpd XMMWORD PTR [rax*8+0x6022c0],xmm0
;; Four elements consumed per pass.
0000000000400ef4 <main+0x7e4> add rax,0x4
0000000000400ef8 <main+0x7e8> cmp rax,0x27102
0000000000400efe <main+0x7ee> jne 0000000000400ea0 <main+0x790>
;; Unrolled 4 times (for Aron):
;; rdx is a byte offset applied to three absolute base addresses. Each pass
;; processes four packed-double vectors (0x40 bytes) in xmm0..xmm3:
;;   [rdx+0x601e60 ..] = xmm4 * [rdx+0x8013e60 ..] + [rdx+0x430ae60 ..]
;; Results are written with non-temporal movntpd stores.
;; One prefetchnta per input stream is issued per pass, at rax = rdx + 0x200,
;; i.e. 0x200 bytes ahead of the current offset, on the same two bases that
;; the loads use.
;; The loop exits when rdx == 0x3d09000 (64,000,000 bytes). The initial value
;; of rdx is set outside this excerpt (presumably 0 -- not shown).
0000000000401270 <main+0x850> movapd xmm0,xmm4
0000000000401274 <main+0x854> lea rax,[rdx+0x200]
000000000040127b <main+0x85b> movapd xmm1,XMMWORD PTR [rdx+0x8013e70]
0000000000401283 <main+0x863> mulpd xmm0,XMMWORD PTR [rdx+0x8013e60]
000000000040128b <main+0x86b> prefetchnta BYTE PTR [rax+0x430ae60]
0000000000401292 <main+0x872> prefetchnta BYTE PTR [rax+0x8013e60]
0000000000401299 <main+0x879> movapd xmm2,XMMWORD PTR [rdx+0x8013e80]
00000000004012a1 <main+0x881> mulpd xmm1,xmm4
00000000004012a5 <main+0x885> movapd xmm3,XMMWORD PTR [rdx+0x8013e90]
00000000004012ad <main+0x88d> mulpd xmm2,xmm4
00000000004012b1 <main+0x891> mulpd xmm3,xmm4
;; Add the second input stream to each scaled vector.
00000000004012b5 <main+0x895> addpd xmm0,XMMWORD PTR [rdx+0x430ae60]
00000000004012bd <main+0x89d> addpd xmm1,XMMWORD PTR [rdx+0x430ae70]
00000000004012c5 <main+0x8a5> addpd xmm2,XMMWORD PTR [rdx+0x430ae80]
00000000004012cd <main+0x8ad> addpd xmm3,XMMWORD PTR [rdx+0x430ae90]
;; Four consecutive 16-byte non-temporal stores cover the 0x40-byte output span.
00000000004012d5 <main+0x8b5> movntpd XMMWORD PTR [rdx+0x601e60],xmm0
00000000004012dd <main+0x8bd> movntpd XMMWORD PTR [rdx+0x601e70],xmm1
00000000004012e5 <main+0x8c5> movntpd XMMWORD PTR [rdx+0x601e80],xmm2
00000000004012ed <main+0x8cd> movntpd XMMWORD PTR [rdx+0x601e90],xmm3
00000000004012f5 <main+0x8d5> add rdx,0x40
00000000004012f9 <main+0x8d9> cmp rdx,0x3d09000
0000000000401300 <main+0x8e0> jne 0000000000401270 <main+0x850>
@jedbrown
Owner

; Prefetch every loop:

0000000000400ea0 lea rcx,[rax-0x2]
0000000000400ea4 movapd xmm0,XMMWORD PTR [rax*8+0x8732c0]
0000000000400ead prefetchnta BYTE PTR [rdx+0x73acc0]
0000000000400eb4 prefetchnta BYTE PTR [rdx+0x8734c0]
0000000000400ebb add rdx,0x20
0000000000400ebf movapd xmm1,XMMWORD PTR [rcx*8+0x8732c0]
0000000000400ec8 mulpd xmm0,xmm2
0000000000400ecc mulpd xmm1,xmm2
0000000000400ed0 addpd xmm0,XMMWORD PTR [rax*8+0x73aac0]
0000000000400ed9 addpd xmm1,XMMWORD PTR [rcx*8+0x73aac0]
0000000000400ee2 movntpd XMMWORD PTR [rcx*8+0x6022c0],xmm1
0000000000400eeb movntpd XMMWORD PTR [rax*8+0x6022c0],xmm0
0000000000400ef4 add rax,0x4
0000000000400ef8 cmp rax,0x27102
0000000000400efe jne 0000000000400ea0

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Something went wrong with that request. Please try again.