Created
December 7, 2012 05:09
-
-
Save eholk/4230942 to your computer and use it in GitHub Desktop.
Disassembled inner loop from ATLAS sdot
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
=> 0x00007fffefc226b0 <+0>: movslq %edi,%rax | |
0x00007fffefc226b3 <+3>: vxorps %xmm0,%xmm0,%xmm0 | |
0x00007fffefc226b7 <+7>: and $0xfffffffc,%edi | |
0x00007fffefc226ba <+10>: lea (%rsi,%rax,4),%r8 | |
0x00007fffefc226be <+14>: movslq %edi,%rdi | |
0x00007fffefc226c1 <+17>: lea (%rsi,%rdi,4),%rdi | |
0x00007fffefc226c5 <+21>: cmp %rdi,%rsi | |
0x00007fffefc226c8 <+24>: je 0x7fffefc22754 <ATL_sdot_xp1yp1aXbX+164> | |
0x00007fffefc226ce <+30>: vxorps %xmm1,%xmm1,%xmm1 | |
0x00007fffefc226d2 <+34>: mov %rcx,%rdx | |
0x00007fffefc226d5 <+37>: mov %rsi,%rax | |
0x00007fffefc226d8 <+40>: vmovaps %xmm1,%xmm2 | |
0x00007fffefc226dc <+44>: vmovaps %xmm1,%xmm3 | |
0x00007fffefc226e0 <+48>: vmovaps %xmm1,%xmm0 | |
0x00007fffefc226e4 <+52>: nopl 0x0(%rax) | |
0x00007fffefc226e8 <+56>: prefetchnta 0x140(%rax) | |
0x00007fffefc226ef <+63>: vmovss (%rax),%xmm4 | |
0x00007fffefc226f3 <+67>: vmulss (%rdx),%xmm4,%xmm4 | |
0x00007fffefc226f7 <+71>: vaddss %xmm4,%xmm1,%xmm1 | |
0x00007fffefc226fb <+75>: vmovss 0x4(%rax),%xmm4 | |
0x00007fffefc22700 <+80>: vmulss 0x4(%rdx),%xmm4,%xmm4 | |
0x00007fffefc22705 <+85>: vaddss %xmm4,%xmm0,%xmm0 | |
0x00007fffefc22709 <+89>: vmovss 0x8(%rax),%xmm4 | |
0x00007fffefc2270e <+94>: vmulss 0x8(%rdx),%xmm4,%xmm4 | |
0x00007fffefc22713 <+99>: vaddss %xmm4,%xmm3,%xmm3 | |
0x00007fffefc22717 <+103>: vmovss 0xc(%rax),%xmm4 | |
0x00007fffefc2271c <+108>: add $0x10,%rax | |
0x00007fffefc22720 <+112>: vmulss 0xc(%rdx),%xmm4,%xmm4 | |
0x00007fffefc22725 <+117>: add $0x10,%rdx | |
0x00007fffefc22729 <+121>: vaddss %xmm4,%xmm2,%xmm2 | |
0x00007fffefc2272d <+125>: cmp %rdi,%rax | |
0x00007fffefc22730 <+128>: jne 0x7fffefc226e8 <ATL_sdot_xp1yp1aXbX+56> | |
0x00007fffefc22732 <+130>: add $0x10,%rsi | |
0x00007fffefc22736 <+134>: mov %rdi,%rax | |
0x00007fffefc22739 <+137>: vaddss %xmm0,%xmm1,%xmm0 | |
0x00007fffefc2273d <+141>: sub %rsi,%rax | |
0x00007fffefc22740 <+144>: vaddss %xmm2,%xmm3,%xmm2 | |
0x00007fffefc22744 <+148>: mov %rax,%rsi | |
0x00007fffefc22747 <+151>: vaddss %xmm2,%xmm0,%xmm0 | |
0x00007fffefc2274b <+155>: and $0xfffffffffffffff0,%rsi | |
0x00007fffefc2274f <+159>: lea 0x10(%rcx,%rsi,1),%rcx | |
0x00007fffefc22754 <+164>: cmp %rdi,%r8 | |
0x00007fffefc22757 <+167>: je 0x7fffefc22779 <ATL_sdot_xp1yp1aXbX+201> | |
0x00007fffefc22759 <+169>: mov %rdi,%rax | |
0x00007fffefc2275c <+172>: nopl 0x0(%rax) | |
0x00007fffefc22760 <+176>: vmovss (%rax),%xmm1 | |
0x00007fffefc22764 <+180>: add $0x4,%rax | |
0x00007fffefc22768 <+184>: vmulss (%rcx),%xmm1,%xmm1 | |
0x00007fffefc2276c <+188>: add $0x4,%rcx | |
0x00007fffefc22770 <+192>: vaddss %xmm1,%xmm0,%xmm0 | |
0x00007fffefc22774 <+196>: cmp %rax,%r8 | |
0x00007fffefc22777 <+199>: jne 0x7fffefc22760 <ATL_sdot_xp1yp1aXbX+176> | |
0x00007fffefc22779 <+201>: repz retq |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
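The main loop (<+56> to <+128>) is basically the scalar loop unrolled four times: it uses only scalar vmulss/vaddss, accumulating into four separate registers (xmm1, xmm0, xmm3, xmm2) that are summed after the loop, and a one-element-at-a-time cleanup loop (<+176> to <+199>) handles the leftover n mod 4 elements.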