Create a gist now

Instantly share code, notes, and snippets.

What would you like to do?
Disassembly of ATLAS sdot (ATL_sdot_xp1yp1aXbX), including its unrolled inner loop
# ATL_sdot_xp1yp1aXbX -- single-precision dot product, shown as a debugger
# disassembly in AT&T syntax (source operand first, destination last).
#
# Register use as seen in this listing:
#   %edi  = element count n (sign-extended to 64 bits; elements are 4 bytes)
#   %rsi  = base of the first float array (called x below)
#   %rcx  = base of the second float array (called y below)
#   %xmm0 = accumulated sum when the routine returns
#           (presumably the float return value -- confirm against the ABI in use)
# The incoming values of %rdx and %r8 are overwritten without being read.
#
# Shape: a 4-way unrolled loop using *scalar* multiply/add on four independent
# accumulators, a pairwise reduction, then a one-element-at-a-time tail loop.
# NOTE(review): the leading "=>" looks like the debugger's current-instruction
# marker, not part of the code.
#
# --- Setup: compute end pointers -------------------------------------------
# rax = (int64)n; xmm0 = 0.0 (so the result is 0 if no loop runs);
# edi = n & ~3 (element count covered by whole groups of four).
=> 0x00007fffefc226b0 <+0>: movslq %edi,%rax
0x00007fffefc226b3 <+3>: vxorps %xmm0,%xmm0,%xmm0
0x00007fffefc226b7 <+7>: and $0xfffffffc,%edi
# r8  = x + 4*n          -> one past the last element of x
# rdi = x + 4*(n & ~3)   -> one past the last element handled by the unrolled loop
0x00007fffefc226ba <+10>: lea (%rsi,%rax,4),%r8
0x00007fffefc226be <+14>: movslq %edi,%rdi
0x00007fffefc226c1 <+17>: lea (%rsi,%rdi,4),%rdi
# No complete group of four: skip straight to the tail-loop check at <+164>.
0x00007fffefc226c5 <+21>: cmp %rdi,%rsi
0x00007fffefc226c8 <+24>: je 0x7fffefc22754 <ATL_sdot_xp1yp1aXbX+164>
# --- Unrolled-loop preamble --------------------------------------------------
# Zero four accumulators (xmm1, xmm0, xmm3, xmm2) and make working cursors:
# rax walks x, rdx walks y. %rsi and %rcx keep the original bases for now.
0x00007fffefc226ce <+30>: vxorps %xmm1,%xmm1,%xmm1
0x00007fffefc226d2 <+34>: mov %rcx,%rdx
0x00007fffefc226d5 <+37>: mov %rsi,%rax
0x00007fffefc226d8 <+40>: vmovaps %xmm1,%xmm2
0x00007fffefc226dc <+44>: vmovaps %xmm1,%xmm3
0x00007fffefc226e0 <+48>: vmovaps %xmm1,%xmm0
# Multi-byte no-op; presumably padding so the loop top below is aligned.
0x00007fffefc226e4 <+52>: nopl 0x0(%rax)
# --- Unrolled loop body (<+56>): four elements per iteration ----------------
# Prefetch 0x140 (320) bytes ahead of the x cursor with the non-temporal hint.
# Only x is prefetched in this listing; there is no prefetch on the y cursor.
0x00007fffefc226e8 <+56>: prefetchnta 0x140(%rax)
# xmm1 += x[0] * y[0]   (xmm4 is the scratch register for every product)
0x00007fffefc226ef <+63>: vmovss (%rax),%xmm4
0x00007fffefc226f3 <+67>: vmulss (%rdx),%xmm4,%xmm4
0x00007fffefc226f7 <+71>: vaddss %xmm4,%xmm1,%xmm1
# xmm0 += x[1] * y[1]
0x00007fffefc226fb <+75>: vmovss 0x4(%rax),%xmm4
0x00007fffefc22700 <+80>: vmulss 0x4(%rdx),%xmm4,%xmm4
0x00007fffefc22705 <+85>: vaddss %xmm4,%xmm0,%xmm0
# xmm3 += x[2] * y[2]
0x00007fffefc22709 <+89>: vmovss 0x8(%rax),%xmm4
0x00007fffefc2270e <+94>: vmulss 0x8(%rdx),%xmm4,%xmm4
0x00007fffefc22713 <+99>: vaddss %xmm4,%xmm3,%xmm3
# xmm2 += x[3] * y[3]; the two cursor increments (+16 bytes each) are
# interleaved here, each placed after the last load that uses the old cursor.
0x00007fffefc22717 <+103>: vmovss 0xc(%rax),%xmm4
0x00007fffefc2271c <+108>: add $0x10,%rax
0x00007fffefc22720 <+112>: vmulss 0xc(%rdx),%xmm4,%xmm4
0x00007fffefc22725 <+117>: add $0x10,%rdx
0x00007fffefc22729 <+121>: vaddss %xmm4,%xmm2,%xmm2
# Repeat until the x cursor reaches the end of the unrolled region (rdi).
0x00007fffefc2272d <+125>: cmp %rdi,%rax
0x00007fffefc22730 <+128>: jne 0x7fffefc226e8 <ATL_sdot_xp1yp1aXbX+56>
# --- Reduction and y-pointer fix-up (interleaved) ----------------------------
# Float side:   xmm0 = (xmm1 + xmm0) + (xmm3 + xmm2)  -> single running sum.
# Integer side: rcx += 16 + ((rdi - (rsi + 16)) & ~15).
#   rdi - rsi is 4*(n & ~3), a multiple of 16 that is at least 16 on this
#   path, so the net effect is rcx += rdi - rsi: y is advanced past everything
#   the unrolled loop consumed (that loop moved the copy in rdx, not rcx).
0x00007fffefc22732 <+130>: add $0x10,%rsi
0x00007fffefc22736 <+134>: mov %rdi,%rax
0x00007fffefc22739 <+137>: vaddss %xmm0,%xmm1,%xmm0
0x00007fffefc2273d <+141>: sub %rsi,%rax
0x00007fffefc22740 <+144>: vaddss %xmm2,%xmm3,%xmm2
0x00007fffefc22744 <+148>: mov %rax,%rsi
0x00007fffefc22747 <+151>: vaddss %xmm2,%xmm0,%xmm0
0x00007fffefc2274b <+155>: and $0xfffffffffffffff0,%rsi
0x00007fffefc2274f <+159>: lea 0x10(%rcx,%rsi,1),%rcx
# --- Tail loop for the remaining n % 4 elements (<+164>) ---------------------
# If the unrolled region already ends at the end of x, nothing is left: return.
0x00007fffefc22754 <+164>: cmp %rdi,%r8
0x00007fffefc22757 <+167>: je 0x7fffefc22779 <ATL_sdot_xp1yp1aXbX+201>
# rax = x cursor for the tail, starting where the unrolled region ended.
0x00007fffefc22759 <+169>: mov %rdi,%rax
# Multi-byte no-op; presumably alignment padding for the loop top at <+176>.
0x00007fffefc2275c <+172>: nopl 0x0(%rax)
# Per element: xmm0 += x[i] * y[i], advancing both cursors by 4 bytes.
0x00007fffefc22760 <+176>: vmovss (%rax),%xmm1
0x00007fffefc22764 <+180>: add $0x4,%rax
# The next line looks like a debugger pager prompt captured with the output;
# it is not an instruction.
---Type <return> to continue, or q <return> to quit---
0x00007fffefc22768 <+184>: vmulss (%rcx),%xmm1,%xmm1
0x00007fffefc2276c <+188>: add $0x4,%rcx
0x00007fffefc22770 <+192>: vaddss %xmm1,%xmm0,%xmm0
# Loop until the x cursor reaches the true end of x (r8).
0x00007fffefc22774 <+196>: cmp %rax,%r8
0x00007fffefc22777 <+199>: jne 0x7fffefc22760 <ATL_sdot_xp1yp1aXbX+176>
# Return; the accumulated sum is in %xmm0.
0x00007fffefc22779 <+201>: repz retq

This comment has been minimized.

Show comment Hide comment

eholk Dec 7, 2012

This is basically the scalar loop unrolled four times.


eholk commented Dec 7, 2012

This is basically the scalar loop unrolled four times.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment