Create a gist now

Instantly share code, notes, and snippets.

Disassembly of ATLAS sdot (`ATL_sdot_xp1yp1aXbX`), including its inner loop
=> 0x00007fffefc226b0 <+0>: movslq %edi,%rax
0x00007fffefc226b3 <+3>: vxorps %xmm0,%xmm0,%xmm0
0x00007fffefc226b7 <+7>: and $0xfffffffc,%edi
0x00007fffefc226ba <+10>: lea (%rsi,%rax,4),%r8
0x00007fffefc226be <+14>: movslq %edi,%rdi
0x00007fffefc226c1 <+17>: lea (%rsi,%rdi,4),%rdi
0x00007fffefc226c5 <+21>: cmp %rdi,%rsi
0x00007fffefc226c8 <+24>: je 0x7fffefc22754 <ATL_sdot_xp1yp1aXbX+164>
0x00007fffefc226ce <+30>: vxorps %xmm1,%xmm1,%xmm1
0x00007fffefc226d2 <+34>: mov %rcx,%rdx
0x00007fffefc226d5 <+37>: mov %rsi,%rax
0x00007fffefc226d8 <+40>: vmovaps %xmm1,%xmm2
0x00007fffefc226dc <+44>: vmovaps %xmm1,%xmm3
0x00007fffefc226e0 <+48>: vmovaps %xmm1,%xmm0
0x00007fffefc226e4 <+52>: nopl 0x0(%rax)
0x00007fffefc226e8 <+56>: prefetchnta 0x140(%rax)
0x00007fffefc226ef <+63>: vmovss (%rax),%xmm4
0x00007fffefc226f3 <+67>: vmulss (%rdx),%xmm4,%xmm4
0x00007fffefc226f7 <+71>: vaddss %xmm4,%xmm1,%xmm1
0x00007fffefc226fb <+75>: vmovss 0x4(%rax),%xmm4
0x00007fffefc22700 <+80>: vmulss 0x4(%rdx),%xmm4,%xmm4
0x00007fffefc22705 <+85>: vaddss %xmm4,%xmm0,%xmm0
0x00007fffefc22709 <+89>: vmovss 0x8(%rax),%xmm4
0x00007fffefc2270e <+94>: vmulss 0x8(%rdx),%xmm4,%xmm4
0x00007fffefc22713 <+99>: vaddss %xmm4,%xmm3,%xmm3
0x00007fffefc22717 <+103>: vmovss 0xc(%rax),%xmm4
0x00007fffefc2271c <+108>: add $0x10,%rax
0x00007fffefc22720 <+112>: vmulss 0xc(%rdx),%xmm4,%xmm4
0x00007fffefc22725 <+117>: add $0x10,%rdx
0x00007fffefc22729 <+121>: vaddss %xmm4,%xmm2,%xmm2
0x00007fffefc2272d <+125>: cmp %rdi,%rax
0x00007fffefc22730 <+128>: jne 0x7fffefc226e8 <ATL_sdot_xp1yp1aXbX+56>
0x00007fffefc22732 <+130>: add $0x10,%rsi
0x00007fffefc22736 <+134>: mov %rdi,%rax
0x00007fffefc22739 <+137>: vaddss %xmm0,%xmm1,%xmm0
0x00007fffefc2273d <+141>: sub %rsi,%rax
0x00007fffefc22740 <+144>: vaddss %xmm2,%xmm3,%xmm2
0x00007fffefc22744 <+148>: mov %rax,%rsi
0x00007fffefc22747 <+151>: vaddss %xmm2,%xmm0,%xmm0
0x00007fffefc2274b <+155>: and $0xfffffffffffffff0,%rsi
0x00007fffefc2274f <+159>: lea 0x10(%rcx,%rsi,1),%rcx
0x00007fffefc22754 <+164>: cmp %rdi,%r8
0x00007fffefc22757 <+167>: je 0x7fffefc22779 <ATL_sdot_xp1yp1aXbX+201>
0x00007fffefc22759 <+169>: mov %rdi,%rax
0x00007fffefc2275c <+172>: nopl 0x0(%rax)
0x00007fffefc22760 <+176>: vmovss (%rax),%xmm1
0x00007fffefc22764 <+180>: add $0x4,%rax
0x00007fffefc22768 <+184>: vmulss (%rcx),%xmm1,%xmm1
0x00007fffefc2276c <+188>: add $0x4,%rcx
0x00007fffefc22770 <+192>: vaddss %xmm1,%xmm0,%xmm0
0x00007fffefc22774 <+196>: cmp %rax,%r8
0x00007fffefc22777 <+199>: jne 0x7fffefc22760 <ATL_sdot_xp1yp1aXbX+176>
0x00007fffefc22779 <+201>: repz retq

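This is basically the scalar loop unrolled four times: the main loop (<+56> to <+128>) advances both pointers by 16 bytes (four floats) per iteration but uses only scalar `vmulss`/`vaddss`, one pair per element, accumulating into four separate registers (%xmm0 to %xmm3) that are summed after the loop. No packed (`vmulps`/`vaddps`) instructions appear anywhere, and the loop at <+176> to <+199> handles the leftover elements one at a time.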

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment