Skip to content

Instantly share code, notes, and snippets.

@eholk
Created December 7, 2012 05:09
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save eholk/4230942 to your computer and use it in GitHub Desktop.
Save eholk/4230942 to your computer and use it in GitHub Desktop.
Disassembled inner loop from ATLAS sdot
=> 0x00007fffefc226b0 <+0>: movslq %edi,%rax
0x00007fffefc226b3 <+3>: vxorps %xmm0,%xmm0,%xmm0
0x00007fffefc226b7 <+7>: and $0xfffffffc,%edi
0x00007fffefc226ba <+10>: lea (%rsi,%rax,4),%r8
0x00007fffefc226be <+14>: movslq %edi,%rdi
0x00007fffefc226c1 <+17>: lea (%rsi,%rdi,4),%rdi
0x00007fffefc226c5 <+21>: cmp %rdi,%rsi
0x00007fffefc226c8 <+24>: je 0x7fffefc22754 <ATL_sdot_xp1yp1aXbX+164>
0x00007fffefc226ce <+30>: vxorps %xmm1,%xmm1,%xmm1
0x00007fffefc226d2 <+34>: mov %rcx,%rdx
0x00007fffefc226d5 <+37>: mov %rsi,%rax
0x00007fffefc226d8 <+40>: vmovaps %xmm1,%xmm2
0x00007fffefc226dc <+44>: vmovaps %xmm1,%xmm3
0x00007fffefc226e0 <+48>: vmovaps %xmm1,%xmm0
0x00007fffefc226e4 <+52>: nopl 0x0(%rax)
0x00007fffefc226e8 <+56>: prefetchnta 0x140(%rax)
0x00007fffefc226ef <+63>: vmovss (%rax),%xmm4
0x00007fffefc226f3 <+67>: vmulss (%rdx),%xmm4,%xmm4
0x00007fffefc226f7 <+71>: vaddss %xmm4,%xmm1,%xmm1
0x00007fffefc226fb <+75>: vmovss 0x4(%rax),%xmm4
0x00007fffefc22700 <+80>: vmulss 0x4(%rdx),%xmm4,%xmm4
0x00007fffefc22705 <+85>: vaddss %xmm4,%xmm0,%xmm0
0x00007fffefc22709 <+89>: vmovss 0x8(%rax),%xmm4
0x00007fffefc2270e <+94>: vmulss 0x8(%rdx),%xmm4,%xmm4
0x00007fffefc22713 <+99>: vaddss %xmm4,%xmm3,%xmm3
0x00007fffefc22717 <+103>: vmovss 0xc(%rax),%xmm4
0x00007fffefc2271c <+108>: add $0x10,%rax
0x00007fffefc22720 <+112>: vmulss 0xc(%rdx),%xmm4,%xmm4
0x00007fffefc22725 <+117>: add $0x10,%rdx
0x00007fffefc22729 <+121>: vaddss %xmm4,%xmm2,%xmm2
0x00007fffefc2272d <+125>: cmp %rdi,%rax
0x00007fffefc22730 <+128>: jne 0x7fffefc226e8 <ATL_sdot_xp1yp1aXbX+56>
0x00007fffefc22732 <+130>: add $0x10,%rsi
0x00007fffefc22736 <+134>: mov %rdi,%rax
0x00007fffefc22739 <+137>: vaddss %xmm0,%xmm1,%xmm0
0x00007fffefc2273d <+141>: sub %rsi,%rax
0x00007fffefc22740 <+144>: vaddss %xmm2,%xmm3,%xmm2
0x00007fffefc22744 <+148>: mov %rax,%rsi
0x00007fffefc22747 <+151>: vaddss %xmm2,%xmm0,%xmm0
0x00007fffefc2274b <+155>: and $0xfffffffffffffff0,%rsi
0x00007fffefc2274f <+159>: lea 0x10(%rcx,%rsi,1),%rcx
0x00007fffefc22754 <+164>: cmp %rdi,%r8
0x00007fffefc22757 <+167>: je 0x7fffefc22779 <ATL_sdot_xp1yp1aXbX+201>
0x00007fffefc22759 <+169>: mov %rdi,%rax
0x00007fffefc2275c <+172>: nopl 0x0(%rax)
0x00007fffefc22760 <+176>: vmovss (%rax),%xmm1
0x00007fffefc22764 <+180>: add $0x4,%rax
0x00007fffefc22768 <+184>: vmulss (%rcx),%xmm1,%xmm1
0x00007fffefc2276c <+188>: add $0x4,%rcx
0x00007fffefc22770 <+192>: vaddss %xmm1,%xmm0,%xmm0
0x00007fffefc22774 <+196>: cmp %rax,%r8
0x00007fffefc22777 <+199>: jne 0x7fffefc22760 <ATL_sdot_xp1yp1aXbX+176>
0x00007fffefc22779 <+201>: repz retq
@eholk
Copy link
Author

eholk commented Dec 7, 2012

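This is basically the scalar loop unrolled four times: each iteration of the main loop performs four scalar vmulss/vaddss pairs into four separate accumulators (%xmm0–%xmm3), which are added together after the loop, and a second one-element-at-a-time loop handles the elements left over once the count is rounded down to a multiple of four. No packed (vector) multiply or add instructions are used.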

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment