-
-
Save travisdowns/b8294098c5082886f4a043ef8b6607bd to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
These are the ~200 instructions clang uses to vectorize the following function: | |
/*
 * Sums data[offsets[k]] over all k in [0, len): each entry of `offsets` is
 * used as an index into `data`.
 *
 * The offsets array is walked from the end towards the start, two entries per
 * iteration, with two independent partial sums (sum1 for i - 1, sum2 for i - 2).
 *
 * len must be at least 2 and even; this is only checked by the assert.
 * Returns sum1 + sum2, computed in uint32_t arithmetic.
 */
uint32_t add_indirect_inner(const uint32_t *data, const uint32_t *offsets, size_t len) { | |
assert(len >= 2 && len % 2 == 0); | |
uint32_t sum1 = 0, sum2 = 0; | |
size_t i = len; | |
/* do/while: the body runs before the first test, hence the len >= 2 precondition. */
do { | |
sum1 += data[offsets[i - 1]]; | |
sum2 += data[offsets[i - 2]]; | |
i -= 2; | |
} while (i); | |
return sum1 + sum2; | |
} | |
It's inlined into another function, which is where the memset and a few other things come from, but 95% of the assembly is | |
from the loop above. It runs more than 2x slower than the non-vectorized version (which runs at 2 cycles/iteration, limited by the 4 loads | |
per loop). | |
# Disassembly (Intel operand order) of add_indirect(unsigned long, void*):
# function entry, one-time constant setup, and the head of the outer loop.
#
# Register roles established here and used by the loops that follow:
#   r14       = first argument (rdi); counted down by `dec r14` at 4e8915
#   rbx       = rsp+0x200,  base address of the first-level gathers
#   rax       = rsp+0x4200, base address of the second-level gathers
#   rcx       = inner-loop counter, reset to 0x800 at 4e85aa
#   ymm13/14  = running qword index vectors, reloaded at 4e85bb / 4e85c3
# NOTE(review): rbx/rax presumably correspond to `offsets` and `data` of the
# inlined add_indirect_inner -- confirm against the caller's source.
00000000004e84d0 <add_indirect(unsigned long, void*)>: | |
# Prologue: save the two callee-saved registers used and reserve 0x8208 bytes of stack.
4e84d0: push r14 | |
4e84d2: push rbx | |
4e84d3: sub rsp,0x8208 | |
4e84da: mov r14,rdi | |
# memset(rsp+0x200, 0, 0x4000): zero the buffer that rbx keeps pointing at.
4e84dd: lea rbx,[rsp+0x200] | |
4e84e5: mov edx,0x4000 | |
4e84ea: mov rdi,rbx | |
4e84ed: xor esi,esi | |
4e84ef: call 406590 <memset@plt> | |
4e84f4: lea rax,[rsp+0x4200] | |
# NOTE(review): ecx is overwritten at 4e85aa before any read visible in this
# listing, so this store looks dead here.
4e84fc: mov ecx,0x7b | |
# ymm3 = all-ones in every lane.
4e8501: vpcmpeqd ymm3,ymm3,ymm3 | |
# Broadcast eight 64-bit constants (66af58..66af90) and park them in stack slots;
# the inner loop adds them to ymm13/ymm14 to form gather indices.
# NOTE(review): the "<vtable for ...+0xNN>" annotations are presumably just the
# nearest preceding symbol picked by the disassembler, not real vtable accesses.
4e8505: vpbroadcastq ymm5,QWORD PTR [rip+0x182a4a] # 66af58 <vtable for std::_Sp_counted_ptr_inplace<BenchmarkGroup, std::allocator<BenchmarkGroup>, (__gnu_cxx::_Lock_policy)2>+0xa8> | |
4e850e: vpbroadcastq ymm11,QWORD PTR [rip+0x182a49] # 66af60 <vtable for std::_Sp_counted_ptr_inplace<BenchmarkGroup, std::allocator<BenchmarkGroup>, (__gnu_cxx::_Lock_policy)2>+0xb0> | |
4e8517: vpbroadcastq ymm15,QWORD PTR [rip+0x182a48] # 66af68 <vtable for std::_Sp_counted_ptr_inplace<BenchmarkGroup, std::allocator<BenchmarkGroup>, (__gnu_cxx::_Lock_policy)2>+0xb8> | |
4e8520: vbroadcastsd ymm0,QWORD PTR [rip+0x182a47] # 66af70 <vtable for std::_Sp_counted_ptr_inplace<BenchmarkGroup, std::allocator<BenchmarkGroup>, (__gnu_cxx::_Lock_policy)2>+0xc0> | |
4e8529: vmovups YMMWORD PTR [rsp+0x140],ymm0 | |
4e8532: vbroadcastsd ymm0,QWORD PTR [rip+0x182a3d] # 66af78 <vtable for std::_Sp_counted_ptr_inplace<BenchmarkGroup, std::allocator<BenchmarkGroup>, (__gnu_cxx::_Lock_policy)2>+0xc8> | |
4e853b: vmovups YMMWORD PTR [rsp+0x120],ymm0 | |
4e8544: vbroadcastsd ymm0,QWORD PTR [rip+0x182a33] # 66af80 <vtable for std::_Sp_counted_ptr_inplace<BenchmarkGroup, std::allocator<BenchmarkGroup>, (__gnu_cxx::_Lock_policy)2>+0xd0> | |
4e854d: vmovups YMMWORD PTR [rsp+0x100],ymm0 | |
4e8556: vpbroadcastq ymm0,QWORD PTR [rip+0x182a29] # 66af88 <vtable for std::_Sp_counted_ptr_inplace<BenchmarkGroup, std::allocator<BenchmarkGroup>, (__gnu_cxx::_Lock_policy)2>+0xd8> | |
4e855f: vmovdqu YMMWORD PTR [rsp+0xe0],ymm0 | |
4e8568: vpbroadcastq ymm2,QWORD PTR [rip+0x182a1f] # 66af90 <vtable for std::_Sp_counted_ptr_inplace<BenchmarkGroup, std::allocator<BenchmarkGroup>, (__gnu_cxx::_Lock_policy)2>+0xe0> | |
4e8571: vmovdqu YMMWORD PTR [rsp+0x160],ymm5 | |
4e857a: vmovdqu YMMWORD PTR [rsp+0xa0],ymm11 | |
4e8583: vmovdqu YMMWORD PTR [rsp+0x80],ymm15 | |
4e858c: vmovdqu YMMWORD PTR [rsp+0xc0],ymm2 | |
# Padding nops: the next real instruction sits at 4e85a0, the outer-loop branch target.
4e8595: nop WORD PTR cs:[rax+rax*1+0x0] | |
4e859f: nop | |
# Outer-loop head (target of the jne at 4e8918). Zero the four 8-dword
# accumulators ([rsp+0x20], ymm4, ymm1, ymm0), reset the inner counter, and
# reload the starting index vectors.
4e85a0: vpxor xmm0,xmm0,xmm0 | |
4e85a4: vmovdqu YMMWORD PTR [rsp+0x20],ymm0 | |
4e85aa: mov ecx,0x800 | |
4e85af: vxorps xmm4,xmm4,xmm4 | |
4e85b3: vxorps xmm1,xmm1,xmm1 | |
4e85b7: vpxor xmm0,xmm0,xmm0 | |
4e85bb: vmovdqa ymm13,YMMWORD PTR [rip+0x18b2dd] # 6738a0 <typeinfo for void register_vector<PerfTimer>(std::vector<std::shared_ptr<BenchmarkGroup>, std::allocator<std::shared_ptr<BenchmarkGroup> > >&)::{lambda()#14}+0xe0> | |
4e85c3: vmovdqa ymm14,YMMWORD PTR [rip+0x18b2b5] # 673880 <typeinfo for void register_vector<PerfTimer>(std::vector<std::shared_ptr<BenchmarkGroup>, std::allocator<std::shared_ptr<BenchmarkGroup> > >&)::{lambda()#14}+0xc0> | |
# Padding nop: the inner loop starts at 4e85d0.
4e85cb: nop DWORD PTR [rax+rax*1+0x0] | |
# Inner vectorized loop (4e85d0 .. 4e88c3, branch target of the jne at 4e88c3).
#
# Each pass issues 16 first-level gathers from [rbx + idx*4] and 16 second-level
# gathers from [rax + idx*4], four dwords per gather, and folds the results
# into four 8-dword accumulators. rcx drops by 0x20 per pass.
#
# Recurring instruction patterns:
#   vpaddq  ymm, ymm13|ymm14, const   build four qword indices for one gather
#   vpcmpeqd x,x,x                    all-ones gather mask; the gather instruction
#                                     clears its mask register, so a fresh mask
#                                     is materialized before every gather
#   vpgatherqd xmm,[rbx+ymm*4],mask   first level: fetch four dword indices
#   vpmovzxdq ymm, xmm                zero-extend those dwords to qword indices
#   vpgatherqd xmm,[rax+ymm*4],mask   second level: fetch the four dword values
#   vinserti128 ymm, ymm, xmm, 1      pair two 4-dword results into 8 dwords
#   vpaddd                            add the 8 dwords into an accumulator
#
# Register pressure forces spills: three accumulators, several gather results
# and the broadcast constants all round-trip through stack slots in the loop.

# Spill three accumulators so ymm0/ymm1/ymm4 can be reused as temporaries
# (the fourth accumulator already lives at [rsp+0x20]).
4e85d0: vmovups YMMWORD PTR [rsp+0x1a0],ymm1 | |
4e85d9: vmovdqu YMMWORD PTR [rsp+0x1c0],ymm0 | |
4e85e2: vmovups YMMWORD PTR [rsp+0x1e0],ymm4 | |
# Build index vectors: running indices ymm13/ymm14 plus per-group constants
# (ymm3 is all-ones here, i.e. -1 per qword lane).
4e85eb: vpaddq ymm2,ymm13,ymm3 | |
4e85ef: vpaddq ymm4,ymm14,ymm3 | |
4e85f3: vpaddq ymm3,ymm13,ymm5 | |
4e85f7: vpaddq ymm6,ymm14,ymm5 | |
4e85fb: vpcmpeqd xmm5,xmm5,xmm5 | |
4e85ff: vpcmpeqd xmm7,xmm7,xmm7 | |
4e8603: vpcmpeqd xmm8,xmm8,xmm8 | |
4e8608: vpcmpeqd xmm9,xmm9,xmm9 | |
4e860d: vpaddq ymm10,ymm13,ymm11 | |
4e8612: vpaddq ymm11,ymm14,YMMWORD PTR [rsp+0xa0] | |
4e861b: vpaddq ymm12,ymm13,ymm15 | |
# First-level gathers from the rbx-based table begin here.
4e8620: vpgatherqd xmm0,DWORD PTR [rbx+ymm4*4],xmm5 | |
4e8626: vmovdqa XMMWORD PTR [rsp+0x10],xmm0 | |
4e862c: vpaddq ymm15,ymm14,YMMWORD PTR [rsp+0x80] | |
4e8635: vpgatherqd xmm5,DWORD PTR [rbx+ymm2*4],xmm7 | |
4e863b: vpcmpeqd xmm7,xmm7,xmm7 | |
4e863f: vpgatherqd xmm4,DWORD PTR [rbx+ymm6*4],xmm8 | |
4e8645: vpcmpeqd xmm0,xmm0,xmm0 | |
4e8649: vpgatherqd xmm8,DWORD PTR [rbx+ymm3*4],xmm9 | |
4e864f: vpcmpeqd xmm2,xmm2,xmm2 | |
4e8653: vpgatherqd xmm9,DWORD PTR [rbx+ymm11*4],xmm7 | |
4e8659: vpcmpeqd xmm7,xmm7,xmm7 | |
4e865d: vpgatherqd xmm11,DWORD PTR [rbx+ymm10*4],xmm0 | |
4e8663: vmovdqu ymm6,YMMWORD PTR [rsp+0x140] | |
4e866c: vpaddq ymm0,ymm13,ymm6 | |
4e8670: vpgatherqd xmm3,DWORD PTR [rbx+ymm15*4],xmm2 | |
4e8676: vpaddq ymm2,ymm14,ymm6 | |
4e867a: vpgatherqd xmm6,DWORD PTR [rbx+ymm12*4],xmm7 | |
4e8680: vpcmpeqd xmm7,xmm7,xmm7 | |
4e8684: vpgatherqd xmm10,DWORD PTR [rbx+ymm2*4],xmm7 | |
4e868a: vpcmpeqd xmm2,xmm2,xmm2 | |
4e868e: vpgatherqd xmm7,DWORD PTR [rbx+ymm0*4],xmm2 | |
# From here first-level results are widened to qword indices and used for
# second-level gathers from the rax-based table, interleaved with the
# remaining first-level gathers.
4e8694: vpmovzxdq ymm0,xmm5 | |
4e8699: vmovdqu ymm1,YMMWORD PTR [rsp+0x120] | |
4e86a2: vpaddq ymm2,ymm14,ymm1 | |
4e86a6: vpcmpeqd xmm5,xmm5,xmm5 | |
4e86aa: vpmovzxdq ymm12,XMMWORD PTR [rsp+0x10] | |
4e86b1: vpgatherqd xmm15,DWORD PTR [rbx+ymm2*4],xmm5 | |
4e86b7: vpmovzxdq ymm2,xmm4 | |
4e86bc: vpcmpeqd xmm4,xmm4,xmm4 | |
4e86c0: vpgatherqd xmm5,DWORD PTR [rax+ymm12*4],xmm4 | |
4e86c6: vmovdqa XMMWORD PTR [rsp+0x10],xmm5 | |
4e86cc: vpcmpeqd xmm5,xmm5,xmm5 | |
4e86d0: vpgatherqd xmm4,DWORD PTR [rax+ymm0*4],xmm5 | |
4e86d6: vmovdqu YMMWORD PTR [rsp+0x180],ymm4 | |
4e86df: vpcmpeqd xmm0,xmm0,xmm0 | |
4e86e3: vpgatherqd xmm4,DWORD PTR [rax+ymm2*4],xmm0 | |
4e86e9: vmovdqa XMMWORD PTR [rsp+0x70],xmm4 | |
4e86ef: vpmovzxdq ymm0,xmm8 | |
4e86f4: vpcmpeqd xmm2,xmm2,xmm2 | |
4e86f8: vpmovzxdq ymm8,xmm11 | |
4e86fd: vpmovzxdq ymm9,xmm9 | |
4e8702: vpmovzxdq ymm11,xmm3 | |
4e8707: vpgatherqd xmm12,DWORD PTR [rax+ymm0*4],xmm2 | |
4e870d: vpcmpeqd xmm0,xmm0,xmm0 | |
4e8711: vpgatherqd xmm2,DWORD PTR [rax+ymm9*4],xmm0 | |
4e8717: vmovdqa XMMWORD PTR [rsp+0x60],xmm2 | |
4e871d: vpcmpeqd xmm0,xmm0,xmm0 | |
4e8721: vpgatherqd xmm9,DWORD PTR [rax+ymm8*4],xmm0 | |
4e8727: vpcmpeqd xmm2,xmm2,xmm2 | |
4e872b: vpgatherqd xmm0,DWORD PTR [rax+ymm11*4],xmm2 | |
4e8731: vmovdqa XMMWORD PTR [rsp+0x50],xmm0 | |
4e8737: vpaddq ymm2,ymm13,ymm1 | |
4e873b: vpcmpeqd xmm4,xmm4,xmm4 | |
4e873f: vpgatherqd xmm5,DWORD PTR [rbx+ymm2*4],xmm4 | |
4e8745: vpmovzxdq ymm2,xmm6 | |
4e874a: vpcmpeqd xmm4,xmm4,xmm4 | |
4e874e: vpgatherqd xmm11,DWORD PTR [rax+ymm2*4],xmm4 | |
4e8754: vmovdqu ymm0,YMMWORD PTR [rsp+0x100] | |
4e875d: vpaddq ymm2,ymm14,ymm0 | |
4e8761: vpcmpeqd xmm4,xmm4,xmm4 | |
4e8765: vpgatherqd xmm3,DWORD PTR [rbx+ymm2*4],xmm4 | |
4e876b: vpmovzxdq ymm2,xmm10 | |
4e8770: vpcmpeqd xmm4,xmm4,xmm4 | |
4e8774: vpgatherqd xmm10,DWORD PTR [rax+ymm2*4],xmm4 | |
4e877a: vpaddq ymm2,ymm13,ymm0 | |
4e877e: vpcmpeqd xmm4,xmm4,xmm4 | |
4e8782: vpgatherqd xmm6,DWORD PTR [rbx+ymm2*4],xmm4 | |
4e8788: vpmovzxdq ymm2,xmm7 | |
4e878d: vpcmpeqd xmm4,xmm4,xmm4 | |
4e8791: vpgatherqd xmm7,DWORD PTR [rax+ymm2*4],xmm4 | |
4e8797: vmovdqu ymm0,YMMWORD PTR [rsp+0xe0] | |
4e87a0: vpaddq ymm2,ymm14,ymm0 | |
4e87a4: vpcmpeqd xmm4,xmm4,xmm4 | |
4e87a8: vpgatherqd xmm8,DWORD PTR [rbx+ymm2*4],xmm4 | |
4e87ae: vpmovzxdq ymm2,xmm15 | |
4e87b3: vpcmpeqd xmm4,xmm4,xmm4 | |
4e87b7: vpgatherqd xmm15,DWORD PTR [rax+ymm2*4],xmm4 | |
4e87bd: vpaddq ymm2,ymm13,ymm0 | |
4e87c1: vpcmpeqd xmm4,xmm4,xmm4 | |
4e87c5: vpgatherqd xmm0,DWORD PTR [rbx+ymm2*4],xmm4 | |
4e87cb: vpmovzxdq ymm2,xmm5 | |
4e87d0: vpcmpeqd xmm4,xmm4,xmm4 | |
4e87d4: vpgatherqd xmm5,DWORD PTR [rax+ymm2*4],xmm4 | |
# Accumulation: pair up 4-dword gather results into 8-dword vectors and add
# them into the four accumulators, interleaved with the last four
# second-level gathers.
4e87da: vmovdqu ymm1,YMMWORD PTR [rsp+0x180] | |
4e87e3: vinserti128 ymm2,ymm1,XMMWORD PTR [rsp+0x10],0x1 | |
4e87eb: vpaddd ymm2,ymm2,YMMWORD PTR [rsp+0x20] | |
4e87f1: vinserti128 ymm4,ymm12,XMMWORD PTR [rsp+0x70],0x1 | |
4e87f9: vpmovzxdq ymm3,xmm3 | |
4e87fe: vpcmpeqd xmm1,xmm1,xmm1 | |
4e8802: vpgatherqd xmm12,DWORD PTR [rax+ymm3*4],xmm1 | |
4e8808: vpaddd ymm1,ymm4,YMMWORD PTR [rsp+0x1e0] | |
4e8811: vinserti128 ymm3,ymm9,XMMWORD PTR [rsp+0x60],0x1 | |
4e8819: vpaddd ymm9,ymm3,YMMWORD PTR [rsp+0x1a0] | |
4e8822: vinserti128 ymm4,ymm11,XMMWORD PTR [rsp+0x50],0x1 | |
# ymm11 was used as a temporary above; reload its constant from the spill slot.
4e882a: vmovdqu ymm11,YMMWORD PTR [rsp+0xa0] | |
4e8833: vinserti128 ymm7,ymm7,xmm10,0x1 | |
4e8839: vpcmpeqd xmm3,xmm3,xmm3 | |
4e883d: vpmovzxdq ymm6,xmm6 | |
4e8842: vpgatherqd xmm10,DWORD PTR [rax+ymm6*4],xmm3 | |
4e8848: vpaddd ymm3,ymm4,YMMWORD PTR [rsp+0x1c0] | |
4e8851: vpaddd ymm2,ymm7,ymm2 | |
# Updated first accumulator goes back to its stack slot.
4e8855: vmovdqu YMMWORD PTR [rsp+0x20],ymm2 | |
4e885b: vinserti128 ymm2,ymm5,xmm15,0x1 | |
# Reload the ymm15 constant from its spill slot.
4e8861: vmovdqu ymm15,YMMWORD PTR [rsp+0x80] | |
4e886a: vpmovzxdq ymm0,xmm0 | |
4e886f: vpmovzxdq ymm4,xmm8 | |
4e8874: vinserti128 ymm5,ymm10,xmm12,0x1 | |
4e887a: vpcmpeqd xmm6,xmm6,xmm6 | |
4e887e: vpgatherqd xmm7,DWORD PTR [rax+ymm4*4],xmm6 | |
4e8884: vpcmpeqd xmm4,xmm4,xmm4 | |
4e8888: vpgatherqd xmm6,DWORD PTR [rax+ymm0*4],xmm4 | |
# Final accumulator values for this pass land in ymm4, ymm1 and ymm0
# (the fourth is already stored at [rsp+0x20]).
4e888e: vpaddd ymm4,ymm2,ymm1 | |
4e8892: vmovdqu ymm2,YMMWORD PTR [rsp+0xc0] | |
4e889b: vpaddd ymm1,ymm5,ymm9 | |
# Reload the ymm5 constant and restore ymm3 to all-ones for the next pass.
4e88a0: vmovdqu ymm5,YMMWORD PTR [rsp+0x160] | |
4e88a9: vinserti128 ymm0,ymm6,xmm7,0x1 | |
4e88af: vpaddd ymm0,ymm0,ymm3 | |
4e88b3: vpcmpeqd ymm3,ymm3,ymm3 | |
# Advance both running index vectors by the constant reloaded from [rsp+0xc0].
4e88b7: vpaddq ymm13,ymm13,ymm2 | |
4e88bb: vpaddq ymm14,ymm14,ymm2 | |
# rcx -= 0x20; repeat until the counter reaches zero.
4e88bf: add rcx,0xffffffffffffffe0 | |
4e88c3: jne 4e85d0 <add_indirect(unsigned long, void*)+0x100> | |
# After the inner loop: reduce the four 8-dword accumulators to one scalar,
# run the outer-loop test, then return.

# Accumulators ymm0 + ymm1 -> horizontal sum of 8 dwords -> ecx.
# (fold upper 128 bits onto lower, then swap 64-bit halves (0x4e), then bring
#  dword 1 down to dword 0 (0xe5), adding after each step.)
4e88c9: vpaddd ymm0,ymm0,ymm1 | |
4e88cd: vextracti128 xmm1,ymm0,0x1 | |
4e88d3: vpaddd xmm0,xmm0,xmm1 | |
4e88d7: vpshufd xmm1,xmm0,0x4e | |
4e88dc: vpaddd xmm0,xmm0,xmm1 | |
4e88e0: vpshufd xmm1,xmm0,0xe5 | |
4e88e5: vpaddd xmm0,xmm0,xmm1 | |
4e88e9: vmovd ecx,xmm0 | |
# Accumulators ymm4 + [rsp+0x20] -> same horizontal sum -> edx.
4e88ed: vpaddd ymm0,ymm4,YMMWORD PTR [rsp+0x20] | |
4e88f3: vextracti128 xmm1,ymm0,0x1 | |
4e88f9: vpaddd xmm0,xmm0,xmm1 | |
4e88fd: vpshufd xmm1,xmm0,0x4e | |
4e8902: vpaddd xmm0,xmm0,xmm1 | |
4e8906: vpshufd xmm1,xmm0,0xe5 | |
4e890b: vpaddd xmm0,xmm0,xmm1 | |
4e890f: vmovd edx,xmm0 | |
# edx = total of both partial sums. NOTE(review): edx is not read again in
# this listing; what consumes the result is not visible here.
4e8913: add edx,ecx | |
# Outer loop: repeat the whole computation r14 times (r14 = first argument).
4e8915: dec r14 | |
4e8918: jne 4e85a0 <add_indirect(unsigned long, void*)+0xd0> | |
# Epilogue: return 0, release the frame, restore callee-saved registers, and
# clear the upper YMM halves before returning.
4e891e: xor eax,eax | |
4e8920: add rsp,0x8208 | |
4e8927: pop rbx | |
4e8928: pop r14 | |
4e892a: vzeroupper | |
4e892d: ret |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment