Skip to content

Instantly share code, notes, and snippets.

What would you like to do?
These are the ~200 instructions clang uses to vectorize:
/*
 * Sums data[offsets[k]] for every k in [0, len), walking `offsets` from the
 * end two entries at a time and keeping two independent accumulators.
 *
 * data    - table that is indexed indirectly; every offsets[k] must be a
 *           valid index into it.
 * offsets - array of `len` indices into `data`.
 * len     - number of entries in `offsets`; must be even and at least 2
 *           (enforced by the assert below).
 *
 * Returns the total, with the usual uint32_t wrap-around on overflow.
 */
uint32_t add_indirect_inner(const uint32_t *data, const uint32_t *offsets, size_t len) {
    assert(len >= 2 && len % 2 == 0);
    uint32_t sum1 = 0, sum2 = 0;
    size_t i = len;
    do {
        /* Two independent loads-of-loads per iteration: 4 loads in total. */
        sum1 += data[offsets[i - 1]];
        sum2 += data[offsets[i - 2]];
        i -= 2;
    } while (i);
    return sum1 + sum2;
}
It's inlined into another function, which is where the memset and a few other things come from, but 95% of the assembly is
from the loop above. It runs more than 2x slower than the non-vectorized version (which runs at 2 cycles/iteration, limited by the 4 loads
per loop).
00000000004e84d0 <add_indirect(unsigned long, void*)>:
# objdump listing (Intel operand order) of add_indirect(unsigned long, void*).
# Visible structure: prologue and memset of a 0x4000-byte stack buffer, constant
# setup, an outer loop headed at 4e85a0 (counted down in r14), an inner gather
# loop headed at 4e85d0 (counted down in rcx), two horizontal reductions, and an
# epilogue that returns 0.
# Prologue: save r14 and rbx, then reserve 0x8208 bytes of stack.
4e84d0: push r14
4e84d2: push rbx
4e84d3: sub rsp,0x8208
# r14 = first argument (rdi); it is the outer-loop counter decremented at 4e8915.
4e84da: mov r14,rdi
# memset(rsp+0x200, 0, 0x4000). rbx keeps the buffer address and is the base
# register of the first-level gathers below.
4e84dd: lea rbx,[rsp+0x200]
4e84e5: mov edx,0x4000
4e84ea: mov rdi,rbx
4e84ed: xor esi,esi
4e84ef: call 406590 <memset@plt>
# rax = rsp+0x4200, the base register of the second-level gathers below.
# NOTE(review): nothing in this listing writes that region - confirm against the caller.
4e84f4: lea rax,[rsp+0x4200]
# ecx is overwritten at 4e85aa before any visible read of this value.
4e84fc: mov ecx,0x7b
# ymm3 = all-ones (each lane compares equal to itself).
4e8501: vpcmpeqd ymm3,ymm3,ymm3
# Broadcast 64-bit constants and park them in stack slots for use inside the loops.
# NOTE(review): the <vtable ...+0x..> names are objdump's nearest-symbol annotations;
# presumably these are constant-pool entries rather than vtable data - confirm.
4e8505: vpbroadcastq ymm5,QWORD PTR [rip+0x182a4a] # 66af58 <vtable for std::_Sp_counted_ptr_inplace<BenchmarkGroup, std::allocator<BenchmarkGroup>, (__gnu_cxx::_Lock_policy)2>+0xa8>
4e850e: vpbroadcastq ymm11,QWORD PTR [rip+0x182a49] # 66af60 <vtable for std::_Sp_counted_ptr_inplace<BenchmarkGroup, std::allocator<BenchmarkGroup>, (__gnu_cxx::_Lock_policy)2>+0xb0>
4e8517: vpbroadcastq ymm15,QWORD PTR [rip+0x182a48] # 66af68 <vtable for std::_Sp_counted_ptr_inplace<BenchmarkGroup, std::allocator<BenchmarkGroup>, (__gnu_cxx::_Lock_policy)2>+0xb8>
4e8520: vbroadcastsd ymm0,QWORD PTR [rip+0x182a47] # 66af70 <vtable for std::_Sp_counted_ptr_inplace<BenchmarkGroup, std::allocator<BenchmarkGroup>, (__gnu_cxx::_Lock_policy)2>+0xc0>
4e8529: vmovups YMMWORD PTR [rsp+0x140],ymm0
4e8532: vbroadcastsd ymm0,QWORD PTR [rip+0x182a3d] # 66af78 <vtable for std::_Sp_counted_ptr_inplace<BenchmarkGroup, std::allocator<BenchmarkGroup>, (__gnu_cxx::_Lock_policy)2>+0xc8>
4e853b: vmovups YMMWORD PTR [rsp+0x120],ymm0
4e8544: vbroadcastsd ymm0,QWORD PTR [rip+0x182a33] # 66af80 <vtable for std::_Sp_counted_ptr_inplace<BenchmarkGroup, std::allocator<BenchmarkGroup>, (__gnu_cxx::_Lock_policy)2>+0xd0>
4e854d: vmovups YMMWORD PTR [rsp+0x100],ymm0
4e8556: vpbroadcastq ymm0,QWORD PTR [rip+0x182a29] # 66af88 <vtable for std::_Sp_counted_ptr_inplace<BenchmarkGroup, std::allocator<BenchmarkGroup>, (__gnu_cxx::_Lock_policy)2>+0xd8>
4e855f: vmovdqu YMMWORD PTR [rsp+0xe0],ymm0
4e8568: vpbroadcastq ymm2,QWORD PTR [rip+0x182a1f] # 66af90 <vtable for std::_Sp_counted_ptr_inplace<BenchmarkGroup, std::allocator<BenchmarkGroup>, (__gnu_cxx::_Lock_policy)2>+0xe0>
4e8571: vmovdqu YMMWORD PTR [rsp+0x160],ymm5
4e857a: vmovdqu YMMWORD PTR [rsp+0xa0],ymm11
4e8583: vmovdqu YMMWORD PTR [rsp+0x80],ymm15
4e858c: vmovdqu YMMWORD PTR [rsp+0xc0],ymm2
# Padding NOPs ahead of the outer-loop head at 4e85a0.
4e8595: nop WORD PTR cs:[rax+rax*1+0x0]
4e859f: nop
# ---- Outer loop head (branch target of the jne at 4e8918) ----
# Zero the running sum kept in memory at [rsp+0x20].
4e85a0: vpxor xmm0,xmm0,xmm0
4e85a4: vmovdqu YMMWORD PTR [rsp+0x20],ymm0
# rcx = inner-loop counter: starts at 0x800 and is stepped by -0x20 at 4e88bf.
4e85aa: mov ecx,0x800
# Zero the three running sums kept in registers (ymm4, ymm1, ymm0).
4e85af: vxorps xmm4,xmm4,xmm4
4e85b3: vxorps xmm1,xmm1,xmm1
4e85b7: vpxor xmm0,xmm0,xmm0
# ymm13 / ymm14 = starting qword index vectors; both are advanced by ymm2 at the
# bottom of the inner loop (4e88b7 / 4e88bb).
4e85bb: vmovdqa ymm13,YMMWORD PTR [rip+0x18b2dd] # 6738a0 <typeinfo for void register_vector<PerfTimer>(std::vector<std::shared_ptr<BenchmarkGroup>, std::allocator<std::shared_ptr<BenchmarkGroup> > >&)::{lambda()#14}+0xe0>
4e85c3: vmovdqa ymm14,YMMWORD PTR [rip+0x18b2b5] # 673880 <typeinfo for void register_vector<PerfTimer>(std::vector<std::shared_ptr<BenchmarkGroup>, std::allocator<std::shared_ptr<BenchmarkGroup> > >&)::{lambda()#14}+0xc0>
# Padding NOP ahead of the inner-loop head at 4e85d0.
4e85cb: nop DWORD PTR [rax+rax*1+0x0]
# ---- Inner loop head (branch target of the jne at 4e88c3) ----
# Spill three running sums; they are re-read as memory operands of vpaddd at
# 4e8808, 4e8819 and 4e8848.
4e85d0: vmovups YMMWORD PTR [rsp+0x1a0],ymm1
4e85d9: vmovdqu YMMWORD PTR [rsp+0x1c0],ymm0
4e85e2: vmovups YMMWORD PTR [rsp+0x1e0],ymm4
# Build gather index vectors: ymm13 / ymm14 plus a per-group constant
# (ymm3 is all-ones on entry to each iteration, see 4e8501 and 4e88b3).
4e85eb: vpaddq ymm2,ymm13,ymm3
4e85ef: vpaddq ymm4,ymm14,ymm3
4e85f3: vpaddq ymm3,ymm13,ymm5
4e85f7: vpaddq ymm6,ymm14,ymm5
# vpcmpeqd r,r,r sets r to all-ones; here it builds gather masks. vpgatherqd
# clears its mask register as it completes, so a fresh all-ones mask is
# materialized before every gather in this loop.
4e85fb: vpcmpeqd xmm5,xmm5,xmm5
4e85ff: vpcmpeqd xmm7,xmm7,xmm7
4e8603: vpcmpeqd xmm8,xmm8,xmm8
4e8608: vpcmpeqd xmm9,xmm9,xmm9
4e860d: vpaddq ymm10,ymm13,ymm11
4e8612: vpaddq ymm11,ymm14,YMMWORD PTR [rsp+0xa0]
4e861b: vpaddq ymm12,ymm13,ymm15
# First-level gathers: four dwords each from [rbx + index*4]
# (presumably offsets[...] of the C loop - confirm against the caller).
4e8620: vpgatherqd xmm0,DWORD PTR [rbx+ymm4*4],xmm5
4e8626: vmovdqa XMMWORD PTR [rsp+0x10],xmm0
4e862c: vpaddq ymm15,ymm14,YMMWORD PTR [rsp+0x80]
4e8635: vpgatherqd xmm5,DWORD PTR [rbx+ymm2*4],xmm7
4e863b: vpcmpeqd xmm7,xmm7,xmm7
4e863f: vpgatherqd xmm4,DWORD PTR [rbx+ymm6*4],xmm8
4e8645: vpcmpeqd xmm0,xmm0,xmm0
4e8649: vpgatherqd xmm8,DWORD PTR [rbx+ymm3*4],xmm9
4e864f: vpcmpeqd xmm2,xmm2,xmm2
4e8653: vpgatherqd xmm9,DWORD PTR [rbx+ymm11*4],xmm7
4e8659: vpcmpeqd xmm7,xmm7,xmm7
4e865d: vpgatherqd xmm11,DWORD PTR [rbx+ymm10*4],xmm0
4e8663: vmovdqu ymm6,YMMWORD PTR [rsp+0x140]
4e866c: vpaddq ymm0,ymm13,ymm6
4e8670: vpgatherqd xmm3,DWORD PTR [rbx+ymm15*4],xmm2
4e8676: vpaddq ymm2,ymm14,ymm6
4e867a: vpgatherqd xmm6,DWORD PTR [rbx+ymm12*4],xmm7
4e8680: vpcmpeqd xmm7,xmm7,xmm7
4e8684: vpgatherqd xmm10,DWORD PTR [rbx+ymm2*4],xmm7
4e868a: vpcmpeqd xmm2,xmm2,xmm2
4e868e: vpgatherqd xmm7,DWORD PTR [rbx+ymm0*4],xmm2
# From here on the gathered dwords are zero-extended to qwords (vpmovzxdq) and
# used as indices for second-level gathers from [rax + index*4]
# (presumably data[offsets[...]] - confirm). The remaining first-level gathers
# from rbx are interleaved with them.
4e8694: vpmovzxdq ymm0,xmm5
4e8699: vmovdqu ymm1,YMMWORD PTR [rsp+0x120]
4e86a2: vpaddq ymm2,ymm14,ymm1
4e86a6: vpcmpeqd xmm5,xmm5,xmm5
4e86aa: vpmovzxdq ymm12,XMMWORD PTR [rsp+0x10]
4e86b1: vpgatherqd xmm15,DWORD PTR [rbx+ymm2*4],xmm5
4e86b7: vpmovzxdq ymm2,xmm4
4e86bc: vpcmpeqd xmm4,xmm4,xmm4
4e86c0: vpgatherqd xmm5,DWORD PTR [rax+ymm12*4],xmm4
4e86c6: vmovdqa XMMWORD PTR [rsp+0x10],xmm5
4e86cc: vpcmpeqd xmm5,xmm5,xmm5
4e86d0: vpgatherqd xmm4,DWORD PTR [rax+ymm0*4],xmm5
4e86d6: vmovdqu YMMWORD PTR [rsp+0x180],ymm4
4e86df: vpcmpeqd xmm0,xmm0,xmm0
4e86e3: vpgatherqd xmm4,DWORD PTR [rax+ymm2*4],xmm0
4e86e9: vmovdqa XMMWORD PTR [rsp+0x70],xmm4
4e86ef: vpmovzxdq ymm0,xmm8
4e86f4: vpcmpeqd xmm2,xmm2,xmm2
4e86f8: vpmovzxdq ymm8,xmm11
4e86fd: vpmovzxdq ymm9,xmm9
4e8702: vpmovzxdq ymm11,xmm3
4e8707: vpgatherqd xmm12,DWORD PTR [rax+ymm0*4],xmm2
4e870d: vpcmpeqd xmm0,xmm0,xmm0
4e8711: vpgatherqd xmm2,DWORD PTR [rax+ymm9*4],xmm0
4e8717: vmovdqa XMMWORD PTR [rsp+0x60],xmm2
4e871d: vpcmpeqd xmm0,xmm0,xmm0
4e8721: vpgatherqd xmm9,DWORD PTR [rax+ymm8*4],xmm0
4e8727: vpcmpeqd xmm2,xmm2,xmm2
4e872b: vpgatherqd xmm0,DWORD PTR [rax+ymm11*4],xmm2
4e8731: vmovdqa XMMWORD PTR [rsp+0x50],xmm0
4e8737: vpaddq ymm2,ymm13,ymm1
4e873b: vpcmpeqd xmm4,xmm4,xmm4
4e873f: vpgatherqd xmm5,DWORD PTR [rbx+ymm2*4],xmm4
4e8745: vpmovzxdq ymm2,xmm6
4e874a: vpcmpeqd xmm4,xmm4,xmm4
4e874e: vpgatherqd xmm11,DWORD PTR [rax+ymm2*4],xmm4
4e8754: vmovdqu ymm0,YMMWORD PTR [rsp+0x100]
4e875d: vpaddq ymm2,ymm14,ymm0
4e8761: vpcmpeqd xmm4,xmm4,xmm4
4e8765: vpgatherqd xmm3,DWORD PTR [rbx+ymm2*4],xmm4
4e876b: vpmovzxdq ymm2,xmm10
4e8770: vpcmpeqd xmm4,xmm4,xmm4
4e8774: vpgatherqd xmm10,DWORD PTR [rax+ymm2*4],xmm4
4e877a: vpaddq ymm2,ymm13,ymm0
4e877e: vpcmpeqd xmm4,xmm4,xmm4
4e8782: vpgatherqd xmm6,DWORD PTR [rbx+ymm2*4],xmm4
4e8788: vpmovzxdq ymm2,xmm7
4e878d: vpcmpeqd xmm4,xmm4,xmm4
4e8791: vpgatherqd xmm7,DWORD PTR [rax+ymm2*4],xmm4
4e8797: vmovdqu ymm0,YMMWORD PTR [rsp+0xe0]
4e87a0: vpaddq ymm2,ymm14,ymm0
4e87a4: vpcmpeqd xmm4,xmm4,xmm4
4e87a8: vpgatherqd xmm8,DWORD PTR [rbx+ymm2*4],xmm4
4e87ae: vpmovzxdq ymm2,xmm15
4e87b3: vpcmpeqd xmm4,xmm4,xmm4
4e87b7: vpgatherqd xmm15,DWORD PTR [rax+ymm2*4],xmm4
4e87bd: vpaddq ymm2,ymm13,ymm0
4e87c1: vpcmpeqd xmm4,xmm4,xmm4
4e87c5: vpgatherqd xmm0,DWORD PTR [rbx+ymm2*4],xmm4
4e87cb: vpmovzxdq ymm2,xmm5
4e87d0: vpcmpeqd xmm4,xmm4,xmm4
4e87d4: vpgatherqd xmm5,DWORD PTR [rax+ymm2*4],xmm4
# Pair 128-bit gather results into 256-bit vectors (vinserti128) and add them
# into the four running sums, interleaved with the last few gathers. The
# constants in ymm11, ymm15, ymm2 and ymm5, which were reused as scratch above,
# are reloaded from their stack slots for the next iteration.
4e87da: vmovdqu ymm1,YMMWORD PTR [rsp+0x180]
4e87e3: vinserti128 ymm2,ymm1,XMMWORD PTR [rsp+0x10],0x1
4e87eb: vpaddd ymm2,ymm2,YMMWORD PTR [rsp+0x20]
4e87f1: vinserti128 ymm4,ymm12,XMMWORD PTR [rsp+0x70],0x1
4e87f9: vpmovzxdq ymm3,xmm3
4e87fe: vpcmpeqd xmm1,xmm1,xmm1
4e8802: vpgatherqd xmm12,DWORD PTR [rax+ymm3*4],xmm1
4e8808: vpaddd ymm1,ymm4,YMMWORD PTR [rsp+0x1e0]
4e8811: vinserti128 ymm3,ymm9,XMMWORD PTR [rsp+0x60],0x1
4e8819: vpaddd ymm9,ymm3,YMMWORD PTR [rsp+0x1a0]
4e8822: vinserti128 ymm4,ymm11,XMMWORD PTR [rsp+0x50],0x1
4e882a: vmovdqu ymm11,YMMWORD PTR [rsp+0xa0]
4e8833: vinserti128 ymm7,ymm7,xmm10,0x1
4e8839: vpcmpeqd xmm3,xmm3,xmm3
4e883d: vpmovzxdq ymm6,xmm6
4e8842: vpgatherqd xmm10,DWORD PTR [rax+ymm6*4],xmm3
4e8848: vpaddd ymm3,ymm4,YMMWORD PTR [rsp+0x1c0]
4e8851: vpaddd ymm2,ymm7,ymm2
4e8855: vmovdqu YMMWORD PTR [rsp+0x20],ymm2
4e885b: vinserti128 ymm2,ymm5,xmm15,0x1
4e8861: vmovdqu ymm15,YMMWORD PTR [rsp+0x80]
4e886a: vpmovzxdq ymm0,xmm0
4e886f: vpmovzxdq ymm4,xmm8
4e8874: vinserti128 ymm5,ymm10,xmm12,0x1
4e887a: vpcmpeqd xmm6,xmm6,xmm6
4e887e: vpgatherqd xmm7,DWORD PTR [rax+ymm4*4],xmm6
4e8884: vpcmpeqd xmm4,xmm4,xmm4
4e8888: vpgatherqd xmm6,DWORD PTR [rax+ymm0*4],xmm4
4e888e: vpaddd ymm4,ymm2,ymm1
4e8892: vmovdqu ymm2,YMMWORD PTR [rsp+0xc0]
4e889b: vpaddd ymm1,ymm5,ymm9
4e88a0: vmovdqu ymm5,YMMWORD PTR [rsp+0x160]
4e88a9: vinserti128 ymm0,ymm6,xmm7,0x1
4e88af: vpaddd ymm0,ymm0,ymm3
# Re-create all-ones in ymm3 and advance both index vectors by the stride in ymm2.
4e88b3: vpcmpeqd ymm3,ymm3,ymm3
4e88b7: vpaddq ymm13,ymm13,ymm2
4e88bb: vpaddq ymm14,ymm14,ymm2
# rcx -= 0x20; keep iterating until it reaches zero.
4e88bf: add rcx,0xffffffffffffffe0
4e88c3: jne 4e85d0 <add_indirect(unsigned long, void*)+0x100>
# Horizontal add of ymm0 + ymm1 down to a single dword in ecx.
4e88c9: vpaddd ymm0,ymm0,ymm1
4e88cd: vextracti128 xmm1,ymm0,0x1
4e88d3: vpaddd xmm0,xmm0,xmm1
4e88d7: vpshufd xmm1,xmm0,0x4e
4e88dc: vpaddd xmm0,xmm0,xmm1
4e88e0: vpshufd xmm1,xmm0,0xe5
4e88e5: vpaddd xmm0,xmm0,xmm1
4e88e9: vmovd ecx,xmm0
# Same reduction for ymm4 + [rsp+0x20], result in edx.
4e88ed: vpaddd ymm0,ymm4,YMMWORD PTR [rsp+0x20]
4e88f3: vextracti128 xmm1,ymm0,0x1
4e88f9: vpaddd xmm0,xmm0,xmm1
4e88fd: vpshufd xmm1,xmm0,0x4e
4e8902: vpaddd xmm0,xmm0,xmm1
4e8906: vpshufd xmm1,xmm0,0xe5
4e890b: vpaddd xmm0,xmm0,xmm1
4e890f: vmovd edx,xmm0
# edx = sum of the two reductions.
# NOTE(review): no instruction in this listing reads edx afterwards.
4e8913: add edx,ecx
# Outer loop: decrement r14 and repeat from 4e85a0 until it reaches zero.
4e8915: dec r14
4e8918: jne 4e85a0 <add_indirect(unsigned long, void*)+0xd0>
# Return 0, release the frame and restore the saved registers; vzeroupper
# clears the upper halves of the ymm registers before returning.
4e891e: xor eax,eax
4e8920: add rsp,0x8208
4e8927: pop rbx
4e8928: pop r14
4e892a: vzeroupper
4e892d: ret
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment