Skip to content

Instantly share code, notes, and snippets.

@thecppzoo
Last active February 22, 2024 20:28
Show Gist options
  • Save thecppzoo/ccb4b9cb1b23bb4695c6cea430213c0b to your computer and use it in GitHub Desktop.
Save thecppzoo/ccb4b9cb1b23bb4695c6cea430213c0b to your computer and use it in GitHub Desktop.
Disassembly of __strlen_avx2 from GLIBC

Since the assembly sources of the AVX2 strlen in glibc are heavily obscured by macros, here is the code that actually runs, as a disassembly (the comments are mine):

Dump of assembler code for function __strlen_avx2 (obtained with GDB's `disassemble` command):

   # __strlen_avx2 as printed by GDB: AT&T operand order (source first, destination last).
   # Register roles visible in this listing:
   #   %rdi       address being scanned (the string argument), then the scan cursor
   #   %rdx       copy of the original address; subtracted at the end to obtain the length
   #   %ymm0      all-zero vector that every vpcmpeqb compares against
   #   %eax/%rax  byte masks produced by vpmovmskb, then the returned length
   #   %ecx/%rcx  combined mask of the 128-byte loop
   # Every return path executes vzeroupper immediately before ret.
   0x00007ffff7bf27e0 <+0>:	endbr64 
   # %eax = low 32 bits of the address (for the boundary check), %rdx = full original address.
   0x00007ffff7bf27e4 <+4>:	mov    %edi,%eax
   0x00007ffff7bf27e6 <+6>:	mov    %rdi,%rdx
   0x00007ffff7bf27e9 <+9>:	vpxor  %xmm0,%xmm0,%xmm0
   # %eax = low 12 bits of the address. If they exceed 0xfe0, a 32-byte load from %rdi
   # would run past the next 4096-byte boundary, so that case is handled at +336.
   0x00007ffff7bf27ed <+13>:	and    $0xfff,%eax
   0x00007ffff7bf27f2 <+18>:	cmp    $0xfe0,%eax
   0x00007ffff7bf27f7 <+23>:	ja     0x7ffff7bf2930 <__strlen_avx2+336>
   # Fast path: compare the first 32 bytes (unaligned access) with zero.
   # Bit i of %eax is set when byte i is zero.
   0x00007ffff7bf27fd <+29>:	vpcmpeqb (%rdi),%ymm0,%ymm1
   0x00007ffff7bf2801 <+33>:	vpmovmskb %ymm1,%eax
   0x00007ffff7bf2805 <+37>:	test   %eax,%eax
   0x00007ffff7bf2807 <+39>:	je     0x7ffff7bf2860 <__strlen_avx2+128>
   # A zero byte is among the first 32 bytes: the index of the lowest set bit is the length.
   0x00007ffff7bf2809 <+41>:	tzcnt  %eax,%eax
   0x00007ffff7bf280d <+45>:	vzeroupper 
   0x00007ffff7bf2810 <+48>:	ret    
   # +49..+63 are padding NOPs: they follow a ret and nothing in this listing branches to them.
   # The four stubs at +64, +80, +96 and +112 ARE reachable: they are the targets of the
   # jne instructions at +143, +156, +169 and +182 respectively.
   0x00007ffff7bf2811 <+49>:	data16 cs nopw 0x0(%rax,%rax,1)
   0x00007ffff7bf281c <+60>:	nopl   0x0(%rax)
   # Reached from +143: zero byte in the block at 0x1(%rdi).
   # length = (%rdi - start) + 1 + tzcnt(mask), computed with 32-bit arithmetic.
   0x00007ffff7bf2820 <+64>:	tzcnt  %eax,%eax
   0x00007ffff7bf2824 <+68>:	sub    %edx,%edi
   0x00007ffff7bf2826 <+70>:	inc    %edi
   0x00007ffff7bf2828 <+72>:	add    %edi,%eax
   0x00007ffff7bf282a <+74>:	vzeroupper 
   0x00007ffff7bf282d <+77>:	ret    
   0x00007ffff7bf282e <+78>:	xchg   %ax,%ax
   # Reached from +156: zero byte in the block at 0x21(%rdi).
   # length = (%rdi - start) + 0x21 + tzcnt(mask).
   0x00007ffff7bf2830 <+80>:	tzcnt  %eax,%eax
   0x00007ffff7bf2834 <+84>:	sub    %edx,%edi
   0x00007ffff7bf2836 <+86>:	add    $0x21,%edi
   0x00007ffff7bf2839 <+89>:	add    %edi,%eax
   0x00007ffff7bf283b <+91>:	vzeroupper 
   0x00007ffff7bf283e <+94>:	ret    
   0x00007ffff7bf283f <+95>:	nop
   # Reached from +169: zero byte in the block at 0x41(%rdi).
   # length = (%rdi - start) + 0x41 + tzcnt(mask).
   0x00007ffff7bf2840 <+96>:	tzcnt  %eax,%eax
   0x00007ffff7bf2844 <+100>:	sub    %edx,%edi
   0x00007ffff7bf2846 <+102>:	add    $0x41,%edi
   0x00007ffff7bf2849 <+105>:	add    %edi,%eax
   0x00007ffff7bf284b <+107>:	vzeroupper 
   0x00007ffff7bf284e <+110>:	ret    
   0x00007ffff7bf284f <+111>:	nop
   # Reached from +182: zero byte in the block at 0x61(%rdi).
   # length = (%rdi - start) + 0x61 + tzcnt(mask).
   0x00007ffff7bf2850 <+112>:	tzcnt  %eax,%eax
   0x00007ffff7bf2854 <+116>:	sub    %edx,%edi
   0x00007ffff7bf2856 <+118>:	add    $0x61,%edi
   0x00007ffff7bf2859 <+121>:	add    %edi,%eax
   0x00007ffff7bf285b <+123>:	vzeroupper 
   0x00007ffff7bf285e <+126>:	ret    
   0x00007ffff7bf285f <+127>:	nop
   # End of the return stubs.
   # Jumped to by +39 (no zero byte among the first 32 bytes).
   # or $0x1f moves %rdi to the last byte of the 32-byte-aligned block that contains it,
   # so 0x1(%rdi) is the next 32-byte boundary.
   0x00007ffff7bf2860 <+128>:	or     $0x1f,%rdi
   # Also jumped to by +356 (page-boundary path), where %rdi has already been or-ed with 0x1f.
   # Check four consecutive 32-byte blocks one at a time. The first of them can overlap
   # bytes the unaligned load at +29 already covered; re-checking them is harmless.
   0x00007ffff7bf2864 <+132>:	vpcmpeqb 0x1(%rdi),%ymm0,%ymm1
   0x00007ffff7bf2869 <+137>:	vpmovmskb %ymm1,%eax
   0x00007ffff7bf286d <+141>:	test   %eax,%eax
   0x00007ffff7bf286f <+143>:	jne    0x7ffff7bf2820 <__strlen_avx2+64>
   # The 0x1/0x21/0x41/0x61 displacements are not a misalignment: the low five bits of
   # %rdi are all ones here, so each displacement addresses a 32-byte-aligned block.
   0x00007ffff7bf2871 <+145>:	vpcmpeqb 0x21(%rdi),%ymm0,%ymm1
   0x00007ffff7bf2876 <+150>:	vpmovmskb %ymm1,%eax
   0x00007ffff7bf287a <+154>:	test   %eax,%eax
   0x00007ffff7bf287c <+156>:	jne    0x7ffff7bf2830 <__strlen_avx2+80>
   0x00007ffff7bf287e <+158>:	vpcmpeqb 0x41(%rdi),%ymm0,%ymm1
   0x00007ffff7bf2883 <+163>:	vpmovmskb %ymm1,%eax
   0x00007ffff7bf2887 <+167>:	test   %eax,%eax
   0x00007ffff7bf2889 <+169>:	jne    0x7ffff7bf2840 <__strlen_avx2+96>
   0x00007ffff7bf288b <+171>:	vpcmpeqb 0x61(%rdi),%ymm0,%ymm1
   0x00007ffff7bf2890 <+176>:	vpmovmskb %ymm1,%eax
   0x00007ffff7bf2894 <+180>:	test   %eax,%eax
   0x00007ffff7bf2896 <+182>:	jne    0x7ffff7bf2850 <__strlen_avx2+112>
   # No zero byte in those four blocks. inc + or $0x7f leave %rdi on the last byte of a
   # 128-byte-aligned block, so 0x1(%rdi) is 128-byte aligned; the loop below relies on
   # that alignment for its vmovdqa loads.
   0x00007ffff7bf2898 <+184>:	inc    %rdi
   0x00007ffff7bf289b <+187>:	or     $0x7f,%rdi
   # One-byte NOP; the loop head at +192 sits at an address that is a multiple of 32.
   0x00007ffff7bf289f <+191>:	nop
   # Main loop, 128 bytes per iteration. The unsigned byte-wise minimum of the four
   # 32-byte blocks contains a zero byte exactly where at least one block has one, so a
   # single compare + mask (%ecx) tests all four blocks.
   #   ymm1 = block 0, ymm2 = min(block 0, block 1), ymm3 = block 2,
   #   ymm4 = min(block 2, block 3), ymm5 = min of all four
   0x00007ffff7bf28a0 <+192>:	vmovdqa 0x1(%rdi),%ymm1
   0x00007ffff7bf28a5 <+197>:	vpminub 0x21(%rdi),%ymm1,%ymm2
   0x00007ffff7bf28aa <+202>:	vmovdqa 0x41(%rdi),%ymm3
   0x00007ffff7bf28af <+207>:	vpminub 0x61(%rdi),%ymm3,%ymm4
   0x00007ffff7bf28b4 <+212>:	vpminub %ymm2,%ymm4,%ymm5
   0x00007ffff7bf28b8 <+216>:	vpcmpeqb %ymm5,%ymm0,%ymm5
   0x00007ffff7bf28bc <+220>:	vpmovmskb %ymm5,%ecx
   # Subtracting -128 advances the cursor by 128 (-128 fits a sign-extended 8-bit
   # immediate, +128 would not). The cursor is advanced before the exit test, so after
   # the loop the 128 bytes just examined start at %rdi - 0x7f.
   0x00007ffff7bf28c0 <+224>:	sub    $0xffffffffffffff80,%rdi
   0x00007ffff7bf28c4 <+228>:	test   %ecx,%ecx
   0x00007ffff7bf28c6 <+230>:	je     0x7ffff7bf28a0 <__strlen_avx2+192>
   # A zero byte lies somewhere in the last 128 bytes; find the block. Block 0 first.
   0x00007ffff7bf28c8 <+232>:	vpcmpeqb %ymm1,%ymm0,%ymm1
   0x00007ffff7bf28cc <+236>:	vpmovmskb %ymm1,%eax
   # From here on %rdi holds the cursor as an offset from the original address.
   0x00007ffff7bf28d0 <+240>:	sub    %rdx,%rdi
   0x00007ffff7bf28d3 <+243>:	test   %eax,%eax
   0x00007ffff7bf28d5 <+245>:	jne    0x7ffff7bf2910 <__strlen_avx2+304>
   # ymm2 = min(block 0, block 1); block 0 has no zero byte on this path, so any zero
   # byte in ymm2 comes from block 1.
   0x00007ffff7bf28d7 <+247>:	vpcmpeqb %ymm2,%ymm0,%ymm2
   0x00007ffff7bf28db <+251>:	vpmovmskb %ymm2,%eax
   0x00007ffff7bf28df <+255>:	test   %eax,%eax
   0x00007ffff7bf28e1 <+257>:	jne    0x7ffff7bf2920 <__strlen_avx2+320>
   # Blocks 0 and 1 are clear. Put the block 2 mask in the low 32 bits of %rax and the
   # combined mask (%ecx) in the high 32 bits: a 64-bit tzcnt then gives the index of the
   # first zero byte counted from the start of block 2, whether it is in block 2 or 3.
   0x00007ffff7bf28e3 <+259>:	vpcmpeqb %ymm3,%ymm0,%ymm3
   0x00007ffff7bf28e7 <+263>:	vpmovmskb %ymm3,%eax
   0x00007ffff7bf28eb <+267>:	shl    $0x20,%rcx
   0x00007ffff7bf28ef <+271>:	or     %rcx,%rax
   0x00007ffff7bf28f2 <+274>:	tzcnt  %rax,%rax
   # Block 2 starts 0x3f bytes below the advanced cursor.
   0x00007ffff7bf28f7 <+279>:	sub    $0x3f,%rdi
   0x00007ffff7bf28fb <+283>:	add    %rdi,%rax
   0x00007ffff7bf28fe <+286>:	vzeroupper 
   0x00007ffff7bf2901 <+289>:	ret    
   # Padding NOPs up to +304.
   0x00007ffff7bf2902 <+290>:	data16 cs nopw 0x0(%rax,%rax,1)
   0x00007ffff7bf290d <+301>:	nopl   (%rax)
   # Reached from +245: zero byte in block 0, which starts 0x7f bytes below the advanced cursor.
   0x00007ffff7bf2910 <+304>:	tzcnt  %eax,%eax
   0x00007ffff7bf2914 <+308>:	sub    $0x7f,%rdi
   0x00007ffff7bf2918 <+312>:	add    %rdi,%rax
   0x00007ffff7bf291b <+315>:	vzeroupper 
   0x00007ffff7bf291e <+318>:	ret    
   0x00007ffff7bf291f <+319>:	nop
   # Reached from +257: zero byte in block 1, which starts 0x5f bytes below the advanced cursor.
   0x00007ffff7bf2920 <+320>:	tzcnt  %eax,%eax
   0x00007ffff7bf2924 <+324>:	sub    $0x5f,%rdi
   0x00007ffff7bf2928 <+328>:	add    %rdi,%rax
   0x00007ffff7bf292b <+331>:	vzeroupper 
   0x00007ffff7bf292e <+334>:	ret    
   0x00007ffff7bf292f <+335>:	nop
   # Reached from +23: the address lies within the last 31 bytes before a 4096-byte
   # boundary. Instead of loading across it, load the 32-byte-aligned block that
   # contains the address: after or $0x1f, that block starts at -0x1f(%rdi).
   0x00007ffff7bf2930 <+336>:	or     $0x1f,%rdi
   0x00007ffff7bf2934 <+340>:	vpcmpeqb -0x1f(%rdi),%ymm0,%ymm1
   0x00007ffff7bf2939 <+345>:	vpmovmskb %ymm1,%eax
   # Shift the mask right by the low five bits of the original address (a 32-bit sarx
   # uses only those bits of the count), discarding the bits of bytes that precede the
   # start. Copies of bit 31 shifted in by the arithmetic shift land above every real
   # bit, so they do not change the tzcnt result.
   0x00007ffff7bf293d <+349>:	sarx   %edx,%eax,%eax
   0x00007ffff7bf2942 <+354>:	test   %eax,%eax
   # No zero byte between the start and the end of this block: continue at +132 with the
   # next aligned block.
   0x00007ffff7bf2944 <+356>:	je     0x7ffff7bf2864 <__strlen_avx2+132>
   # Otherwise the index of the lowest set bit is the length.
   0x00007ffff7bf294a <+362>:	tzcnt  %eax,%eax
   0x00007ffff7bf294e <+366>:	vzeroupper 
   0x00007ffff7bf2951 <+369>:	ret    
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment