Skip to content

Instantly share code, notes, and snippets.

@assp1r1n3
Last active May 5, 2016 02:06
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save assp1r1n3/3e967028299039de8a053025b5dbba5e to your computer and use it in GitHub Desktop.
Save assp1r1n3/3e967028299039de8a053025b5dbba5e to your computer and use it in GitHub Desktop.
Hand assembly vs. Intrinsics
#include <stdint.h>
#include <stddef.h>
#include <time.h>
#include <stdio.h>
#include <stdlib.h>
/*
 * Total number of set bits in the first `len` 64-bit words starting at `buf`.
 *
 * buf is never dereferenced when len is 0, so any pointer value is accepted
 * in that case. Returns the sum of the per-word population counts.
 */
uint64_t builtin_popcnt(const uint64_t* buf, size_t len){
    const uint64_t* word = buf;
    size_t remaining = len;
    uint64_t total = 0;

    /* Count down instead of indexing up; each pass consumes one word. */
    while(remaining-- > 0){
        total += (uint64_t)__builtin_popcountll(*word++);
    }
    return total;
}
/*
 * Benchmark driver.
 *
 * Usage: bench <buffer size in MB>
 *
 * Allocates a buffer of the requested size, fills it with random() values,
 * runs builtin_popcnt over the whole buffer 10000 times and prints the
 * accumulated bit count, the elapsed clock() time and the throughput.
 *
 * Returns 0 after a completed run, -1 on wrong usage, on an invalid size
 * argument, or when the buffer cannot be allocated.
 */
int main(int argc, char** argv){
    if(argc != 2){
        printf("Usage: %s <buffer size in MB>\n", argv[0]);
        return -1;
    }

    /*
     * Parse strictly. atol() accepted trailing garbage and negative numbers,
     * and shifting a negative or very large value left by 20 overflowed.
     * strtol() saturates at LONG_MAX on overflow, which the range check
     * below rejects as well.
     */
    char* end = NULL;
    long megabytes = strtol(argv[1], &end, 10);
    if(end == argv[1] || *end != '\0' || megabytes <= 0
       || (unsigned long)megabytes > (SIZE_MAX >> 20)){
        fprintf(stderr, "Invalid buffer size in MB: %s\n", argv[1]);
        return -1;
    }

    uint64_t size = (uint64_t)megabytes << 20;   /* bytes */
    size_t words = (size_t)(size / 8);           /* 64-bit elements */

    uint64_t* buffer = (uint64_t*)malloc(words * sizeof(*buffer));
    if(buffer == NULL){
        fprintf(stderr, "Failed to allocate %ld MB\n", megabytes);
        return -1;
    }

    /*
     * Write every word before the timed section. Per the original author's
     * note this spoils copy-on-write memory allocation on *nix, so the
     * timing loop does not pay for first-touch of the pages.
     */
    for(size_t i = 0; i < words; i++){
        buffer[i] = random();
    }

    uint64_t count = 0;
    clock_t tic = clock();
    for(size_t i = 0; i < 10000; ++i){
        count += builtin_popcnt(buffer, words);
    }
    clock_t toc = clock();

    /* %llu with an explicit cast is correct on every ABI; %lu only matched uint64_t on LP64. */
    printf("Count: %llu\tElapsed: %f seconds\tSpeed: %f GB/s\n", (unsigned long long)count, (double)(toc - tic) / CLOCKS_PER_SEC, ((10000.0*size)/(((double)(toc - tic)*1e+9) / CLOCKS_PER_SEC)));

    free(buffer);
    return 0;
}
.file "bench.c"
.text
.p2align 4,,15
# uint64_t builtin_popcnt(const uint64_t* buf, size_t len)
# Compiler-generated code (AT&T syntax) for the C function of the same name.
# In:    %rdi = buf, %rsi = len
# Out:   %rax = sum of popcnt over buf[0 .. len)
# Uses:  %rcx, %rdx, %rsi, %rdi, %r8-%r11 and flags as scratch; no stack, no calls.
# Shape: first word, then (len-1) % 8 peeled words, then an 8-way unrolled loop.
.globl builtin_popcnt
.type builtin_popcnt, @function
builtin_popcnt:
.LFB39:
.cfi_startproc
# len == 0 -> return 0 via .L4
testq %rsi, %rsi
je .L4
# %rcx = 8*len - 8 (bytes after the first word); reduced below to (len-1) % 8
leaq -8(,%rsi,8), %rcx
# %rax = popcnt(buf[0]); the accumulator starts with the first word
popcntq (%rdi), %rax
# %r8 = buf + len, the one-past-the-end pointer used by every loop test
leaq (%rdi,%rsi,8), %r8
shrq $3, %rcx
# %rsi is reused as the cursor, starting at &buf[1]
leaq 8(%rdi), %rsi
# %rcx = (len-1) % 8 = number of words to peel before the unrolled loop
andl $7, %ecx
# len == 1 -> nothing left, return
cmpq %r8, %rsi
je .L42
# no remainder -> go straight to the unrolled loop
testq %rcx, %rcx
je .L3
# Dispatch on the remainder: jump into the fall-through chain so that
# exactly %rcx single-word steps run before .L3.
cmpq $1, %rcx
je .L26
cmpq $2, %rcx
je .L27
cmpq $3, %rcx
je .L28
cmpq $4, %rcx
je .L29
cmpq $5, %rcx
je .L30
cmpq $6, %rcx
je .L31
# remainder == 7: first peeled word; cursor is recomputed from buf (%rdi still intact)
popcntq (%rsi), %rdx
leaq 16(%rdi), %rsi
addq %rdx, %rax
.L31:
popcntq (%rsi), %rdi
addq $8, %rsi
addq %rdi, %rax
.L30:
popcntq (%rsi), %r9
addq $8, %rsi
addq %r9, %rax
.L29:
popcntq (%rsi), %r10
addq $8, %rsi
addq %r10, %rax
.L28:
popcntq (%rsi), %r11
addq $8, %rsi
addq %r11, %rax
.L27:
popcntq (%rsi), %rcx
addq $8, %rsi
addq %rcx, %rax
.L26:
popcntq (%rsi), %rdx
addq $8, %rsi
addq %rdx, %rax
# peeled words may have been the whole tail: done when cursor == end
cmpq %r8, %rsi
je .L43
# Main loop: eight words (64 bytes) per iteration. The cursor is advanced
# early, so the remaining seven loads use negative offsets from it.
.L3:
popcntq (%rsi), %rdi
addq %rdi, %rax
addq $64, %rsi
popcntq -56(%rsi), %r9
addq %r9, %rax
popcntq -48(%rsi), %r10
addq %r10, %rax
popcntq -40(%rsi), %r11
popcntq -32(%rsi), %rcx
addq %r11, %rax
popcntq -24(%rsi), %rdx
popcntq -16(%rsi), %rdi
addq %rcx, %rax
popcntq -8(%rsi), %r9
addq %rdx, %rax
addq %rdi, %rax
addq %r9, %rax
cmpq %r8, %rsi
jne .L3
# "rep ret" is a two-byte return; the result is already in %rax
rep ret
.p2align 4,,10
.p2align 3
# exit taken when the peeled words consumed the whole tail
.L43:
rep ret
.p2align 4,,10
.p2align 3
# len == 0: return 0
.L4:
xorl %eax, %eax
.p2align 4,,9
ret
.p2align 4,,10
.p2align 3
# len == 1: %rax already holds popcnt(buf[0])
.L42:
.p2align 4,,10
rep ret
.cfi_endproc
.LFE39:
.size builtin_popcnt, .-builtin_popcnt
.section .rodata.str1.8,"aMS",@progbits,1
.align 8
.LC0:
.string "Usage: %s <buffer size in MB>\n"
.align 8
.LC4:
.string "Count: %lu\tElapsed: %f seconds\tSpeed: %f GB/s\n"
.section .text.startup,"ax",@progbits
.p2align 4,,15
# int main(int argc, char** argv) -- compiler-generated code for the benchmark driver.
# In:   %edi = argc, %rsi = argv
# Out:  %eax = 0 after a benchmark run, -1 after printing the usage text
# Register roles once set up (all callee-saved, so they survive the calls):
#   %rbp = buffer size in bytes
#   %r13 = number of 64-bit words (size / 8)
#   %rbx = buffer base pointer returned by malloc
#   %r12 = fill index during initialisation, later the first clock() value
#   %r14 = allocation size in bytes, later the running bit count
#   %r15 = scratch index during the fill
# builtin_popcnt is inlined into the timing loop rather than called.
.globl main
.type main, @function
main:
.LFB40:
.cfi_startproc
pushq %r15
.cfi_def_cfa_offset 16
.cfi_offset 15, -16
pushq %r14
.cfi_def_cfa_offset 24
.cfi_offset 14, -24
pushq %r13
.cfi_def_cfa_offset 32
.cfi_offset 13, -32
pushq %r12
.cfi_def_cfa_offset 40
.cfi_offset 12, -40
pushq %rbp
.cfi_def_cfa_offset 48
.cfi_offset 6, -48
pushq %rbx
.cfi_def_cfa_offset 56
.cfi_offset 3, -56
# Six pushes plus this adjustment bring the CFA offset to 64, which keeps
# %rsp 16-byte aligned at every call below.
subq $8, %rsp
.cfi_def_cfa_offset 64
# argc != 2 -> usage message
cmpl $2, %edi
jne .L133
# atol(argv[1]) appears here as strtol(argv[1], NULL, 10)
movq 8(%rsi), %rdi
movl $10, %edx
xorl %esi, %esi
call strtol
# %rbp = size in bytes = result << 20
movq %rax, %rbp
salq $20, %rbp
# %r13 = size / 8 = word count; %r14 = word count * 8 = bytes to allocate
movq %rbp, %r13
shrq $3, %r13
leaq 0(,%r13,8), %r14
movq %r14, %rdi
call malloc
# The malloc result is stored in %rbx without a NULL check.
testq %r13, %r13
movq %rax, %rbx
# zero words -> skip the fill
je .L49
# Fill loop, unrolled 8x. buffer[0] is written first, then up to seven
# peeled words, then groups of eight.
movl $1, %r12d
movl $7, %r15d
call random
cmpq %r13, %r12
movq %rax, (%rbx)
je .L49
# %r15 holds the constant 7 loaded above and is not modified before these
# compares, so none of them can match; execution falls through all seven
# peeled stores.
testq %r15, %r15
je .L109
cmpq $1, %r15
je .L110
cmpq $2, %r15
je .L111
cmpq $3, %r15
je .L112
cmpq $4, %r15
je .L113
cmpq $5, %r15
je .L114
cmpq $6, %r15
je .L115
call random
movq %rax, (%rbx,%r12,8)
addq $1, %r12
.L115:
call random
movq %rax, (%rbx,%r12,8)
addq $1, %r12
.L114:
call random
movq %rax, (%rbx,%r12,8)
addq $1, %r12
.L113:
call random
movq %rax, (%rbx,%r12,8)
addq $1, %r12
.L112:
call random
movq %rax, (%rbx,%r12,8)
addq $1, %r12
.L111:
call random
movq %rax, (%rbx,%r12,8)
addq $1, %r12
.L110:
call random
movq %rax, (%rbx,%r12,8)
addq $1, %r12
cmpq %r13, %r12
je .L49
# Eight random() calls per iteration. %r15 carries the index of the word
# being written across each call; %r12 is the group base index.
.L109:
call random
leaq 1(%r12), %r15
movq %rax, (%rbx,%r12,8)
call random
movq %rax, (%rbx,%r15,8)
leaq 2(%r12), %r15
call random
movq %rax, (%rbx,%r15,8)
leaq 3(%r12), %r15
call random
movq %rax, (%rbx,%r15,8)
leaq 4(%r12), %r15
call random
movq %rax, (%rbx,%r15,8)
leaq 5(%r12), %r15
call random
movq %rax, (%rbx,%r15,8)
leaq 6(%r12), %r15
call random
movq %rax, (%rbx,%r15,8)
leaq 7(%r12), %r15
addq $8, %r12
call random
cmpq %r13, %r12
movq %rax, (%rbx,%r15,8)
jne .L109
# Timed section. %r12 = tic, %r8 = 10000 remaining passes, %rax = end of
# buffer, %r14 = running count (reset to 0). %rax and %r8 are caller-saved
# registers, which is fine because the loop below contains no calls.
.L49:
call clock
movl $10000, %r8d
movq %rax, %r12
leaq (%rbx,%r14), %rax
xorl %r14d, %r14d
.p2align 4,,10
.p2align 3
# One pass = one inlined copy of builtin_popcnt over the whole buffer,
# accumulating into %rsi with %rdx as the cursor.
.L48:
testq %r13, %r13
je .L57
leaq 8(%rbx), %rdx
movq %rax, %rdi
popcntq (%rbx), %rsi
# %rdi = ((end - buf - 8) >> 3) & 7 = (words - 1) % 8 = words to peel
subq %rbx, %rdi
subq $8, %rdi
shrq $3, %rdi
andl $7, %edi
cmpq %rax, %rdx
je .L53
testq %rdi, %rdi
je .L54
cmpq $1, %rdi
je .L103
cmpq $2, %rdi
je .L104
cmpq $3, %rdi
je .L105
cmpq $4, %rdi
je .L106
cmpq $5, %rdi
je .L107
cmpq $6, %rdi
je .L108
popcntq 8(%rbx), %rcx
leaq 16(%rbx), %rdx
addq %rcx, %rsi
.L108:
popcntq (%rdx), %r9
addq $8, %rdx
addq %r9, %rsi
.L107:
popcntq (%rdx), %r10
addq $8, %rdx
addq %r10, %rsi
.L106:
popcntq (%rdx), %r11
addq $8, %rdx
addq %r11, %rsi
.L105:
popcntq (%rdx), %r15
addq $8, %rdx
addq %r15, %rsi
.L104:
popcntq (%rdx), %rdi
addq $8, %rdx
addq %rdi, %rsi
.L103:
popcntq (%rdx), %rcx
addq $8, %rdx
addq %rcx, %rsi
cmpq %rax, %rdx
je .L53
# Hot loop: eight popcnt per iteration, 64 bytes per step.
.L54:
popcntq (%rdx), %r9
addq %r9, %rsi
addq $64, %rdx
popcntq -56(%rdx), %r10
addq %r10, %rsi
popcntq -48(%rdx), %r11
addq %r11, %rsi
popcntq -40(%rdx), %r15
popcntq -32(%rdx), %rdi
addq %r15, %rsi
popcntq -24(%rdx), %rcx
popcntq -16(%rdx), %r9
addq %rdi, %rsi
popcntq -8(%rdx), %r10
addq %rcx, %rsi
addq %r9, %rsi
addq %r10, %rsi
cmpq %rax, %rdx
jne .L54
# End of one pass: fold the pass total into the running count and repeat
# until the pass counter reaches zero.
.L53:
addq %rsi, %r14
subq $1, %r8
jne .L48
# Timing done: %rax = toc - tic, %xmm2 = that difference as a double.
.L52:
call clock
subq %r12, %rax
testq %rbp, %rbp
vcvtsi2sdq %rax, %xmm2, %xmm2
# size is unsigned: when its top bit is set the signed conversion below
# would be wrong, so .L55 handles that case.
js .L55
vcvtsi2sdq %rbp, %xmm0, %xmm0
# Build the printf arguments:
#   %rdx  = count
#   %xmm0 = ticks / .LC3                       (elapsed seconds)
#   %xmm1 = (size * .LC1) / ((ticks * .LC2) / .LC3)   (speed)
# .LC4 is the format string, loaded as a 32-bit absolute address.
# %edi = 1 is the leading argument of __printf_chk, %eax = 2 is the number
# of vector registers carrying variadic arguments.
.L56:
vmulsd .LC2(%rip), %xmm2, %xmm3
movq %r14, %rdx
movl $.LC4, %esi
vmovsd .LC3(%rip), %xmm5
movl $1, %edi
movl $2, %eax
vmulsd .LC1(%rip), %xmm0, %xmm4
vdivsd %xmm5, %xmm2, %xmm0
vdivsd %xmm5, %xmm3, %xmm6
vdivsd %xmm6, %xmm4, %xmm1
call __printf_chk
# return 0
xorl %eax, %eax
# Shared epilogue: undo the stack adjustment and restore callee-saved registers.
.L131:
addq $8, %rsp
.cfi_remember_state
.cfi_def_cfa_offset 56
popq %rbx
.cfi_def_cfa_offset 48
popq %rbp
.cfi_def_cfa_offset 40
popq %r12
.cfi_def_cfa_offset 32
popq %r13
.cfi_def_cfa_offset 24
popq %r14
.cfi_def_cfa_offset 16
popq %r15
.cfi_def_cfa_offset 8
ret
.p2align 4,,10
.p2align 3
# Empty buffer: each pass contributes 0, but the pass counter still runs down.
.L57:
.cfi_restore_state
xorl %esi, %esi
addq %rsi, %r14
subq $1, %r8
jne .L48
jmp .L52
# size with the top bit set: halve it, convert as signed, then double the result.
.L55:
shrq %rbp
vcvtsi2sdq %rbp, %xmm1, %xmm1
vaddsd %xmm1, %xmm1, %xmm0
jmp .L56
# Usage path: __printf_chk(1, .LC0, argv[0]) with no vector arguments
# (%eax = 0), then return -1 through the shared epilogue.
.L133:
movq (%rsi), %rdx
movl $1, %edi
movl $.LC0, %esi
xorl %eax, %eax
call __printf_chk
orl $-1, %eax
jmp .L131
.cfi_endproc
.LFE40:
.size main, .-main
# Double-precision constants used by main, each stored as two 32-bit words
# with the low word first.
.section .rodata.cst8,"aM",@progbits,8
.align 8
# .LC1 = 10000.0 (bit pattern 0x40C3880000000000): multiplied with the buffer size
.LC1:
.long 0
.long 1086556160
.align 8
# .LC2 = 1e9 (bit pattern 0x41CDCD6500000000): multiplied with the tick count
.LC2:
.long 0
.long 1104006501
.align 8
# .LC3 = 1000000.0 (bit pattern 0x412E848000000000): the divisor applied to
# clock ticks, i.e. the value CLOCKS_PER_SEC had when this was compiled
.LC3:
.long 0
.long 1093567616
.ident "GCC: (Ubuntu 4.8.4-2ubuntu1~14.04.1) 4.8.4"
.section .note.GNU-stack,"",@progbits
#include <stdint.h>
#include <stddef.h>
/*
 * Total number of set bits in the first `len` 64-bit words starting at `buf`.
 *
 * buf is never dereferenced when len is 0, so any pointer value is accepted
 * in that case. Returns the sum of the per-word population counts.
 */
uint64_t builtin_popcnt(const uint64_t* buf, size_t len){
    const uint64_t* word = buf;
    size_t remaining = len;
    uint64_t total = 0;

    /* Count down instead of indexing up; each pass consumes one word. */
    while(remaining-- > 0){
        total += (uint64_t)__builtin_popcountll(*word++);
    }
    return total;
}
assp1r1n3@pinguino:/tmp$ gcc --std=gnu99 -mpopcnt -O3 -funroll-loops -march=native bench.c -o bench
assp1r1n3@pinguino:/tmp$ ./bench 1
Count: 130000 Elapsed: 0.364957 seconds Speed: 28.731494 GB/s
assp1r1n3@pinguino:/tmp$ ./bench 2
Count: 130000 Elapsed: 0.725931 seconds Speed: 28.889137 GB/s
assp1r1n3@pinguino:/tmp$ ./bench 4
Count: 120000 Elapsed: 1.539693 seconds Speed: 27.241171 GB/s
assp1r1n3@pinguino:/tmp$ ./bench 8
Count: 130000 Elapsed: 3.276062 seconds Speed: 25.605767 GB/s
assp1r1n3@pinguino:/tmp$ gcc --version
gcc (Ubuntu 4.8.4-2ubuntu1~14.04.1) 4.8.4
Copyright (C) 2013 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
assp1r1n3@pinguino:/tmp$ uname -a
Linux pinguino 3.19.0-58-generic #64~14.04.1-Ubuntu SMP Fri Mar 18 19:05:43 UTC 2016 x86_64 x86_64 x86_64 GNU/Linux
assp1r1n3@pinguino:/tmp$ cat /proc/cpuinfo
processor : 0
vendor_id : GenuineIntel
cpu family : 6
model : 70
model name : Intel(R) Core(TM) i7-4870HQ CPU @ 2.50GHz
stepping : 1
microcode : 0xf
cpu MHz : 2494.226
cache size : 6144 KB
physical id : 0
siblings : 1
core id : 0
cpu cores : 1
apicid : 0
initial apicid : 0
fpu : yes
fpu_exception : yes
cpuid level : 13
wp : yes
flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx rdtscp lm constant_tsc nopl xtopology nonstop_tsc eagerfpu pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand hypervisor lahf_lm abm arat pln pts dtherm fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 invpcid xsaveopt
bugs :
bogomips : 4988.45
clflush size : 64
cache_alignment : 64
address sizes : 36 bits physical, 48 bits virtual
power management:
bench: file format elf64-x86-64
Disassembly of section .init:
00000000004004e8 <_init>:
4004e8: 48 83 ec 08 sub $0x8,%rsp
4004ec: 48 8b 05 05 0b 20 00 mov 0x200b05(%rip),%rax # 600ff8 <_DYNAMIC+0x1d0>
4004f3: 48 85 c0 test %rax,%rax
4004f6: 74 05 je 4004fd <_init+0x15>
4004f8: e8 43 00 00 00 callq 400540 <__gmon_start__@plt>
4004fd: 48 83 c4 08 add $0x8,%rsp
400501: c3 retq
Disassembly of section .plt:
0000000000400510 <clock@plt-0x10>:
400510: ff 35 f2 0a 20 00 pushq 0x200af2(%rip) # 601008 <_GLOBAL_OFFSET_TABLE_+0x8>
400516: ff 25 f4 0a 20 00 jmpq *0x200af4(%rip) # 601010 <_GLOBAL_OFFSET_TABLE_+0x10>
40051c: 0f 1f 40 00 nopl 0x0(%rax)
0000000000400520 <clock@plt>:
400520: ff 25 f2 0a 20 00 jmpq *0x200af2(%rip) # 601018 <_GLOBAL_OFFSET_TABLE_+0x18>
400526: 68 00 00 00 00 pushq $0x0
40052b: e9 e0 ff ff ff jmpq 400510 <_init+0x28>
0000000000400530 <__libc_start_main@plt>:
400530: ff 25 ea 0a 20 00 jmpq *0x200aea(%rip) # 601020 <_GLOBAL_OFFSET_TABLE_+0x20>
400536: 68 01 00 00 00 pushq $0x1
40053b: e9 d0 ff ff ff jmpq 400510 <_init+0x28>
0000000000400540 <__gmon_start__@plt>:
400540: ff 25 e2 0a 20 00 jmpq *0x200ae2(%rip) # 601028 <_GLOBAL_OFFSET_TABLE_+0x28>
400546: 68 02 00 00 00 pushq $0x2
40054b: e9 c0 ff ff ff jmpq 400510 <_init+0x28>
0000000000400550 <strtol@plt>:
400550: ff 25 da 0a 20 00 jmpq *0x200ada(%rip) # 601030 <_GLOBAL_OFFSET_TABLE_+0x30>
400556: 68 03 00 00 00 pushq $0x3
40055b: e9 b0 ff ff ff jmpq 400510 <_init+0x28>
0000000000400560 <random@plt>:
400560: ff 25 d2 0a 20 00 jmpq *0x200ad2(%rip) # 601038 <_GLOBAL_OFFSET_TABLE_+0x38>
400566: 68 04 00 00 00 pushq $0x4
40056b: e9 a0 ff ff ff jmpq 400510 <_init+0x28>
0000000000400570 <malloc@plt>:
400570: ff 25 ca 0a 20 00 jmpq *0x200aca(%rip) # 601040 <_GLOBAL_OFFSET_TABLE_+0x40>
400576: 68 05 00 00 00 pushq $0x5
40057b: e9 90 ff ff ff jmpq 400510 <_init+0x28>
0000000000400580 <__printf_chk@plt>:
400580: ff 25 c2 0a 20 00 jmpq *0x200ac2(%rip) # 601048 <_GLOBAL_OFFSET_TABLE_+0x48>
400586: 68 06 00 00 00 pushq $0x6
40058b: e9 80 ff ff ff jmpq 400510 <_init+0x28>
Disassembly of section .text:
0000000000400590 <main>:
400590: 41 57 push %r15
400592: 41 56 push %r14
400594: 41 55 push %r13
400596: 41 54 push %r12
400598: 55 push %rbp
400599: 53 push %rbx
40059a: 48 83 ec 08 sub $0x8,%rsp
40059e: 83 ff 02 cmp $0x2,%edi
4005a1: 0f 85 08 03 00 00 jne 4008af <main+0x31f>
4005a7: 48 8b 7e 08 mov 0x8(%rsi),%rdi
4005ab: ba 0a 00 00 00 mov $0xa,%edx
4005b0: 31 f6 xor %esi,%esi
4005b2: e8 99 ff ff ff callq 400550 <strtol@plt>
4005b7: 48 89 c5 mov %rax,%rbp
4005ba: 48 c1 e5 14 shl $0x14,%rbp
4005be: 49 89 ed mov %rbp,%r13
4005c1: 49 c1 ed 03 shr $0x3,%r13
4005c5: 4e 8d 34 ed 00 00 00 lea 0x0(,%r13,8),%r14
4005cc: 00
4005cd: 4c 89 f7 mov %r14,%rdi
4005d0: e8 9b ff ff ff callq 400570 <malloc@plt>
4005d5: 4d 85 ed test %r13,%r13
4005d8: 48 89 c3 mov %rax,%rbx
4005db: 0f 84 1e 01 00 00 je 4006ff <main+0x16f>
4005e1: 41 bc 01 00 00 00 mov $0x1,%r12d
4005e7: 41 bf 07 00 00 00 mov $0x7,%r15d
4005ed: e8 6e ff ff ff callq 400560 <random@plt>
4005f2: 4d 39 ec cmp %r13,%r12
4005f5: 48 89 03 mov %rax,(%rbx)
4005f8: 0f 84 01 01 00 00 je 4006ff <main+0x16f>
4005fe: 4d 85 ff test %r15,%r15
400601: 0f 84 84 00 00 00 je 40068b <main+0xfb>
400607: 49 83 ff 01 cmp $0x1,%r15
40060b: 74 6c je 400679 <main+0xe9>
40060d: 49 83 ff 02 cmp $0x2,%r15
400611: 74 59 je 40066c <main+0xdc>
400613: 49 83 ff 03 cmp $0x3,%r15
400617: 74 46 je 40065f <main+0xcf>
400619: 49 83 ff 04 cmp $0x4,%r15
40061d: 74 33 je 400652 <main+0xc2>
40061f: 49 83 ff 05 cmp $0x5,%r15
400623: 74 20 je 400645 <main+0xb5>
400625: 49 83 ff 06 cmp $0x6,%r15
400629: 74 0d je 400638 <main+0xa8>
40062b: e8 30 ff ff ff callq 400560 <random@plt>
400630: 4a 89 04 e3 mov %rax,(%rbx,%r12,8)
400634: 49 83 c4 01 add $0x1,%r12
400638: e8 23 ff ff ff callq 400560 <random@plt>
40063d: 4a 89 04 e3 mov %rax,(%rbx,%r12,8)
400641: 49 83 c4 01 add $0x1,%r12
400645: e8 16 ff ff ff callq 400560 <random@plt>
40064a: 4a 89 04 e3 mov %rax,(%rbx,%r12,8)
40064e: 49 83 c4 01 add $0x1,%r12
400652: e8 09 ff ff ff callq 400560 <random@plt>
400657: 4a 89 04 e3 mov %rax,(%rbx,%r12,8)
40065b: 49 83 c4 01 add $0x1,%r12
40065f: e8 fc fe ff ff callq 400560 <random@plt>
400664: 4a 89 04 e3 mov %rax,(%rbx,%r12,8)
400668: 49 83 c4 01 add $0x1,%r12
40066c: e8 ef fe ff ff callq 400560 <random@plt>
400671: 4a 89 04 e3 mov %rax,(%rbx,%r12,8)
400675: 49 83 c4 01 add $0x1,%r12
400679: e8 e2 fe ff ff callq 400560 <random@plt>
40067e: 4a 89 04 e3 mov %rax,(%rbx,%r12,8)
400682: 49 83 c4 01 add $0x1,%r12
400686: 4d 39 ec cmp %r13,%r12
400689: 74 74 je 4006ff <main+0x16f>
40068b: e8 d0 fe ff ff callq 400560 <random@plt>
400690: 4d 8d 7c 24 01 lea 0x1(%r12),%r15
400695: 4a 89 04 e3 mov %rax,(%rbx,%r12,8)
400699: e8 c2 fe ff ff callq 400560 <random@plt>
40069e: 4a 89 04 fb mov %rax,(%rbx,%r15,8)
4006a2: 4d 8d 7c 24 02 lea 0x2(%r12),%r15
4006a7: e8 b4 fe ff ff callq 400560 <random@plt>
4006ac: 4a 89 04 fb mov %rax,(%rbx,%r15,8)
4006b0: 4d 8d 7c 24 03 lea 0x3(%r12),%r15
4006b5: e8 a6 fe ff ff callq 400560 <random@plt>
4006ba: 4a 89 04 fb mov %rax,(%rbx,%r15,8)
4006be: 4d 8d 7c 24 04 lea 0x4(%r12),%r15
4006c3: e8 98 fe ff ff callq 400560 <random@plt>
4006c8: 4a 89 04 fb mov %rax,(%rbx,%r15,8)
4006cc: 4d 8d 7c 24 05 lea 0x5(%r12),%r15
4006d1: e8 8a fe ff ff callq 400560 <random@plt>
4006d6: 4a 89 04 fb mov %rax,(%rbx,%r15,8)
4006da: 4d 8d 7c 24 06 lea 0x6(%r12),%r15
4006df: e8 7c fe ff ff callq 400560 <random@plt>
4006e4: 4a 89 04 fb mov %rax,(%rbx,%r15,8)
4006e8: 4d 8d 7c 24 07 lea 0x7(%r12),%r15
4006ed: 49 83 c4 08 add $0x8,%r12
4006f1: e8 6a fe ff ff callq 400560 <random@plt>
4006f6: 4d 39 ec cmp %r13,%r12
4006f9: 4a 89 04 fb mov %rax,(%rbx,%r15,8)
4006fd: 75 8c jne 40068b <main+0xfb>
4006ff: e8 1c fe ff ff callq 400520 <clock@plt>
400704: 41 b8 10 27 00 00 mov $0x2710,%r8d
40070a: 49 89 c4 mov %rax,%r12
40070d: 4a 8d 04 33 lea (%rbx,%r14,1),%rax
400711: 45 31 f6 xor %r14d,%r14d
400714: 0f 1f 40 00 nopl 0x0(%rax)
400718: 4d 85 ed test %r13,%r13
40071b: 0f 84 6f 01 00 00 je 400890 <main+0x300>
400721: 48 8d 53 08 lea 0x8(%rbx),%rdx
400725: 48 89 c7 mov %rax,%rdi
400728: f3 48 0f b8 33 popcnt (%rbx),%rsi
40072d: 48 29 df sub %rbx,%rdi
400730: 48 83 ef 08 sub $0x8,%rdi
400734: 48 c1 ef 03 shr $0x3,%rdi
400738: 83 e7 07 and $0x7,%edi
40073b: 48 39 c2 cmp %rax,%rdx
40073e: 0f 84 d3 00 00 00 je 400817 <main+0x287>
400744: 48 85 ff test %rdi,%rdi
400747: 74 7e je 4007c7 <main+0x237>
400749: 48 83 ff 01 cmp $0x1,%rdi
40074d: 74 67 je 4007b6 <main+0x226>
40074f: 48 83 ff 02 cmp $0x2,%rdi
400753: 74 55 je 4007aa <main+0x21a>
400755: 48 83 ff 03 cmp $0x3,%rdi
400759: 74 43 je 40079e <main+0x20e>
40075b: 48 83 ff 04 cmp $0x4,%rdi
40075f: 74 31 je 400792 <main+0x202>
400761: 48 83 ff 05 cmp $0x5,%rdi
400765: 74 1f je 400786 <main+0x1f6>
400767: 48 83 ff 06 cmp $0x6,%rdi
40076b: 74 0d je 40077a <main+0x1ea>
40076d: f3 48 0f b8 4b 08 popcnt 0x8(%rbx),%rcx
400773: 48 8d 53 10 lea 0x10(%rbx),%rdx
400777: 48 01 ce add %rcx,%rsi
40077a: f3 4c 0f b8 0a popcnt (%rdx),%r9
40077f: 48 83 c2 08 add $0x8,%rdx
400783: 4c 01 ce add %r9,%rsi
400786: f3 4c 0f b8 12 popcnt (%rdx),%r10
40078b: 48 83 c2 08 add $0x8,%rdx
40078f: 4c 01 d6 add %r10,%rsi
400792: f3 4c 0f b8 1a popcnt (%rdx),%r11
400797: 48 83 c2 08 add $0x8,%rdx
40079b: 4c 01 de add %r11,%rsi
40079e: f3 4c 0f b8 3a popcnt (%rdx),%r15
4007a3: 48 83 c2 08 add $0x8,%rdx
4007a7: 4c 01 fe add %r15,%rsi
4007aa: f3 48 0f b8 3a popcnt (%rdx),%rdi
4007af: 48 83 c2 08 add $0x8,%rdx
4007b3: 48 01 fe add %rdi,%rsi
4007b6: f3 48 0f b8 0a popcnt (%rdx),%rcx
4007bb: 48 83 c2 08 add $0x8,%rdx
4007bf: 48 01 ce add %rcx,%rsi
4007c2: 48 39 c2 cmp %rax,%rdx
4007c5: 74 50 je 400817 <main+0x287>
4007c7: f3 4c 0f b8 0a popcnt (%rdx),%r9
4007cc: 4c 01 ce add %r9,%rsi
4007cf: 48 83 c2 40 add $0x40,%rdx
4007d3: f3 4c 0f b8 52 c8 popcnt -0x38(%rdx),%r10
4007d9: 4c 01 d6 add %r10,%rsi
4007dc: f3 4c 0f b8 5a d0 popcnt -0x30(%rdx),%r11
4007e2: 4c 01 de add %r11,%rsi
4007e5: f3 4c 0f b8 7a d8 popcnt -0x28(%rdx),%r15
4007eb: f3 48 0f b8 7a e0 popcnt -0x20(%rdx),%rdi
4007f1: 4c 01 fe add %r15,%rsi
4007f4: f3 48 0f b8 4a e8 popcnt -0x18(%rdx),%rcx
4007fa: f3 4c 0f b8 4a f0 popcnt -0x10(%rdx),%r9
400800: 48 01 fe add %rdi,%rsi
400803: f3 4c 0f b8 52 f8 popcnt -0x8(%rdx),%r10
400809: 48 01 ce add %rcx,%rsi
40080c: 4c 01 ce add %r9,%rsi
40080f: 4c 01 d6 add %r10,%rsi
400812: 48 39 c2 cmp %rax,%rdx
400815: 75 b0 jne 4007c7 <main+0x237>
400817: 49 01 f6 add %rsi,%r14
40081a: 49 83 e8 01 sub $0x1,%r8
40081e: 0f 85 f4 fe ff ff jne 400718 <main+0x188>
400824: e8 f7 fc ff ff callq 400520 <clock@plt>
400829: 4c 29 e0 sub %r12,%rax
40082c: 48 85 ed test %rbp,%rbp
40082f: c4 e1 eb 2a d0 vcvtsi2sd %rax,%xmm2,%xmm2
400834: 78 6b js 4008a1 <main+0x311>
400836: c4 e1 fb 2a c5 vcvtsi2sd %rbp,%xmm0,%xmm0
40083b: c5 eb 59 1d 8d 03 00 vmulsd 0x38d(%rip),%xmm2,%xmm3 # 400bd0 <_IO_stdin_used+0x60>
400842: 00
400843: 4c 89 f2 mov %r14,%rdx
400846: be 98 0b 40 00 mov $0x400b98,%esi
40084b: c5 fb 10 2d 85 03 00 vmovsd 0x385(%rip),%xmm5 # 400bd8 <_IO_stdin_used+0x68>
400852: 00
400853: bf 01 00 00 00 mov $0x1,%edi
400858: b8 02 00 00 00 mov $0x2,%eax
40085d: c5 fb 59 25 63 03 00 vmulsd 0x363(%rip),%xmm0,%xmm4 # 400bc8 <_IO_stdin_used+0x58>
400864: 00
400865: c5 eb 5e c5 vdivsd %xmm5,%xmm2,%xmm0
400869: c5 e3 5e f5 vdivsd %xmm5,%xmm3,%xmm6
40086d: c5 db 5e ce vdivsd %xmm6,%xmm4,%xmm1
400871: e8 0a fd ff ff callq 400580 <__printf_chk@plt>
400876: 31 c0 xor %eax,%eax
400878: 48 83 c4 08 add $0x8,%rsp
40087c: 5b pop %rbx
40087d: 5d pop %rbp
40087e: 41 5c pop %r12
400880: 41 5d pop %r13
400882: 41 5e pop %r14
400884: 41 5f pop %r15
400886: c3 retq
400887: 66 0f 1f 84 00 00 00 nopw 0x0(%rax,%rax,1)
40088e: 00 00
400890: 31 f6 xor %esi,%esi
400892: 49 01 f6 add %rsi,%r14
400895: 49 83 e8 01 sub $0x1,%r8
400899: 0f 85 79 fe ff ff jne 400718 <main+0x188>
40089f: eb 83 jmp 400824 <main+0x294>
4008a1: 48 d1 ed shr %rbp
4008a4: c4 e1 f3 2a cd vcvtsi2sd %rbp,%xmm1,%xmm1
4008a9: c5 f3 58 c1 vaddsd %xmm1,%xmm1,%xmm0
4008ad: eb 8c jmp 40083b <main+0x2ab>
4008af: 48 8b 16 mov (%rsi),%rdx
4008b2: bf 01 00 00 00 mov $0x1,%edi
4008b7: be 78 0b 40 00 mov $0x400b78,%esi
4008bc: 31 c0 xor %eax,%eax
4008be: e8 bd fc ff ff callq 400580 <__printf_chk@plt>
4008c3: 83 c8 ff or $0xffffffff,%eax
4008c6: eb b0 jmp 400878 <main+0x2e8>
00000000004008c8 <_start>:
4008c8: 31 ed xor %ebp,%ebp
4008ca: 49 89 d1 mov %rdx,%r9
4008cd: 5e pop %rsi
4008ce: 48 89 e2 mov %rsp,%rdx
4008d1: 48 83 e4 f0 and $0xfffffffffffffff0,%rsp
4008d5: 50 push %rax
4008d6: 54 push %rsp
4008d7: 49 c7 c0 60 0b 40 00 mov $0x400b60,%r8
4008de: 48 c7 c1 f0 0a 40 00 mov $0x400af0,%rcx
4008e5: 48 c7 c7 90 05 40 00 mov $0x400590,%rdi
4008ec: e8 3f fc ff ff callq 400530 <__libc_start_main@plt>
4008f1: f4 hlt
4008f2: 66 2e 0f 1f 84 00 00 nopw %cs:0x0(%rax,%rax,1)
4008f9: 00 00 00
4008fc: 0f 1f 40 00 nopl 0x0(%rax)
0000000000400900 <deregister_tm_clones>:
400900: b8 67 10 60 00 mov $0x601067,%eax
400905: 55 push %rbp
400906: 48 2d 60 10 60 00 sub $0x601060,%rax
40090c: 48 83 f8 0e cmp $0xe,%rax
400910: 48 89 e5 mov %rsp,%rbp
400913: 77 02 ja 400917 <deregister_tm_clones+0x17>
400915: 5d pop %rbp
400916: c3 retq
400917: b8 00 00 00 00 mov $0x0,%eax
40091c: 48 85 c0 test %rax,%rax
40091f: 74 f4 je 400915 <deregister_tm_clones+0x15>
400921: 5d pop %rbp
400922: bf 60 10 60 00 mov $0x601060,%edi
400927: ff e0 jmpq *%rax
400929: 0f 1f 80 00 00 00 00 nopl 0x0(%rax)
0000000000400930 <register_tm_clones>:
400930: b8 60 10 60 00 mov $0x601060,%eax
400935: 55 push %rbp
400936: 48 2d 60 10 60 00 sub $0x601060,%rax
40093c: 48 c1 f8 03 sar $0x3,%rax
400940: 48 89 e5 mov %rsp,%rbp
400943: 48 89 c2 mov %rax,%rdx
400946: 48 c1 ea 3f shr $0x3f,%rdx
40094a: 48 01 d0 add %rdx,%rax
40094d: 48 d1 f8 sar %rax
400950: 75 02 jne 400954 <register_tm_clones+0x24>
400952: 5d pop %rbp
400953: c3 retq
400954: ba 00 00 00 00 mov $0x0,%edx
400959: 48 85 d2 test %rdx,%rdx
40095c: 74 f4 je 400952 <register_tm_clones+0x22>
40095e: 5d pop %rbp
40095f: 48 89 c6 mov %rax,%rsi
400962: bf 60 10 60 00 mov $0x601060,%edi
400967: ff e2 jmpq *%rdx
400969: 0f 1f 80 00 00 00 00 nopl 0x0(%rax)
0000000000400970 <__do_global_dtors_aux>:
400970: 80 3d e9 06 20 00 00 cmpb $0x0,0x2006e9(%rip) # 601060 <__TMC_END__>
400977: 75 11 jne 40098a <__do_global_dtors_aux+0x1a>
400979: 55 push %rbp
40097a: 48 89 e5 mov %rsp,%rbp
40097d: e8 7e ff ff ff callq 400900 <deregister_tm_clones>
400982: 5d pop %rbp
400983: c6 05 d6 06 20 00 01 movb $0x1,0x2006d6(%rip) # 601060 <__TMC_END__>
40098a: f3 c3 repz retq
40098c: 0f 1f 40 00 nopl 0x0(%rax)
0000000000400990 <frame_dummy>:
400990: 48 83 3d 88 04 20 00 cmpq $0x0,0x200488(%rip) # 600e20 <__JCR_END__>
400997: 00
400998: 74 1e je 4009b8 <frame_dummy+0x28>
40099a: b8 00 00 00 00 mov $0x0,%eax
40099f: 48 85 c0 test %rax,%rax
4009a2: 74 14 je 4009b8 <frame_dummy+0x28>
4009a4: 55 push %rbp
4009a5: bf 20 0e 60 00 mov $0x600e20,%edi
4009aa: 48 89 e5 mov %rsp,%rbp
4009ad: ff d0 callq *%rax
4009af: 5d pop %rbp
4009b0: e9 7b ff ff ff jmpq 400930 <register_tm_clones>
4009b5: 0f 1f 00 nopl (%rax)
4009b8: e9 73 ff ff ff jmpq 400930 <register_tm_clones>
4009bd: 0f 1f 00 nopl (%rax)
00000000004009c0 <builtin_popcnt>:
4009c0: 48 85 f6 test %rsi,%rsi
4009c3: 0f 84 07 01 00 00 je 400ad0 <builtin_popcnt+0x110>
4009c9: 48 8d 0c f5 f8 ff ff lea -0x8(,%rsi,8),%rcx
4009d0: ff
4009d1: f3 48 0f b8 07 popcnt (%rdi),%rax
4009d6: 4c 8d 04 f7 lea (%rdi,%rsi,8),%r8
4009da: 48 c1 e9 03 shr $0x3,%rcx
4009de: 48 8d 77 08 lea 0x8(%rdi),%rsi
4009e2: 83 e1 07 and $0x7,%ecx
4009e5: 4c 39 c6 cmp %r8,%rsi
4009e8: 0f 84 ea 00 00 00 je 400ad8 <builtin_popcnt+0x118>
4009ee: 48 85 c9 test %rcx,%rcx
4009f1: 74 7d je 400a70 <builtin_popcnt+0xb0>
4009f3: 48 83 f9 01 cmp $0x1,%rcx
4009f7: 74 66 je 400a5f <builtin_popcnt+0x9f>
4009f9: 48 83 f9 02 cmp $0x2,%rcx
4009fd: 74 54 je 400a53 <builtin_popcnt+0x93>
4009ff: 48 83 f9 03 cmp $0x3,%rcx
400a03: 74 42 je 400a47 <builtin_popcnt+0x87>
400a05: 48 83 f9 04 cmp $0x4,%rcx
400a09: 74 30 je 400a3b <builtin_popcnt+0x7b>
400a0b: 48 83 f9 05 cmp $0x5,%rcx
400a0f: 74 1e je 400a2f <builtin_popcnt+0x6f>
400a11: 48 83 f9 06 cmp $0x6,%rcx
400a15: 74 0c je 400a23 <builtin_popcnt+0x63>
400a17: f3 48 0f b8 16 popcnt (%rsi),%rdx
400a1c: 48 8d 77 10 lea 0x10(%rdi),%rsi
400a20: 48 01 d0 add %rdx,%rax
400a23: f3 48 0f b8 3e popcnt (%rsi),%rdi
400a28: 48 83 c6 08 add $0x8,%rsi
400a2c: 48 01 f8 add %rdi,%rax
400a2f: f3 4c 0f b8 0e popcnt (%rsi),%r9
400a34: 48 83 c6 08 add $0x8,%rsi
400a38: 4c 01 c8 add %r9,%rax
400a3b: f3 4c 0f b8 16 popcnt (%rsi),%r10
400a40: 48 83 c6 08 add $0x8,%rsi
400a44: 4c 01 d0 add %r10,%rax
400a47: f3 4c 0f b8 1e popcnt (%rsi),%r11
400a4c: 48 83 c6 08 add $0x8,%rsi
400a50: 4c 01 d8 add %r11,%rax
400a53: f3 48 0f b8 0e popcnt (%rsi),%rcx
400a58: 48 83 c6 08 add $0x8,%rsi
400a5c: 48 01 c8 add %rcx,%rax
400a5f: f3 48 0f b8 16 popcnt (%rsi),%rdx
400a64: 48 83 c6 08 add $0x8,%rsi
400a68: 48 01 d0 add %rdx,%rax
400a6b: 4c 39 c6 cmp %r8,%rsi
400a6e: 74 58 je 400ac8 <builtin_popcnt+0x108>
400a70: f3 48 0f b8 3e popcnt (%rsi),%rdi
400a75: 48 01 f8 add %rdi,%rax
400a78: 48 83 c6 40 add $0x40,%rsi
400a7c: f3 4c 0f b8 4e c8 popcnt -0x38(%rsi),%r9
400a82: 4c 01 c8 add %r9,%rax
400a85: f3 4c 0f b8 56 d0 popcnt -0x30(%rsi),%r10
400a8b: 4c 01 d0 add %r10,%rax
400a8e: f3 4c 0f b8 5e d8 popcnt -0x28(%rsi),%r11
400a94: f3 48 0f b8 4e e0 popcnt -0x20(%rsi),%rcx
400a9a: 4c 01 d8 add %r11,%rax
400a9d: f3 48 0f b8 56 e8 popcnt -0x18(%rsi),%rdx
400aa3: f3 48 0f b8 7e f0 popcnt -0x10(%rsi),%rdi
400aa9: 48 01 c8 add %rcx,%rax
400aac: f3 4c 0f b8 4e f8 popcnt -0x8(%rsi),%r9
400ab2: 48 01 d0 add %rdx,%rax
400ab5: 48 01 f8 add %rdi,%rax
400ab8: 4c 01 c8 add %r9,%rax
400abb: 4c 39 c6 cmp %r8,%rsi
400abe: 75 b0 jne 400a70 <builtin_popcnt+0xb0>
400ac0: f3 c3 repz retq
400ac2: 66 0f 1f 44 00 00 nopw 0x0(%rax,%rax,1)
400ac8: f3 c3 repz retq
400aca: 66 0f 1f 44 00 00 nopw 0x0(%rax,%rax,1)
400ad0: 31 c0 xor %eax,%eax
400ad2: c3 retq
400ad3: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1)
400ad8: 0f 1f 84 00 00 00 00 nopl 0x0(%rax,%rax,1)
400adf: 00
400ae0: f3 c3 repz retq
400ae2: 66 2e 0f 1f 84 00 00 nopw %cs:0x0(%rax,%rax,1)
400ae9: 00 00 00
400aec: 0f 1f 40 00 nopl 0x0(%rax)
0000000000400af0 <__libc_csu_init>:
400af0: 41 57 push %r15
400af2: 41 89 ff mov %edi,%r15d
400af5: 41 56 push %r14
400af7: 49 89 f6 mov %rsi,%r14
400afa: 41 55 push %r13
400afc: 49 89 d5 mov %rdx,%r13
400aff: 41 54 push %r12
400b01: 4c 8d 25 08 03 20 00 lea 0x200308(%rip),%r12 # 600e10 <__frame_dummy_init_array_entry>
400b08: 55 push %rbp
400b09: 48 8d 2d 08 03 20 00 lea 0x200308(%rip),%rbp # 600e18 <__init_array_end>
400b10: 53 push %rbx
400b11: 4c 29 e5 sub %r12,%rbp
400b14: 31 db xor %ebx,%ebx
400b16: 48 c1 fd 03 sar $0x3,%rbp
400b1a: 48 83 ec 08 sub $0x8,%rsp
400b1e: e8 c5 f9 ff ff callq 4004e8 <_init>
400b23: 48 85 ed test %rbp,%rbp
400b26: 74 1e je 400b46 <__libc_csu_init+0x56>
400b28: 0f 1f 84 00 00 00 00 nopl 0x0(%rax,%rax,1)
400b2f: 00
400b30: 4c 89 ea mov %r13,%rdx
400b33: 4c 89 f6 mov %r14,%rsi
400b36: 44 89 ff mov %r15d,%edi
400b39: 41 ff 14 dc callq *(%r12,%rbx,8)
400b3d: 48 83 c3 01 add $0x1,%rbx
400b41: 48 39 eb cmp %rbp,%rbx
400b44: 75 ea jne 400b30 <__libc_csu_init+0x40>
400b46: 48 83 c4 08 add $0x8,%rsp
400b4a: 5b pop %rbx
400b4b: 5d pop %rbp
400b4c: 41 5c pop %r12
400b4e: 41 5d pop %r13
400b50: 41 5e pop %r14
400b52: 41 5f pop %r15
400b54: c3 retq
400b55: 66 66 2e 0f 1f 84 00 data32 nopw %cs:0x0(%rax,%rax,1)
400b5c: 00 00 00 00
0000000000400b60 <__libc_csu_fini>:
400b60: f3 c3 repz retq
Disassembly of section .fini:
0000000000400b64 <_fini>:
400b64: 48 83 ec 08 sub $0x8,%rsp
400b68: 48 83 c4 08 add $0x8,%rsp
400b6c: c3 retq
@assp1r1n3
Copy link
Author

This code compiles optimally (even without extra movl instructions!) on gcc 4.8.1. However, clang 3.4.1 fails to unroll the loop.

@assp1r1n3
Copy link
Author

NOTE

I use --std=gnu99 so that random() can be called without defining any extra feature-test macros. It has no influence on performance!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment