Skip to content

Instantly share code, notes, and snippets.

@ChrisLundquist
Last active August 29, 2015 14:05
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ChrisLundquist/7f98cd7d120abb6edaa3 to your computer and use it in GitHub Desktop.
Save ChrisLundquist/7f98cd7d120abb6edaa3 to your computer and use it in GitHub Desktop.
$ gcc -Wall -Wextra -pedantic -std=c11 -O3 -S -march=corei7 main.c
# Gives you the popcnt assembly, GCC knows we have the instruction
$ gcc -Wall -Wextra -pedantic -std=c11 -O3 -S main.c
# Gives you the mask assembly
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#define ELEMENT_COUNT 1000000000
void generate_data(short* data, int seed) {
srand(seed);
for(uint64_t i = 0; i < ELEMENT_COUNT; ++i) {
data[i] = (short) rand();
}
}
void generate_table(char* table) {
for(unsigned i = 0; i < 256; ++i)
table[i] = __builtin_popcount(i);
}
int main(int argc, char* argv[]) {
short* data = malloc( ELEMENT_COUNT * sizeof(short));
uint64_t sum = 0;
int seed = 0; /* so we can reproduce the output */
srand(seed);
char* table = malloc( 256 * sizeof(char));
generate_table(table);
generate_data(data, seed);
for(uint64_t i = 0; i < ELEMENT_COUNT * (sizeof(short) / sizeof(char)); ++i) {
sum += table[((char*) data)[i]]; /* walk byte at a time */
}
printf("Set bit count was: %llu\n", sum);
free(data);
return 0;
}
.section __TEXT,__text,regular,pure_instructions
.globl _generate_data
.align 4, 0x90
_generate_data: ## @generate_data
.cfi_startproc
## BB#0:
pushq %rbp
Ltmp3:
.cfi_def_cfa_offset 16
Ltmp4:
.cfi_offset %rbp, -16
movq %rsp, %rbp
Ltmp5:
.cfi_def_cfa_register %rbp
pushq %r14
pushq %rbx
Ltmp6:
.cfi_offset %rbx, -32
Ltmp7:
.cfi_offset %r14, -24
movq %rdi, %r14
movl %esi, %edi
callq _srand
xorl %ebx, %ebx
.align 4, 0x90
LBB0_1: ## =>This Inner Loop Header: Depth=1
callq _rand
movw %ax, (%r14,%rbx,2)
incq %rbx
cmpq $1000000000, %rbx ## imm = 0x3B9ACA00
jne LBB0_1
## BB#2:
popq %rbx
popq %r14
popq %rbp
ret
.cfi_endproc
.globl _generate_table
.align 4, 0x90
_generate_table: ## @generate_table
.cfi_startproc
## BB#0:
pushq %rbp
Ltmp10:
.cfi_def_cfa_offset 16
Ltmp11:
.cfi_offset %rbp, -16
movq %rsp, %rbp
Ltmp12:
.cfi_def_cfa_register %rbp
xorl %eax, %eax
.align 4, 0x90
LBB1_1: ## =>This Inner Loop Header: Depth=1
movl %eax, %ecx
shrl %ecx
andl $1431655765, %ecx ## imm = 0x55555555
movl %eax, %edx
subl %ecx, %edx
movl %edx, %ecx
andl $858993459, %ecx ## imm = 0x33333333
shrl $2, %edx
andl $858993459, %edx ## imm = 0x33333333
addl %ecx, %edx
movl %edx, %ecx
shrl $4, %ecx
addl %edx, %ecx
andl $252645135, %ecx ## imm = 0xF0F0F0F
imull $16843009, %ecx, %ecx ## imm = 0x1010101
shrl $24, %ecx
movb %cl, (%rdi,%rax)
incq %rax
cmpq $255, %rax
jne LBB1_1
## BB#2:
popq %rbp
ret
.cfi_endproc
.globl _main
.align 4, 0x90
_main: ## @main
.cfi_startproc
## BB#0:
pushq %rbp
Ltmp16:
.cfi_def_cfa_offset 16
Ltmp17:
.cfi_offset %rbp, -16
movq %rsp, %rbp
Ltmp18:
.cfi_def_cfa_register %rbp
pushq %r15
pushq %r14
pushq %r13
pushq %r12
pushq %rbx
pushq %rax
Ltmp19:
.cfi_offset %rbx, -56
Ltmp20:
.cfi_offset %r12, -48
Ltmp21:
.cfi_offset %r13, -40
Ltmp22:
.cfi_offset %r14, -32
Ltmp23:
.cfi_offset %r15, -24
movl $2000000000, %edi ## imm = 0x77359400
callq _malloc
movq %rax, %r14
xorl %ebx, %ebx
xorl %edi, %edi
callq _srand
movl $255, %edi
callq _malloc
movq %rax, %r15
.align 4, 0x90
LBB2_1: ## =>This Inner Loop Header: Depth=1
movl %ebx, %eax
shrl %eax
andl $1431655765, %eax ## imm = 0x55555555
movl %ebx, %ecx
subl %eax, %ecx
movl %ecx, %eax
andl $858993459, %eax ## imm = 0x33333333
shrl $2, %ecx
andl $858993459, %ecx ## imm = 0x33333333
addl %eax, %ecx
movl %ecx, %eax
shrl $4, %eax
addl %ecx, %eax
andl $252645135, %eax ## imm = 0xF0F0F0F
imull $16843009, %eax, %eax ## imm = 0x1010101
shrl $24, %eax
movb %al, (%r15,%rbx)
incq %rbx
cmpq $255, %rbx
jne LBB2_1
## BB#2: ## %generate_table.exit
xorl %r13d, %r13d
xorl %edi, %edi
callq _srand
movl $1, %r12d
xorl %ebx, %ebx
.align 4, 0x90
LBB2_3: ## =>This Inner Loop Header: Depth=1
callq _rand
movw %ax, (%r14,%rbx,2)
incq %rbx
cmpq $1000000000, %rbx ## imm = 0x3B9ACA00
jne LBB2_3
## BB#4:
xorl %esi, %esi
.align 4, 0x90
LBB2_5: ## %vector.body
## =>This Inner Loop Header: Depth=1
movq %rsi, %rax
movq %r13, %rcx
movsbq -1(%r14,%r12), %rdx
movsbq (%r14,%r12), %rsi
movsbq (%r15,%rdx), %r13
movsbq (%r15,%rsi), %rsi
addq %rcx, %r13
addq %rax, %rsi
addq $2, %r12
cmpq $2000000001, %r12 ## imm = 0x77359401
jne LBB2_5
## BB#6: ## %middle.block
addq %r13, %rsi
leaq L_.str(%rip), %rdi
xorl %eax, %eax
callq _printf
movq %r14, %rdi
callq _free
xorl %eax, %eax
addq $8, %rsp
popq %rbx
popq %r12
popq %r13
popq %r14
popq %r15
popq %rbp
ret
.cfi_endproc
.section __TEXT,__cstring,cstring_literals
L_.str: ## @.str
.asciz "Set bit count was: %llu\n"
.subsections_via_symbols
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#define ELEMENT_COUNT 10000000
int main(int argc, char* argv[]) {
short* data = malloc( ELEMENT_COUNT * sizeof(short));
uint64_t sum = 0;
int seed = 0; /* so we can reproduce the output */
srand(seed);
for(uint64_t i = 0; i < ELEMENT_COUNT; ++i) {
data[i] = (short) rand();
}
for(uint64_t i = 0; i < ELEMENT_COUNT; ++i) {
sum += __builtin_popcount(data[i]);
}
printf("Set bit count was: %llu\n", sum);
free(data);
return 0;
}
.section __TEXT,__text,regular,pure_instructions
.globl _generate_data
.align 4, 0x90
_generate_data: ## @generate_data
.cfi_startproc
## BB#0:
pushq %rbp
Ltmp2:
.cfi_def_cfa_offset 16
Ltmp3:
.cfi_offset %rbp, -16
movq %rsp, %rbp
Ltmp4:
.cfi_def_cfa_register %rbp
popq %rbp
ret
.cfi_endproc
.globl _main
.align 4, 0x90
_main: ## @main
.cfi_startproc
## BB#0:
pushq %rbp
Ltmp8:
.cfi_def_cfa_offset 16
Ltmp9:
.cfi_offset %rbp, -16
movq %rsp, %rbp
Ltmp10:
.cfi_def_cfa_register %rbp
pushq %r15
pushq %r14
pushq %rbx
pushq %rax
Ltmp11:
.cfi_offset %rbx, -40
Ltmp12:
.cfi_offset %r14, -32
Ltmp13:
.cfi_offset %r15, -24
movl $20000000, %edi ## imm = 0x1312D00
callq _malloc
movq %rax, %r14
xorl %r15d, %r15d
xorl %edi, %edi
callq _srand
xorl %ebx, %ebx
.align 4, 0x90
LBB1_1: ## =>This Inner Loop Header: Depth=1
callq _rand
movw %ax, (%r14,%rbx,2)
incq %rbx
cmpq $10000000, %rbx ## imm = 0x989680
jne LBB1_1
## BB#2:
xorl %esi, %esi
.align 4, 0x90
LBB1_3: ## %.preheader
## =>This Inner Loop Header: Depth=1
movq %rsi, %rax
movswl (%r14,%r15,2), %ecx
movl %ecx, %edx
shrl %edx
andl $1431655765, %edx ## imm = 0x55555555
subl %edx, %ecx
movl %ecx, %edx
andl $858993459, %edx ## imm = 0x33333333
shrl $2, %ecx
andl $858993459, %ecx ## imm = 0x33333333
addl %edx, %ecx
movl %ecx, %edx
shrl $4, %edx
addl %ecx, %edx
andl $252645135, %edx ## imm = 0xF0F0F0F
imull $16843009, %edx, %esi ## imm = 0x1010101
shrl $24, %esi
addq %rax, %rsi
incq %r15
cmpq $10000000, %r15 ## imm = 0x989680
jne LBB1_3
## BB#4:
leaq L_.str(%rip), %rdi
xorl %eax, %eax
callq _printf
movq %r14, %rdi
callq _free
xorl %eax, %eax
addq $8, %rsp
popq %rbx
popq %r14
popq %r15
popq %rbp
ret
.cfi_endproc
.section __TEXT,__cstring,cstring_literals
L_.str: ## @.str
.asciz "Set bit count was: %llu\n"
.subsections_via_symbols
.section __TEXT,__text,regular,pure_instructions
.globl _generate_data
.align 4, 0x90
_generate_data: ## @generate_data
.cfi_startproc
## BB#0:
pushq %rbp
Ltmp2:
.cfi_def_cfa_offset 16
Ltmp3:
.cfi_offset %rbp, -16
movq %rsp, %rbp
Ltmp4:
.cfi_def_cfa_register %rbp
popq %rbp
ret
.cfi_endproc
.globl _main
.align 4, 0x90
_main: ## @main
.cfi_startproc
## BB#0:
pushq %rbp
Ltmp8:
.cfi_def_cfa_offset 16
Ltmp9:
.cfi_offset %rbp, -16
movq %rsp, %rbp
Ltmp10:
.cfi_def_cfa_register %rbp
pushq %r15
pushq %r14
pushq %rbx
pushq %rax
Ltmp11:
.cfi_offset %rbx, -40
Ltmp12:
.cfi_offset %r14, -32
Ltmp13:
.cfi_offset %r15, -24
movl $20000000, %edi ## imm = 0x1312D00
callq _malloc
movq %rax, %r14
xorl %r15d, %r15d
xorl %edi, %edi
callq _srand
xorl %ebx, %ebx
.align 4, 0x90
LBB1_1: ## =>This Inner Loop Header: Depth=1
callq _rand
movw %ax, (%r14,%rbx,2)
incq %rbx
cmpq $10000000, %rbx ## imm = 0x989680
jne LBB1_1
## BB#2:
xorl %esi, %esi
.align 4, 0x90
LBB1_3: ## %.preheader
## =>This Inner Loop Header: Depth=1
movq %rsi, %rax
movswl (%r14,%r15,2), %ecx
popcntl %ecx, %esi
addq %rax, %rsi
incq %r15
cmpq $10000000, %r15 ## imm = 0x989680
jne LBB1_3
## BB#4:
leaq L_.str(%rip), %rdi
xorl %eax, %eax
callq _printf
movq %r14, %rdi
callq _free
xorl %eax, %eax
addq $8, %rsp
popq %rbx
popq %r14
popq %r15
popq %rbp
ret
.cfi_endproc
.section __TEXT,__cstring,cstring_literals
L_.str: ## @.str
.asciz "Set bit count was: %llu\n"
.subsections_via_symbols
@ChrisLundquist
Copy link
Author

Some casual benchmarking (increasing the array size by 100x for both tests) shows it is about 10% faster. I was expecting more.

# Masks
Mjolnir:google clundquist$ time ./a.out 
Set bit count was: 15999732481

real    0m9.937s
user    0m9.255s
sys 0m0.680s
Mjolnir:google clundquist$ time ./a.out 
Set bit count was: 15999732481

real    0m9.898s
user    0m9.195s
sys 0m0.699s
# popcnt
Mjolnir:google clundquist$ time ./a.out
Set bit count was: 15999732481

real    0m9.114s
user    0m8.407s
sys 0m0.705s
Mjolnir:google clundquist$ time ./a.out
Set bit count was: 15999732481

real    0m9.140s
user    0m8.452s
sys 0m0.684s

@ChrisLundquist
Copy link
Author

Added lookup.c which uses a lookup table. It ran significantly faster, and I was curious as to why. Turns out it "cheated" and the auto vectorizer was doing movq. I did the same thing with popcnt casting our short* to an uint64_t* and popcnt won again.

At some point though, the latency of the popcount instruction for large register sizes might exceed the latency of the lookup table (since it will probably be in L1 or L2). The autovectorizer complicates our analysis though because you can do this task in parallel many different ways. Partitioning the data set into jobs, using OpenCL or something to launch a bunch of threads, etc.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment