Created
November 11, 2011 22:04
-
-
Save sa1/1359457 to your computer and use it in GitHub Desktop.
Hamming Weight Timer for 64 bit Intel PCs (SSE 4.2)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Hamming Weight timer on 64 bit PCs. | |
#include <stdio.h> | |
#include <smmintrin.h> | |
#include <stdint.h> | |
#include <limits.h> | |
#define PROC_SPEED 3410884000.0 | |
#define NO_INSTRUCTIONS 1000000000 | |
#define TIME(function) do { \ | |
start = timer(); \ | |
function(); \ | |
stop = timer(); \ | |
printf("Cpu Cycles taken %lld\n",stop-start); \ | |
printf("Time Taken(at 3.4Ghz) = %fs \n", (stop-start)/PROC_SPEED); \ | |
printf("---------- \n"); \ | |
} while (0) | |
inline uint64_t timer() { | |
uint32_t lo, hi; | |
__asm__ __volatile__ ( | |
"xorl %%eax, %%eax\n" | |
"cpuid\n" | |
"rdtsc\n" | |
: "=a" (lo), "=d" (hi) | |
: | |
: "%ebx", "%ecx"); | |
return (uint64_t)hi << 32 | lo; | |
} | |
//Intel 64 bit SSE4.2(Nehalem onwards) popcnt instruction | |
int intelpopcnt() | |
{ | |
int i, j; | |
for(i = 0; i < NO_INSTRUCTIONS; i++) | |
{ | |
j = _mm_popcnt_u64(i); | |
} | |
} | |
//GCC built in count. | |
int builtincount() | |
{ | |
int i, j; | |
for(i = 0; i < NO_INSTRUCTIONS; i++) | |
{ | |
j = __builtin_popcount(i); | |
} | |
} | |
//Swar Algorithm | |
int swar() | |
{ | |
int i, j, k; | |
for(k = 0; k < NO_INSTRUCTIONS; k++) | |
{ | |
i = k; | |
i = i - ((i >> 1) & 0x55555555); | |
i = (i & 0x33333333) + ((i >> 2) & 0x33333333); | |
j = (((i + (i >> 4)) & 0x0F0F0F0F) * 0x01010101) >> 24; | |
} | |
} | |
//Trivial Count | |
int trivialcount() | |
{ | |
int i, j, k; | |
for(i = 0; i < NO_INSTRUCTIONS; i++) | |
{ | |
j = 0, k = i; | |
while(k > 0){ | |
if (k % 2 == 1) j++; | |
k /= 2; | |
} | |
} | |
} | |
//Wegner Algorithm | |
int wegner() { | |
int i, j, k; | |
for(i = 0; i < NO_INSTRUCTIONS; i++) | |
{ | |
k = i; | |
uint64_t count; | |
for (count=0; k; count++) | |
k &= k - 1; | |
} | |
} | |
int main() | |
{ | |
unsigned long long start, stop; | |
printf("GCC built in algorithm:\n"); | |
TIME(builtincount); | |
printf("Intel SSE 4.2 instructions:\n"); | |
TIME(intelpopcnt); | |
printf("Wegner Algorithm:\n"); | |
TIME(wegner); | |
printf("SWAR Algorithm:\n"); | |
TIME(swar); | |
printf("Trivial algorithm:\n"); | |
TIME(trivialcount); | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
.file "test.c" | |
.text | |
.p2align 4,,15 | |
.globl timer | |
.type timer, @function | |
timer: | |
.LFB643: | |
.cfi_startproc | |
pushq %rbx | |
.cfi_def_cfa_offset 16 | |
.cfi_offset 3, -16 | |
#APP | |
# 21 "test.c" 1 | |
xorl %eax, %eax | |
cpuid | |
rdtsc | |
# 0 "" 2 | |
#NO_APP | |
popq %rbx | |
.cfi_def_cfa_offset 8 | |
mov %eax, %esi | |
movq %rdx, %rax | |
salq $32, %rax | |
orq %rsi, %rax | |
ret | |
.cfi_endproc | |
.LFE643: | |
.size timer, .-timer | |
.p2align 4,,15 | |
.globl intelpopcnt | |
.type intelpopcnt, @function | |
intelpopcnt: | |
.LFB644: | |
.cfi_startproc | |
ret | |
.cfi_endproc | |
.LFE644: | |
.size intelpopcnt, .-intelpopcnt | |
.p2align 4,,15 | |
.globl builtincount | |
.type builtincount, @function | |
builtincount: | |
.LFB645: | |
.cfi_startproc | |
ret | |
.cfi_endproc | |
.LFE645: | |
.size builtincount, .-builtincount | |
.p2align 4,,15 | |
.globl swar | |
.type swar, @function | |
swar: | |
.LFB646: | |
.cfi_startproc | |
ret | |
.cfi_endproc | |
.LFE646: | |
.size swar, .-swar | |
.p2align 4,,15 | |
.globl trivialcount | |
.type trivialcount, @function | |
trivialcount: | |
.LFB647: | |
.cfi_startproc | |
movl no_instructions(%rip), %ecx | |
xorl %edx, %edx | |
testl %ecx, %ecx | |
jle .L11 | |
.L13: | |
addl $1, %edx | |
cmpl %ecx, %edx | |
je .L11 | |
.p2align 4,,10 | |
.p2align 3 | |
.L15: | |
testl %edx, %edx | |
jle .L13 | |
movl %edx, %eax | |
.p2align 4,,10 | |
.p2align 3 | |
.L8: | |
sarl %eax | |
jne .L8 | |
addl $1, %edx | |
cmpl %ecx, %edx | |
jne .L15 | |
.L11: | |
.p2align 4,,2 | |
rep | |
ret | |
.cfi_endproc | |
.LFE647: | |
.size trivialcount, .-trivialcount | |
.p2align 4,,15 | |
.globl wegner | |
.type wegner, @function | |
wegner: | |
.LFB648: | |
.cfi_startproc | |
movl no_instructions(%rip), %esi | |
xorl %ecx, %ecx | |
testl %esi, %esi | |
jle .L22 | |
.L24: | |
addl $1, %ecx | |
cmpl %esi, %ecx | |
je .L22 | |
.p2align 4,,10 | |
.p2align 3 | |
.L25: | |
testl %ecx, %ecx | |
je .L24 | |
movl %ecx, %eax | |
.p2align 4,,10 | |
.p2align 3 | |
.L19: | |
leal -1(%rax), %edx | |
andl %edx, %eax | |
jne .L19 | |
addl $1, %ecx | |
cmpl %esi, %ecx | |
jne .L25 | |
.L22: | |
rep | |
ret | |
.cfi_endproc | |
.LFE648: | |
.size wegner, .-wegner | |
.section .rodata.str1.8,"aMS",@progbits,1 | |
.align 8 | |
.LC0: | |
.string "Enter the number of instructions" | |
.section .rodata.str1.1,"aMS",@progbits,1 | |
.LC1: | |
.string "%d" | |
.LC2: | |
.string "GCC built in algorithm:" | |
.LC3: | |
.string "Cpu Cycles taken %lld\n" | |
.LC5: | |
.string "Time Taken(at 3.4Ghz) = %fs \n" | |
.LC6: | |
.string "---------- " | |
.LC7: | |
.string "Intel SSE 4.2 instructions:" | |
.LC8: | |
.string "Wegner Algorithm:" | |
.LC9: | |
.string "SWAR Algorithm:" | |
.LC10: | |
.string "Trivial algorithm:" | |
.section .text.startup,"ax",@progbits | |
.p2align 4,,15 | |
.globl main | |
.type main, @function | |
main: | |
.LFB649: | |
.cfi_startproc | |
pushq %rbx | |
.cfi_def_cfa_offset 16 | |
.cfi_offset 3, -16 | |
movl $.LC0, %edi | |
xorl %eax, %eax | |
call printf | |
movl $no_instructions, %esi | |
movl $.LC1, %edi | |
xorl %eax, %eax | |
call __isoc99_scanf | |
movl $.LC2, %edi | |
call puts | |
#APP | |
# 21 "test.c" 1 | |
xorl %eax, %eax | |
cpuid | |
rdtsc | |
# 0 "" 2 | |
#NO_APP | |
movl %edx, %r8d | |
mov %eax, %edi | |
#APP | |
# 21 "test.c" 1 | |
xorl %eax, %eax | |
cpuid | |
rdtsc | |
# 0 "" 2 | |
#NO_APP | |
movq %rdx, %rbx | |
mov %eax, %eax | |
salq $32, %r8 | |
salq $32, %rbx | |
orq %rdi, %r8 | |
movl $.LC3, %edi | |
orq %rax, %rbx | |
xorl %eax, %eax | |
subq %r8, %rbx | |
movq %rbx, %rsi | |
call printf | |
testq %rbx, %rbx | |
js .L27 | |
vcvtsi2sdq %rbx, %xmm0, %xmm0 | |
.L28: | |
movl $.LC5, %edi | |
movl $1, %eax | |
vdivsd .LC4(%rip), %xmm0, %xmm0 | |
call printf | |
movl $.LC6, %edi | |
call puts | |
movl $.LC7, %edi | |
call puts | |
#APP | |
# 21 "test.c" 1 | |
xorl %eax, %eax | |
cpuid | |
rdtsc | |
# 0 "" 2 | |
#NO_APP | |
movl %edx, %r8d | |
mov %eax, %edi | |
#APP | |
# 21 "test.c" 1 | |
xorl %eax, %eax | |
cpuid | |
rdtsc | |
# 0 "" 2 | |
#NO_APP | |
movq %rdx, %rbx | |
mov %eax, %eax | |
salq $32, %r8 | |
salq $32, %rbx | |
orq %rdi, %r8 | |
movl $.LC3, %edi | |
orq %rax, %rbx | |
xorl %eax, %eax | |
subq %r8, %rbx | |
movq %rbx, %rsi | |
call printf | |
testq %rbx, %rbx | |
js .L29 | |
vcvtsi2sdq %rbx, %xmm0, %xmm0 | |
.L30: | |
movl $.LC5, %edi | |
movl $1, %eax | |
vdivsd .LC4(%rip), %xmm0, %xmm0 | |
call printf | |
movl $.LC6, %edi | |
call puts | |
movl $.LC8, %edi | |
call puts | |
#APP | |
# 21 "test.c" 1 | |
xorl %eax, %eax | |
cpuid | |
rdtsc | |
# 0 "" 2 | |
#NO_APP | |
movl no_instructions(%rip), %esi | |
movq %rdx, %rdi | |
mov %eax, %eax | |
salq $32, %rdi | |
xorl %ecx, %ecx | |
orq %rax, %rdi | |
testl %esi, %esi | |
jle .L32 | |
.L58: | |
addl $1, %ecx | |
cmpl %esi, %ecx | |
je .L32 | |
.p2align 4,,10 | |
.p2align 3 | |
.L59: | |
testl %ecx, %ecx | |
je .L58 | |
movl %ecx, %eax | |
.p2align 4,,10 | |
.p2align 3 | |
.L33: | |
leal -1(%rax), %edx | |
andl %edx, %eax | |
jne .L33 | |
addl $1, %ecx | |
cmpl %esi, %ecx | |
jne .L59 | |
.L32: | |
#APP | |
# 21 "test.c" 1 | |
xorl %eax, %eax | |
cpuid | |
rdtsc | |
# 0 "" 2 | |
#NO_APP | |
movq %rdx, %rbx | |
mov %eax, %eax | |
salq $32, %rbx | |
orq %rax, %rbx | |
xorl %eax, %eax | |
subq %rdi, %rbx | |
movl $.LC3, %edi | |
movq %rbx, %rsi | |
call printf | |
testq %rbx, %rbx | |
js .L36 | |
vcvtsi2sdq %rbx, %xmm0, %xmm0 | |
.L37: | |
movl $.LC5, %edi | |
movl $1, %eax | |
vdivsd .LC4(%rip), %xmm0, %xmm0 | |
call printf | |
movl $.LC6, %edi | |
call puts | |
movl $.LC9, %edi | |
call puts | |
#APP | |
# 21 "test.c" 1 | |
xorl %eax, %eax | |
cpuid | |
rdtsc | |
# 0 "" 2 | |
#NO_APP | |
movl %edx, %r8d | |
mov %eax, %edi | |
#APP | |
# 21 "test.c" 1 | |
xorl %eax, %eax | |
cpuid | |
rdtsc | |
# 0 "" 2 | |
#NO_APP | |
movq %rdx, %rbx | |
mov %eax, %eax | |
salq $32, %r8 | |
salq $32, %rbx | |
orq %rdi, %r8 | |
movl $.LC3, %edi | |
orq %rax, %rbx | |
xorl %eax, %eax | |
subq %r8, %rbx | |
movq %rbx, %rsi | |
call printf | |
testq %rbx, %rbx | |
js .L38 | |
vcvtsi2sdq %rbx, %xmm0, %xmm0 | |
.L39: | |
movl $.LC5, %edi | |
movl $1, %eax | |
vdivsd .LC4(%rip), %xmm0, %xmm0 | |
call printf | |
movl $.LC6, %edi | |
call puts | |
movl $.LC10, %edi | |
call puts | |
#APP | |
# 21 "test.c" 1 | |
xorl %eax, %eax | |
cpuid | |
rdtsc | |
# 0 "" 2 | |
#NO_APP | |
movl no_instructions(%rip), %ecx | |
movq %rdx, %rdi | |
mov %eax, %eax | |
salq $32, %rdi | |
xorl %edx, %edx | |
orq %rax, %rdi | |
testl %ecx, %ecx | |
jle .L41 | |
.L57: | |
addl $1, %edx | |
cmpl %ecx, %edx | |
je .L41 | |
.p2align 4,,10 | |
.p2align 3 | |
.L60: | |
testl %edx, %edx | |
jle .L57 | |
movl %edx, %eax | |
.p2align 4,,10 | |
.p2align 3 | |
.L42: | |
sarl %eax | |
jne .L42 | |
addl $1, %edx | |
cmpl %ecx, %edx | |
jne .L60 | |
.L41: | |
#APP | |
# 21 "test.c" 1 | |
xorl %eax, %eax | |
cpuid | |
rdtsc | |
# 0 "" 2 | |
#NO_APP | |
movq %rdx, %rbx | |
mov %eax, %eax | |
salq $32, %rbx | |
orq %rax, %rbx | |
xorl %eax, %eax | |
subq %rdi, %rbx | |
movl $.LC3, %edi | |
movq %rbx, %rsi | |
call printf | |
testq %rbx, %rbx | |
js .L45 | |
vcvtsi2sdq %rbx, %xmm0, %xmm0 | |
.L46: | |
movl $.LC5, %edi | |
movl $1, %eax | |
vdivsd .LC4(%rip), %xmm0, %xmm0 | |
call printf | |
movl $.LC6, %edi | |
popq %rbx | |
.cfi_remember_state | |
.cfi_def_cfa_offset 8 | |
jmp puts | |
.L27: | |
.cfi_restore_state | |
movq %rbx, %rax | |
andl $1, %ebx | |
shrq %rax | |
orq %rbx, %rax | |
vcvtsi2sdq %rax, %xmm0, %xmm0 | |
vaddsd %xmm0, %xmm0, %xmm0 | |
jmp .L28 | |
.L45: | |
movq %rbx, %rax | |
andl $1, %ebx | |
shrq %rax | |
orq %rbx, %rax | |
vcvtsi2sdq %rax, %xmm0, %xmm0 | |
vaddsd %xmm0, %xmm0, %xmm0 | |
jmp .L46 | |
.L38: | |
movq %rbx, %rax | |
andl $1, %ebx | |
shrq %rax | |
orq %rbx, %rax | |
vcvtsi2sdq %rax, %xmm0, %xmm0 | |
vaddsd %xmm0, %xmm0, %xmm0 | |
jmp .L39 | |
.L36: | |
movq %rbx, %rax | |
andl $1, %ebx | |
shrq %rax | |
orq %rbx, %rax | |
vcvtsi2sdq %rax, %xmm0, %xmm0 | |
vaddsd %xmm0, %xmm0, %xmm0 | |
jmp .L37 | |
.L29: | |
movq %rbx, %rax | |
andl $1, %ebx | |
shrq %rax | |
orq %rbx, %rax | |
vcvtsi2sdq %rax, %xmm0, %xmm0 | |
vaddsd %xmm0, %xmm0, %xmm0 | |
jmp .L30 | |
.cfi_endproc | |
.LFE649: | |
.size main, .-main | |
.comm no_instructions,4,16 | |
.section .rodata.cst8,"aM",@progbits,8 | |
.align 8 | |
.LC4: | |
.long 3019898880 | |
.long 1105815998 | |
.ident "GCC: (GNU) 4.6.2" | |
.section .note.GNU-stack,"",@progbits |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
1.) gcc hammingweight.c -msse4.2 -O3 -o test | |
GCC built in algorithm: | |
Cpu Cycles taken 112 | |
Time Taken(at 3.4Ghz) = 0.000000s | |
---------- | |
Intel SSE 4.2 instructions: | |
Cpu Cycles taken 108 | |
Time Taken(at 3.4Ghz) = 0.000000s | |
---------- | |
Wegner Algorithm: | |
Cpu Cycles taken 35458726968 | |
Time Taken(at 3.4Ghz) = 10.395759s | |
---------- | |
SWAR Algorithm: | |
Cpu Cycles taken 112 | |
Time Taken(at 3.4Ghz) = 0.000000s | |
---------- | |
Trivial algorithm: | |
Cpu Cycles taken 57869795080 | |
Time Taken(at 3.4Ghz) = 16.966216s | |
2.) gcc hammingweight.c -msse4.2 -O0 -o test | |
GCC built in algorithm: | |
Cpu Cycles taken 7171683088 | |
Time Taken(at 3.4Ghz) = 2.102588s | |
---------- | |
Intel SSE 4.2 instructions: | |
Cpu Cycles taken 8513914876 | |
Time Taken(at 3.4Ghz) = 2.496102s | |
---------- | |
Wegner Algorithm: | |
Cpu Cycles taken 109668089404 | |
Time Taken(at 3.4Ghz) = 32.152395s | |
---------- | |
SWAR Algorithm: | |
Cpu Cycles taken 14472015088 | |
Time Taken(at 3.4Ghz) = 4.242893s | |
---------- | |
Trivial algorithm: | |
Cpu Cycles taken 515784137756 | |
Time Taken(at 3.4Ghz) = 151.217144s | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment