Skip to content

Instantly share code, notes, and snippets.

@sa1
Created November 11, 2011 22:04
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sa1/1359457 to your computer and use it in GitHub Desktop.
Save sa1/1359457 to your computer and use it in GitHub Desktop.
Hamming Weight Timer for 64 bit Intel PCs (SSE 4.2)
// Hamming Weight timer on 64 bit PCs.
#include <stdio.h>
#include <smmintrin.h>
#include <stdint.h>
#include <limits.h>
#define PROC_SPEED 3410884000.0
#define NO_INSTRUCTIONS 1000000000
#define TIME(function) do { \
start = timer(); \
function(); \
stop = timer(); \
printf("Cpu Cycles taken %lld\n",stop-start); \
printf("Time Taken(at 3.4Ghz) = %fs \n", (stop-start)/PROC_SPEED); \
printf("---------- \n"); \
} while (0)
inline uint64_t timer() {
uint32_t lo, hi;
__asm__ __volatile__ (
"xorl %%eax, %%eax\n"
"cpuid\n"
"rdtsc\n"
: "=a" (lo), "=d" (hi)
:
: "%ebx", "%ecx");
return (uint64_t)hi << 32 | lo;
}
//Intel 64 bit SSE4.2(Nehalem onwards) popcnt instruction
int intelpopcnt()
{
int i, j;
for(i = 0; i < NO_INSTRUCTIONS; i++)
{
j = _mm_popcnt_u64(i);
}
}
//GCC built in count.
int builtincount()
{
int i, j;
for(i = 0; i < NO_INSTRUCTIONS; i++)
{
j = __builtin_popcount(i);
}
}
//Swar Algorithm
int swar()
{
int i, j, k;
for(k = 0; k < NO_INSTRUCTIONS; k++)
{
i = k;
i = i - ((i >> 1) & 0x55555555);
i = (i & 0x33333333) + ((i >> 2) & 0x33333333);
j = (((i + (i >> 4)) & 0x0F0F0F0F) * 0x01010101) >> 24;
}
}
//Trivial Count
int trivialcount()
{
int i, j, k;
for(i = 0; i < NO_INSTRUCTIONS; i++)
{
j = 0, k = i;
while(k > 0){
if (k % 2 == 1) j++;
k /= 2;
}
}
}
//Wegner Algorithm
int wegner() {
int i, j, k;
for(i = 0; i < NO_INSTRUCTIONS; i++)
{
k = i;
uint64_t count;
for (count=0; k; count++)
k &= k - 1;
}
}
int main()
{
unsigned long long start, stop;
printf("GCC built in algorithm:\n");
TIME(builtincount);
printf("Intel SSE 4.2 instructions:\n");
TIME(intelpopcnt);
printf("Wegner Algorithm:\n");
TIME(wegner);
printf("SWAR Algorithm:\n");
TIME(swar);
printf("Trivial algorithm:\n");
TIME(trivialcount);
}
.file "test.c"
.text
.p2align 4,,15
.globl timer
.type timer, @function
timer:
.LFB643:
.cfi_startproc
pushq %rbx
.cfi_def_cfa_offset 16
.cfi_offset 3, -16
#APP
# 21 "test.c" 1
xorl %eax, %eax
cpuid
rdtsc
# 0 "" 2
#NO_APP
popq %rbx
.cfi_def_cfa_offset 8
mov %eax, %esi
movq %rdx, %rax
salq $32, %rax
orq %rsi, %rax
ret
.cfi_endproc
.LFE643:
.size timer, .-timer
.p2align 4,,15
.globl intelpopcnt
.type intelpopcnt, @function
intelpopcnt:
.LFB644:
.cfi_startproc
ret
.cfi_endproc
.LFE644:
.size intelpopcnt, .-intelpopcnt
.p2align 4,,15
.globl builtincount
.type builtincount, @function
builtincount:
.LFB645:
.cfi_startproc
ret
.cfi_endproc
.LFE645:
.size builtincount, .-builtincount
.p2align 4,,15
.globl swar
.type swar, @function
swar:
.LFB646:
.cfi_startproc
ret
.cfi_endproc
.LFE646:
.size swar, .-swar
.p2align 4,,15
.globl trivialcount
.type trivialcount, @function
trivialcount:
.LFB647:
.cfi_startproc
movl no_instructions(%rip), %ecx
xorl %edx, %edx
testl %ecx, %ecx
jle .L11
.L13:
addl $1, %edx
cmpl %ecx, %edx
je .L11
.p2align 4,,10
.p2align 3
.L15:
testl %edx, %edx
jle .L13
movl %edx, %eax
.p2align 4,,10
.p2align 3
.L8:
sarl %eax
jne .L8
addl $1, %edx
cmpl %ecx, %edx
jne .L15
.L11:
.p2align 4,,2
rep
ret
.cfi_endproc
.LFE647:
.size trivialcount, .-trivialcount
.p2align 4,,15
.globl wegner
.type wegner, @function
wegner:
.LFB648:
.cfi_startproc
movl no_instructions(%rip), %esi
xorl %ecx, %ecx
testl %esi, %esi
jle .L22
.L24:
addl $1, %ecx
cmpl %esi, %ecx
je .L22
.p2align 4,,10
.p2align 3
.L25:
testl %ecx, %ecx
je .L24
movl %ecx, %eax
.p2align 4,,10
.p2align 3
.L19:
leal -1(%rax), %edx
andl %edx, %eax
jne .L19
addl $1, %ecx
cmpl %esi, %ecx
jne .L25
.L22:
rep
ret
.cfi_endproc
.LFE648:
.size wegner, .-wegner
.section .rodata.str1.8,"aMS",@progbits,1
.align 8
.LC0:
.string "Enter the number of instructions"
.section .rodata.str1.1,"aMS",@progbits,1
.LC1:
.string "%d"
.LC2:
.string "GCC built in algorithm:"
.LC3:
.string "Cpu Cycles taken %lld\n"
.LC5:
.string "Time Taken(at 3.4Ghz) = %fs \n"
.LC6:
.string "---------- "
.LC7:
.string "Intel SSE 4.2 instructions:"
.LC8:
.string "Wegner Algorithm:"
.LC9:
.string "SWAR Algorithm:"
.LC10:
.string "Trivial algorithm:"
.section .text.startup,"ax",@progbits
.p2align 4,,15
.globl main
.type main, @function
main:
.LFB649:
.cfi_startproc
pushq %rbx
.cfi_def_cfa_offset 16
.cfi_offset 3, -16
movl $.LC0, %edi
xorl %eax, %eax
call printf
movl $no_instructions, %esi
movl $.LC1, %edi
xorl %eax, %eax
call __isoc99_scanf
movl $.LC2, %edi
call puts
#APP
# 21 "test.c" 1
xorl %eax, %eax
cpuid
rdtsc
# 0 "" 2
#NO_APP
movl %edx, %r8d
mov %eax, %edi
#APP
# 21 "test.c" 1
xorl %eax, %eax
cpuid
rdtsc
# 0 "" 2
#NO_APP
movq %rdx, %rbx
mov %eax, %eax
salq $32, %r8
salq $32, %rbx
orq %rdi, %r8
movl $.LC3, %edi
orq %rax, %rbx
xorl %eax, %eax
subq %r8, %rbx
movq %rbx, %rsi
call printf
testq %rbx, %rbx
js .L27
vcvtsi2sdq %rbx, %xmm0, %xmm0
.L28:
movl $.LC5, %edi
movl $1, %eax
vdivsd .LC4(%rip), %xmm0, %xmm0
call printf
movl $.LC6, %edi
call puts
movl $.LC7, %edi
call puts
#APP
# 21 "test.c" 1
xorl %eax, %eax
cpuid
rdtsc
# 0 "" 2
#NO_APP
movl %edx, %r8d
mov %eax, %edi
#APP
# 21 "test.c" 1
xorl %eax, %eax
cpuid
rdtsc
# 0 "" 2
#NO_APP
movq %rdx, %rbx
mov %eax, %eax
salq $32, %r8
salq $32, %rbx
orq %rdi, %r8
movl $.LC3, %edi
orq %rax, %rbx
xorl %eax, %eax
subq %r8, %rbx
movq %rbx, %rsi
call printf
testq %rbx, %rbx
js .L29
vcvtsi2sdq %rbx, %xmm0, %xmm0
.L30:
movl $.LC5, %edi
movl $1, %eax
vdivsd .LC4(%rip), %xmm0, %xmm0
call printf
movl $.LC6, %edi
call puts
movl $.LC8, %edi
call puts
#APP
# 21 "test.c" 1
xorl %eax, %eax
cpuid
rdtsc
# 0 "" 2
#NO_APP
movl no_instructions(%rip), %esi
movq %rdx, %rdi
mov %eax, %eax
salq $32, %rdi
xorl %ecx, %ecx
orq %rax, %rdi
testl %esi, %esi
jle .L32
.L58:
addl $1, %ecx
cmpl %esi, %ecx
je .L32
.p2align 4,,10
.p2align 3
.L59:
testl %ecx, %ecx
je .L58
movl %ecx, %eax
.p2align 4,,10
.p2align 3
.L33:
leal -1(%rax), %edx
andl %edx, %eax
jne .L33
addl $1, %ecx
cmpl %esi, %ecx
jne .L59
.L32:
#APP
# 21 "test.c" 1
xorl %eax, %eax
cpuid
rdtsc
# 0 "" 2
#NO_APP
movq %rdx, %rbx
mov %eax, %eax
salq $32, %rbx
orq %rax, %rbx
xorl %eax, %eax
subq %rdi, %rbx
movl $.LC3, %edi
movq %rbx, %rsi
call printf
testq %rbx, %rbx
js .L36
vcvtsi2sdq %rbx, %xmm0, %xmm0
.L37:
movl $.LC5, %edi
movl $1, %eax
vdivsd .LC4(%rip), %xmm0, %xmm0
call printf
movl $.LC6, %edi
call puts
movl $.LC9, %edi
call puts
#APP
# 21 "test.c" 1
xorl %eax, %eax
cpuid
rdtsc
# 0 "" 2
#NO_APP
movl %edx, %r8d
mov %eax, %edi
#APP
# 21 "test.c" 1
xorl %eax, %eax
cpuid
rdtsc
# 0 "" 2
#NO_APP
movq %rdx, %rbx
mov %eax, %eax
salq $32, %r8
salq $32, %rbx
orq %rdi, %r8
movl $.LC3, %edi
orq %rax, %rbx
xorl %eax, %eax
subq %r8, %rbx
movq %rbx, %rsi
call printf
testq %rbx, %rbx
js .L38
vcvtsi2sdq %rbx, %xmm0, %xmm0
.L39:
movl $.LC5, %edi
movl $1, %eax
vdivsd .LC4(%rip), %xmm0, %xmm0
call printf
movl $.LC6, %edi
call puts
movl $.LC10, %edi
call puts
#APP
# 21 "test.c" 1
xorl %eax, %eax
cpuid
rdtsc
# 0 "" 2
#NO_APP
movl no_instructions(%rip), %ecx
movq %rdx, %rdi
mov %eax, %eax
salq $32, %rdi
xorl %edx, %edx
orq %rax, %rdi
testl %ecx, %ecx
jle .L41
.L57:
addl $1, %edx
cmpl %ecx, %edx
je .L41
.p2align 4,,10
.p2align 3
.L60:
testl %edx, %edx
jle .L57
movl %edx, %eax
.p2align 4,,10
.p2align 3
.L42:
sarl %eax
jne .L42
addl $1, %edx
cmpl %ecx, %edx
jne .L60
.L41:
#APP
# 21 "test.c" 1
xorl %eax, %eax
cpuid
rdtsc
# 0 "" 2
#NO_APP
movq %rdx, %rbx
mov %eax, %eax
salq $32, %rbx
orq %rax, %rbx
xorl %eax, %eax
subq %rdi, %rbx
movl $.LC3, %edi
movq %rbx, %rsi
call printf
testq %rbx, %rbx
js .L45
vcvtsi2sdq %rbx, %xmm0, %xmm0
.L46:
movl $.LC5, %edi
movl $1, %eax
vdivsd .LC4(%rip), %xmm0, %xmm0
call printf
movl $.LC6, %edi
popq %rbx
.cfi_remember_state
.cfi_def_cfa_offset 8
jmp puts
.L27:
.cfi_restore_state
movq %rbx, %rax
andl $1, %ebx
shrq %rax
orq %rbx, %rax
vcvtsi2sdq %rax, %xmm0, %xmm0
vaddsd %xmm0, %xmm0, %xmm0
jmp .L28
.L45:
movq %rbx, %rax
andl $1, %ebx
shrq %rax
orq %rbx, %rax
vcvtsi2sdq %rax, %xmm0, %xmm0
vaddsd %xmm0, %xmm0, %xmm0
jmp .L46
.L38:
movq %rbx, %rax
andl $1, %ebx
shrq %rax
orq %rbx, %rax
vcvtsi2sdq %rax, %xmm0, %xmm0
vaddsd %xmm0, %xmm0, %xmm0
jmp .L39
.L36:
movq %rbx, %rax
andl $1, %ebx
shrq %rax
orq %rbx, %rax
vcvtsi2sdq %rax, %xmm0, %xmm0
vaddsd %xmm0, %xmm0, %xmm0
jmp .L37
.L29:
movq %rbx, %rax
andl $1, %ebx
shrq %rax
orq %rbx, %rax
vcvtsi2sdq %rax, %xmm0, %xmm0
vaddsd %xmm0, %xmm0, %xmm0
jmp .L30
.cfi_endproc
.LFE649:
.size main, .-main
.comm no_instructions,4,16
.section .rodata.cst8,"aM",@progbits,8
.align 8
.LC4:
.long 3019898880
.long 1105815998
.ident "GCC: (GNU) 4.6.2"
.section .note.GNU-stack,"",@progbits
1.) gcc hammingweight.c -msse4.2 -O3 -o test
GCC built in algorithm:
Cpu Cycles taken 112
Time Taken(at 3.4Ghz) = 0.000000s
----------
Intel SSE 4.2 instructions:
Cpu Cycles taken 108
Time Taken(at 3.4Ghz) = 0.000000s
----------
Wegner Algorithm:
Cpu Cycles taken 35458726968
Time Taken(at 3.4Ghz) = 10.395759s
----------
SWAR Algorithm:
Cpu Cycles taken 112
Time Taken(at 3.4Ghz) = 0.000000s
----------
Trivial algorithm:
Cpu Cycles taken 57869795080
Time Taken(at 3.4Ghz) = 16.966216s
2.) gcc hammingweight.c -msse4.2 -O0 -o test
GCC built in algorithm:
Cpu Cycles taken 7171683088
Time Taken(at 3.4Ghz) = 2.102588s
----------
Intel SSE 4.2 instructions:
Cpu Cycles taken 8513914876
Time Taken(at 3.4Ghz) = 2.496102s
----------
Wegner Algorithm:
Cpu Cycles taken 109668089404
Time Taken(at 3.4Ghz) = 32.152395s
----------
SWAR Algorithm:
Cpu Cycles taken 14472015088
Time Taken(at 3.4Ghz) = 4.242893s
----------
Trivial algorithm:
Cpu Cycles taken 515784137756
Time Taken(at 3.4Ghz) = 151.217144s
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment