Skip to content

Instantly share code, notes, and snippets.

@manodeep
Last active August 29, 2015 14:10
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save manodeep/05c65432cddf34dc11ea to your computer and use it in GitHub Desktop.
Save manodeep/05c65432cddf34dc11ea to your computer and use it in GitHub Desktop.
Wrapper C code to run the various optimizations outlined here http://blogs.msdn.com/b/oldnewthing/archive/2014/12/01/10576992.aspx
################################################################################################################################################################################################
# Function compiler -O0 -O3 -O3 -Ofast -O3 -funroll-loops -O3 -Ofast -funroll-loops -O3 -Ofast -funroll-all-loops
################################################################################################################################################################################################
countthem gcc 1.000x 1.000x 1.000x 1.000x 1.000x 1.000x
countthem_ternary gcc 1.669x 3.614x 4.397x 5.271x 6.009x 6.399x
countthem_simd gcc 3.414x 4.166x 4.273x 6.013x 7.117x 7.030x
countthem_simd_unroll4 gcc 3.608x 5.745x 5.543x 7.510x 7.600x 7.567x
countthem_simd_unroll5_gt gcc 3.731x 6.659x 7.444x 6.223x 7.413x 7.389x
countthem_simd_unroll5_add gcc 3.572x 7.220x 7.596x 7.285x 6.368x 6.510x
countthem_simd_unroll4_add gcc 3.726x 7.095x 7.556x 7.619x 6.079x 6.112x
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
countthem clang 1.000x 1.000x 1.000x 1.000x 1.000x 1.000x
countthem_ternary clang 1.823x 0.947x 1.109x 1.035x 0.994x 1.032x
countthem_simd clang 4.094x 1.034x 1.188x 1.022x 1.204x 0.944x
countthem_simd_unroll4 clang 4.399x 1.118x 1.376x 1.206x 1.354x 1.215x
countthem_simd_unroll5_gt clang 5.151x 1.207x 1.423x 1.154x 1.321x 1.286x
countthem_simd_unroll5_add clang 5.085x 1.240x 1.221x 1.175x 1.309x 1.264x
countthem_simd_unroll4_add clang 4.728x 1.099x 1.377x 1.058x 1.398x 1.269x
################################################################################################################################################################################################
/*
Wrapper C code to run the various optimizations outlined here
http://blogs.msdn.com/b/oldnewthing/archive/2014/12/01/10576992.aspx
Compile: $(CC) -std=c99 counting.c -o counting [-O3]
Running: ./counting
Output: Shows the various functions, and the improvement
over the base case for different compiler options.
Observations: With gcc and -O3, I see up to 6x speedup.
With clang and -O3, I see max. 1.25x speedup.
Written by: Manodeep Sinha (Dec 2, 2014)
*/
#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <math.h>
#include <sys/time.h>
#include <xmmintrin.h>
#define NMAX 100000
int array[NMAX];
#define ADD_DIFF_TIME(t1,t0) fabs((t1.tv_sec - t0.tv_sec) + 1e-6*(t1.tv_usec-t0.tv_usec))
#define MAXLEN 200
int countthem(int boundary)
{
int count = 0;
for (int i = 0; i < NMAX; i++) {
if (array[i] < boundary) count++;
}
return count;
}
int countthem_ternary(int boundary)
{
int count = 0;
for (int i = 0; i < NMAX; i++) {
count += (array[i] < boundary) ? 1:0;
}
return count;
}
int countthem_simd(int boundary)
{
__m128i *xarray = (__m128i*)array;
__m128i xboundary = _mm_set1_epi32(boundary);
__m128i count = _mm_setzero_si128();
for (int i = 0; i < NMAX/4 ; i++) {
__m128i value = _mm_loadu_si128(&xarray[i]);
__m128i test = _mm_cmplt_epi32(value, xboundary);
count = _mm_sub_epi32(count, test);
}
__m128i shuffle1 = _mm_shuffle_epi32(count, _MM_SHUFFLE(1, 0, 3, 2));
count = _mm_add_epi32(count, shuffle1);
__m128i shuffle2 = _mm_shuffle_epi32(count, _MM_SHUFFLE(2, 3, 0, 1));
count = _mm_add_epi32(count, shuffle2);
return _mm_cvtsi128_si32(count);
}
int countthem_simd_unroll4(int boundary)
{
#define GETVALUE(n) __m128i value##n = _mm_loadu_si128(&xarray[i+n])
#define GETTEST(n) __m128i test##n = _mm_cmplt_epi32(value##n, xboundary)
#define GETCOUNT(n) count = _mm_sub_epi32(count, test##n)
__m128i *xarray = (__m128i*)array;
__m128i xboundary = _mm_set1_epi32(boundary);
__m128i count = _mm_setzero_si128();
for (int i = 0; i < NMAX/4; i += 4) {
GETVALUE(0); GETVALUE(1); GETVALUE(2); GETVALUE(3);
GETTEST(0); GETTEST(1); GETTEST(2); GETTEST(3);
GETCOUNT(0); GETCOUNT(1); GETCOUNT(2); GETCOUNT(3);
}
__m128i shuffle1 = _mm_shuffle_epi32(count, _MM_SHUFFLE(1, 0, 3, 2));
count = _mm_add_epi32(count, shuffle1);
__m128i shuffle2 = _mm_shuffle_epi32(count, _MM_SHUFFLE(2, 3, 0, 1));
count = _mm_add_epi32(count, shuffle2);
return _mm_cvtsi128_si32(count);
#undef GETVALUE
#undef GETTEST
#undef GETCOUNT
}
int countthem_simd_unroll5_gt(int boundary)
{
#define GETVALUE(n) __m128i value##n = _mm_loadu_si128(&xarray[i+n])
#define GETTEST(n) __m128i test##n = _mm_cmpgt_epi32(value##n, xboundary1)
#define GETCOUNT(n) count = _mm_sub_epi32(count, test##n)
__m128i *xarray = (__m128i*)array;
__m128i xboundary1 = _mm_set1_epi32(boundary - 1);
__m128i count = _mm_setzero_si128();
for (int i = 0; i < NMAX / 4; i += 5) {
GETVALUE(0); GETVALUE(1); GETVALUE(2); GETVALUE(3); GETVALUE(4);
GETTEST(0); GETTEST(1); GETTEST(2); GETTEST(3); GETTEST(4);
GETCOUNT(0); GETCOUNT(1); GETCOUNT(2); GETCOUNT(3); GETCOUNT(4);
}
__m128i shuffle1 = _mm_shuffle_epi32(count, _MM_SHUFFLE(1, 0, 3, 2));
count = _mm_add_epi32(count, shuffle1);
__m128i shuffle2 = _mm_shuffle_epi32(count, _MM_SHUFFLE(2, 3, 0, 1));
count = _mm_add_epi32(count, shuffle2);
return NMAX - _mm_cvtsi128_si32(count);
#undef GETVALUE
#undef GETTEST
#undef GETCOUNT
}
int countthem_simd_unroll5_add(int boundary)
{
#define GETVALUE(n) __m128i value##n = _mm_loadu_si128(&xarray[i+n])
#define GETTEST(n) __m128i test##n = _mm_cmpgt_epi32(value##n, xboundary1)
#define GETCOUNT(n) count = _mm_add_epi32(count, test##n)
__m128i *xarray = (__m128i*)array;
__m128i xboundary1 = _mm_set1_epi32(boundary - 1);
__m128i count = _mm_setzero_si128();
for (int i = 0; i < NMAX / 4; i += 5) {
GETVALUE(0); GETVALUE(1); GETVALUE(2); GETVALUE(3); GETVALUE(4);
GETTEST(0); GETTEST(1); GETTEST(2); GETTEST(3); GETTEST(4);
GETCOUNT(0); GETCOUNT(1); GETCOUNT(2); GETCOUNT(3); GETCOUNT(4);
}
__m128i shuffle1 = _mm_shuffle_epi32(count, _MM_SHUFFLE(1, 0, 3, 2));
count = _mm_add_epi32(count, shuffle1);
__m128i shuffle2 = _mm_shuffle_epi32(count, _MM_SHUFFLE(2, 3, 0, 1));
count = _mm_add_epi32(count, shuffle2);
return NMAX + _mm_cvtsi128_si32(count);
#undef GETVALUE
#undef GETTEST
#undef GETCOUNT
}
int countthem_simd_unroll4_add(int boundary)
{
#define GETVALUE(n) __m128i value##n = _mm_loadu_si128(&xarray[i+n])
#define GETTEST(n) __m128i test##n = _mm_cmpgt_epi32(value##n, xboundary1)
#define GETCOUNT(n) count = _mm_add_epi32(count, test##n)
__m128i *xarray = (__m128i*)array;
__m128i xboundary1 = _mm_set1_epi32(boundary - 1);
__m128i count = _mm_setzero_si128();
for (int i = 0; i < NMAX / 4; i += 4) {
GETVALUE(0); GETVALUE(1); GETVALUE(2); GETVALUE(3);
GETTEST(0); GETTEST(1); GETTEST(2); GETTEST(3);
GETCOUNT(0); GETCOUNT(1); GETCOUNT(2); GETCOUNT(3);
}
__m128i shuffle1 = _mm_shuffle_epi32(count, _MM_SHUFFLE(1, 0, 3, 2));
count = _mm_add_epi32(count, shuffle1);
__m128i shuffle2 = _mm_shuffle_epi32(count, _MM_SHUFFLE(2, 3, 0, 1));
count = _mm_add_epi32(count, shuffle2);
return NMAX + _mm_cvtsi128_si32(count);
#undef GETVALUE
#undef GETTEST
#undef GETCOUNT
}
double run_them(int (*func)(const int))
{
volatile int count = 0;
double total_time = 0.0;
const int max_boundary = 11;
const int max_iterations=100;
for (int boundary = 0; boundary < max_boundary ; boundary++) {
struct timeval t0,t1;
gettimeofday(&t0,NULL);
volatile int count = 0;
for(int iterations=0;iterations < max_iterations;iterations++) {
count += (*func)(boundary);
}
gettimeofday(&t1,NULL);
total_time += ADD_DIFF_TIME(t1,t0);
}
double avg_time = total_time / max_iterations;
return avg_time;
}
int main(int argc,char **argv)
{
const char allfunction_names[][MAXLEN] = {"countthem","countthem_ternary", "countthem_simd", "countthem_simd_unroll4", "countthem_simd_unroll5_gt", "countthem_simd_unroll5_add", "countthem_simd_unroll4_add"};
const int ntests = sizeof(allfunction_names)/(sizeof(char)*MAXLEN);
int (*allfunctions[]) (int) = {countthem,countthem_ternary, countthem_simd, countthem_simd_unroll4, countthem_simd_unroll5_gt, countthem_simd_unroll5_add, countthem_simd_unroll4_add};
for (int i = 0; i < NMAX; i++) array[i] = rand() % 10;
for(int i=0;i<ntests;i++) {
double this_case = run_them(allfunctions[i]);
double base_case;
if (i==0) base_case = this_case;
//fprintf(stdout,"%-40s %12.6e %10.3lfx\n", allfunction_names[i], this_case, base_case/this_case);
fprintf(stdout,"%-40s %10.3lfx\n", allfunction_names[i], base_case/this_case);
}
fflush(stdout);
return EXIT_SUCCESS;
}
#!/bin/bash
compilers=('gcc' 'clang')
compiler_options=('-O0' '-O3' '-O3 -Ofast' '-O3 -funroll-loops' '-O3 -Ofast -funroll-loops' '-O3 -Ofast -funroll-all-loops')
base='counting'
outfile='out.txt'
tmpfile='tmp.txt'
tmpfile1='yy.txt'
tmpfile2='xx.txt'
benchfile='bench.txt'
#### Note that next line truncates the benchmark file
printf "" > $benchfile
totlen=44
for i in $(seq 1 $totlen); do
printf %c "#" >> $benchfile
done
for options in "${compiler_options[@]}"; do
len=${#options}
totlen=$[$totlen+$len]
for i in $(seq 1 $len); do
printf %c "#" >> $benchfile
done
printf "##########" >> $benchfile
totlen=$((totlen+10))
done
printf "\n" >> $benchfile
printf "# Function compiler " >> $benchfile
for options in "${compiler_options[@]}"; do
printf '%s %12s' "$options" " " >> $benchfile
done
printf "\n" >> $benchfile
for i in $(seq 1 $totlen); do
printf %c '#' >> $benchfile
done
printf "\n" >> $benchfile
last=${compilers[@]:(-1)}
for compiler in "${compilers[@]}"; do
index=0
## truncate the temporary file (no headers, only contains function, compiler and improvements
printf %c "" > $tmpfile
for options in "${compiler_options[@]}"; do
### Compile the code
execstring="$compiler -std=c99 -o $base $base.c $options"
eval $execstring
### Run the code. Redirect output to some temporary file (truncate)
execstring="./$base > $outfile"
eval $execstring
### Now create the new column
if [ $index -gt 0 ]; then
printf "" > $tmpfile1
while read -r line
do
a=( $line )
len=${#options}
printf '%5s%*s' " " $((len+5)) ${a[1]} >> $tmpfile1
printf "\n" >> $tmpfile1
done < "$outfile"
paste -d ' ' $tmpfile $tmpfile1 > $tmpfile2
execstring="mv $tmpfile2 $tmpfile"
eval $execstring
else
while read -r line
do
a=( $line )
len=${#options}
printf '%-26s %6s %4s %*s' ${a[0]} "$compiler" " " $((len+5)) ${a[1]} >> $tmpfile
printf "\n" >> $tmpfile
done < "$outfile"
fi
index=$((index+1))
done
printf "\n" >>$tmpfile
#### Only write out this divider if it's not the last compiler option
if [ "$compiler" != "$last" ]; then
for i in $(seq 1 $totlen); do
printf %c "-" >> $tmpfile
done
printf "\n" >>$tmpfile
fi
execstring="cat $tmpfile >> $benchfile"
eval $execstring
done
execstring="rm -f $tmpfile $outfile $tmpfile1 $tmpfile2"
eval $execstring
### Output the final finishing line marker
for i in $(seq 1 $totlen); do
printf %c "#" >> $benchfile
done
printf "\n" >> $benchfile
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment