Last active
August 29, 2015 14:10
-
-
Save manodeep/05c65432cddf34dc11ea to your computer and use it in GitHub Desktop.
Wrapper C code to run the various optimizations outlined here http://blogs.msdn.com/b/oldnewthing/archive/2014/12/01/10576992.aspx
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
################################################################################################################################################################################################ | |
# Function compiler -O0 -O3 -O3 -Ofast -O3 -funroll-loops -O3 -Ofast -funroll-loops -O3 -Ofast -funroll-all-loops | |
################################################################################################################################################################################################ | |
countthem gcc 1.000x 1.000x 1.000x 1.000x 1.000x 1.000x | |
countthem_ternary gcc 1.669x 3.614x 4.397x 5.271x 6.009x 6.399x | |
countthem_simd gcc 3.414x 4.166x 4.273x 6.013x 7.117x 7.030x | |
countthem_simd_unroll4 gcc 3.608x 5.745x 5.543x 7.510x 7.600x 7.567x | |
countthem_simd_unroll5_gt gcc 3.731x 6.659x 7.444x 6.223x 7.413x 7.389x | |
countthem_simd_unroll5_add gcc 3.572x 7.220x 7.596x 7.285x 6.368x 6.510x | |
countthem_simd_unroll4_add gcc 3.726x 7.095x 7.556x 7.619x 6.079x 6.112x | |
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | |
countthem clang 1.000x 1.000x 1.000x 1.000x 1.000x 1.000x | |
countthem_ternary clang 1.823x 0.947x 1.109x 1.035x 0.994x 1.032x | |
countthem_simd clang 4.094x 1.034x 1.188x 1.022x 1.204x 0.944x | |
countthem_simd_unroll4 clang 4.399x 1.118x 1.376x 1.206x 1.354x 1.215x | |
countthem_simd_unroll5_gt clang 5.151x 1.207x 1.423x 1.154x 1.321x 1.286x | |
countthem_simd_unroll5_add clang 5.085x 1.240x 1.221x 1.175x 1.309x 1.264x | |
countthem_simd_unroll4_add clang 4.728x 1.099x 1.377x 1.058x 1.398x 1.269x | |
################################################################################################################################################################################################ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
Wrapper C code to run the various optimizations outlined here | |
http://blogs.msdn.com/b/oldnewthing/archive/2014/12/01/10576992.aspx | |
Compile: $(CC) -std=c99 counting.c -o counting [-O3] | |
Running: ./counting | |
Output: Shows the various functions, and the improvement | |
over the base case for different compiler options. | |
Observations: With gcc and -O3, I see up to 6x speedup. | |
With clang and -O3, I see max. 1.25x speedup. | |
Written by: Manodeep Sinha (Dec 2, 2014) | |
*/ | |
#include <stdlib.h> | |
#include <stdio.h> | |
#include <stdint.h> | |
#include <math.h> | |
#include <sys/time.h> | |
#include <xmmintrin.h> | |
#define NMAX 100000 | |
int array[NMAX]; | |
#define ADD_DIFF_TIME(t1,t0) fabs((t1.tv_sec - t0.tv_sec) + 1e-6*(t1.tv_usec-t0.tv_usec)) | |
#define MAXLEN 200 | |
int countthem(int boundary) | |
{ | |
int count = 0; | |
for (int i = 0; i < NMAX; i++) { | |
if (array[i] < boundary) count++; | |
} | |
return count; | |
} | |
int countthem_ternary(int boundary) | |
{ | |
int count = 0; | |
for (int i = 0; i < NMAX; i++) { | |
count += (array[i] < boundary) ? 1:0; | |
} | |
return count; | |
} | |
int countthem_simd(int boundary) | |
{ | |
__m128i *xarray = (__m128i*)array; | |
__m128i xboundary = _mm_set1_epi32(boundary); | |
__m128i count = _mm_setzero_si128(); | |
for (int i = 0; i < NMAX/4 ; i++) { | |
__m128i value = _mm_loadu_si128(&xarray[i]); | |
__m128i test = _mm_cmplt_epi32(value, xboundary); | |
count = _mm_sub_epi32(count, test); | |
} | |
__m128i shuffle1 = _mm_shuffle_epi32(count, _MM_SHUFFLE(1, 0, 3, 2)); | |
count = _mm_add_epi32(count, shuffle1); | |
__m128i shuffle2 = _mm_shuffle_epi32(count, _MM_SHUFFLE(2, 3, 0, 1)); | |
count = _mm_add_epi32(count, shuffle2); | |
return _mm_cvtsi128_si32(count); | |
} | |
int countthem_simd_unroll4(int boundary) | |
{ | |
#define GETVALUE(n) __m128i value##n = _mm_loadu_si128(&xarray[i+n]) | |
#define GETTEST(n) __m128i test##n = _mm_cmplt_epi32(value##n, xboundary) | |
#define GETCOUNT(n) count = _mm_sub_epi32(count, test##n) | |
__m128i *xarray = (__m128i*)array; | |
__m128i xboundary = _mm_set1_epi32(boundary); | |
__m128i count = _mm_setzero_si128(); | |
for (int i = 0; i < NMAX/4; i += 4) { | |
GETVALUE(0); GETVALUE(1); GETVALUE(2); GETVALUE(3); | |
GETTEST(0); GETTEST(1); GETTEST(2); GETTEST(3); | |
GETCOUNT(0); GETCOUNT(1); GETCOUNT(2); GETCOUNT(3); | |
} | |
__m128i shuffle1 = _mm_shuffle_epi32(count, _MM_SHUFFLE(1, 0, 3, 2)); | |
count = _mm_add_epi32(count, shuffle1); | |
__m128i shuffle2 = _mm_shuffle_epi32(count, _MM_SHUFFLE(2, 3, 0, 1)); | |
count = _mm_add_epi32(count, shuffle2); | |
return _mm_cvtsi128_si32(count); | |
#undef GETVALUE | |
#undef GETTEST | |
#undef GETCOUNT | |
} | |
int countthem_simd_unroll5_gt(int boundary) | |
{ | |
#define GETVALUE(n) __m128i value##n = _mm_loadu_si128(&xarray[i+n]) | |
#define GETTEST(n) __m128i test##n = _mm_cmpgt_epi32(value##n, xboundary1) | |
#define GETCOUNT(n) count = _mm_sub_epi32(count, test##n) | |
__m128i *xarray = (__m128i*)array; | |
__m128i xboundary1 = _mm_set1_epi32(boundary - 1); | |
__m128i count = _mm_setzero_si128(); | |
for (int i = 0; i < NMAX / 4; i += 5) { | |
GETVALUE(0); GETVALUE(1); GETVALUE(2); GETVALUE(3); GETVALUE(4); | |
GETTEST(0); GETTEST(1); GETTEST(2); GETTEST(3); GETTEST(4); | |
GETCOUNT(0); GETCOUNT(1); GETCOUNT(2); GETCOUNT(3); GETCOUNT(4); | |
} | |
__m128i shuffle1 = _mm_shuffle_epi32(count, _MM_SHUFFLE(1, 0, 3, 2)); | |
count = _mm_add_epi32(count, shuffle1); | |
__m128i shuffle2 = _mm_shuffle_epi32(count, _MM_SHUFFLE(2, 3, 0, 1)); | |
count = _mm_add_epi32(count, shuffle2); | |
return NMAX - _mm_cvtsi128_si32(count); | |
#undef GETVALUE | |
#undef GETTEST | |
#undef GETCOUNT | |
} | |
int countthem_simd_unroll5_add(int boundary) | |
{ | |
#define GETVALUE(n) __m128i value##n = _mm_loadu_si128(&xarray[i+n]) | |
#define GETTEST(n) __m128i test##n = _mm_cmpgt_epi32(value##n, xboundary1) | |
#define GETCOUNT(n) count = _mm_add_epi32(count, test##n) | |
__m128i *xarray = (__m128i*)array; | |
__m128i xboundary1 = _mm_set1_epi32(boundary - 1); | |
__m128i count = _mm_setzero_si128(); | |
for (int i = 0; i < NMAX / 4; i += 5) { | |
GETVALUE(0); GETVALUE(1); GETVALUE(2); GETVALUE(3); GETVALUE(4); | |
GETTEST(0); GETTEST(1); GETTEST(2); GETTEST(3); GETTEST(4); | |
GETCOUNT(0); GETCOUNT(1); GETCOUNT(2); GETCOUNT(3); GETCOUNT(4); | |
} | |
__m128i shuffle1 = _mm_shuffle_epi32(count, _MM_SHUFFLE(1, 0, 3, 2)); | |
count = _mm_add_epi32(count, shuffle1); | |
__m128i shuffle2 = _mm_shuffle_epi32(count, _MM_SHUFFLE(2, 3, 0, 1)); | |
count = _mm_add_epi32(count, shuffle2); | |
return NMAX + _mm_cvtsi128_si32(count); | |
#undef GETVALUE | |
#undef GETTEST | |
#undef GETCOUNT | |
} | |
int countthem_simd_unroll4_add(int boundary) | |
{ | |
#define GETVALUE(n) __m128i value##n = _mm_loadu_si128(&xarray[i+n]) | |
#define GETTEST(n) __m128i test##n = _mm_cmpgt_epi32(value##n, xboundary1) | |
#define GETCOUNT(n) count = _mm_add_epi32(count, test##n) | |
__m128i *xarray = (__m128i*)array; | |
__m128i xboundary1 = _mm_set1_epi32(boundary - 1); | |
__m128i count = _mm_setzero_si128(); | |
for (int i = 0; i < NMAX / 4; i += 4) { | |
GETVALUE(0); GETVALUE(1); GETVALUE(2); GETVALUE(3); | |
GETTEST(0); GETTEST(1); GETTEST(2); GETTEST(3); | |
GETCOUNT(0); GETCOUNT(1); GETCOUNT(2); GETCOUNT(3); | |
} | |
__m128i shuffle1 = _mm_shuffle_epi32(count, _MM_SHUFFLE(1, 0, 3, 2)); | |
count = _mm_add_epi32(count, shuffle1); | |
__m128i shuffle2 = _mm_shuffle_epi32(count, _MM_SHUFFLE(2, 3, 0, 1)); | |
count = _mm_add_epi32(count, shuffle2); | |
return NMAX + _mm_cvtsi128_si32(count); | |
#undef GETVALUE | |
#undef GETTEST | |
#undef GETCOUNT | |
} | |
double run_them(int (*func)(const int)) | |
{ | |
volatile int count = 0; | |
double total_time = 0.0; | |
const int max_boundary = 11; | |
const int max_iterations=100; | |
for (int boundary = 0; boundary < max_boundary ; boundary++) { | |
struct timeval t0,t1; | |
gettimeofday(&t0,NULL); | |
volatile int count = 0; | |
for(int iterations=0;iterations < max_iterations;iterations++) { | |
count += (*func)(boundary); | |
} | |
gettimeofday(&t1,NULL); | |
total_time += ADD_DIFF_TIME(t1,t0); | |
} | |
double avg_time = total_time / max_iterations; | |
return avg_time; | |
} | |
int main(int argc,char **argv) | |
{ | |
const char allfunction_names[][MAXLEN] = {"countthem","countthem_ternary", "countthem_simd", "countthem_simd_unroll4", "countthem_simd_unroll5_gt", "countthem_simd_unroll5_add", "countthem_simd_unroll4_add"}; | |
const int ntests = sizeof(allfunction_names)/(sizeof(char)*MAXLEN); | |
int (*allfunctions[]) (int) = {countthem,countthem_ternary, countthem_simd, countthem_simd_unroll4, countthem_simd_unroll5_gt, countthem_simd_unroll5_add, countthem_simd_unroll4_add}; | |
for (int i = 0; i < NMAX; i++) array[i] = rand() % 10; | |
for(int i=0;i<ntests;i++) { | |
double this_case = run_them(allfunctions[i]); | |
double base_case; | |
if (i==0) base_case = this_case; | |
//fprintf(stdout,"%-40s %12.6e %10.3lfx\n", allfunction_names[i], this_case, base_case/this_case); | |
fprintf(stdout,"%-40s %10.3lfx\n", allfunction_names[i], base_case/this_case); | |
} | |
fflush(stdout); | |
return EXIT_SUCCESS; | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
compilers=('gcc' 'clang') | |
compiler_options=('-O0' '-O3' '-O3 -Ofast' '-O3 -funroll-loops' '-O3 -Ofast -funroll-loops' '-O3 -Ofast -funroll-all-loops') | |
base='counting' | |
outfile='out.txt' | |
tmpfile='tmp.txt' | |
tmpfile1='yy.txt' | |
tmpfile2='xx.txt' | |
benchfile='bench.txt' | |
#### Note that next line truncates the benchmark file | |
printf "" > $benchfile | |
totlen=44 | |
for i in $(seq 1 $totlen); do | |
printf %c "#" >> $benchfile | |
done | |
for options in "${compiler_options[@]}"; do | |
len=${#options} | |
totlen=$[$totlen+$len] | |
for i in $(seq 1 $len); do | |
printf %c "#" >> $benchfile | |
done | |
printf "##########" >> $benchfile | |
totlen=$((totlen+10)) | |
done | |
printf "\n" >> $benchfile | |
printf "# Function compiler " >> $benchfile | |
for options in "${compiler_options[@]}"; do | |
printf '%s %12s' "$options" " " >> $benchfile | |
done | |
printf "\n" >> $benchfile | |
for i in $(seq 1 $totlen); do | |
printf %c '#' >> $benchfile | |
done | |
printf "\n" >> $benchfile | |
last=${compilers[@]:(-1)} | |
for compiler in "${compilers[@]}"; do | |
index=0 | |
## truncate the temporary file (no headers, only contains function, compiler and improvements | |
printf %c "" > $tmpfile | |
for options in "${compiler_options[@]}"; do | |
### Compile the code | |
execstring="$compiler -std=c99 -o $base $base.c $options" | |
eval $execstring | |
### Run the code. Redirect output to some temporary file (truncate) | |
execstring="./$base > $outfile" | |
eval $execstring | |
### Now create the new column | |
if [ $index -gt 0 ]; then | |
printf "" > $tmpfile1 | |
while read -r line | |
do | |
a=( $line ) | |
len=${#options} | |
printf '%5s%*s' " " $((len+5)) ${a[1]} >> $tmpfile1 | |
printf "\n" >> $tmpfile1 | |
done < "$outfile" | |
paste -d ' ' $tmpfile $tmpfile1 > $tmpfile2 | |
execstring="mv $tmpfile2 $tmpfile" | |
eval $execstring | |
else | |
while read -r line | |
do | |
a=( $line ) | |
len=${#options} | |
printf '%-26s %6s %4s %*s' ${a[0]} "$compiler" " " $((len+5)) ${a[1]} >> $tmpfile | |
printf "\n" >> $tmpfile | |
done < "$outfile" | |
fi | |
index=$((index+1)) | |
done | |
printf "\n" >>$tmpfile | |
#### Only write out this divider if it's not the last compiler option | |
if [ "$compiler" != "$last" ]; then | |
for i in $(seq 1 $totlen); do | |
printf %c "-" >> $tmpfile | |
done | |
printf "\n" >>$tmpfile | |
fi | |
execstring="cat $tmpfile >> $benchfile" | |
eval $execstring | |
done | |
execstring="rm -f $tmpfile $outfile $tmpfile1 $tmpfile2" | |
eval $execstring | |
### Output the final finishing line marker | |
for i in $(seq 1 $totlen); do | |
printf %c "#" >> $benchfile | |
done | |
printf "\n" >> $benchfile | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment