Skip to content

Instantly share code, notes, and snippets.

@kuribas
Created August 7, 2017 19:04
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kuribas/ddd977968c3a56eac505901acb3e8eee to your computer and use it in GitHub Desktop.
Save kuribas/ddd977968c3a56eac505901acb3e8eee to your computer and use it in GitHub Desktop.
watertower problem in haskell, C and SSE
benchmarking rainfall/list
time 2.697 s (2.532 s .. 3.065 s)
0.998 R² (NaN R² .. 1.000 R²)
mean 2.771 s (2.712 s .. 2.800 s)
std dev 51.39 ms (0.0 s .. 51.77 ms)
variance introduced by outliers: 19% (moderately inflated)
benchmarking rainfall/unboxed vector
time 51.66 ms (51.59 ms .. 51.83 ms)
1.000 R² (1.000 R² .. 1.000 R²)
mean 19.44 ms (12.80 ms .. 24.12 ms)
std dev 6.520 ms (2.357 ms .. 8.719 ms)
variance introduced by outliers: 86% (severely inflated)
benchmarking rainfall/C
time 27.10 ms (27.06 ms .. 27.15 ms)
1.000 R² (1.000 R² .. 1.000 R²)
mean 11.23 ms (8.686 ms .. 13.19 ms)
std dev 3.755 ms (2.703 ms .. 5.055 ms)
variance introduced by outliers: 88% (severely inflated)
benchmarking rainfall/C sse
time 12.15 ms (12.13 ms .. 12.18 ms)
1.000 R² (1.000 R² .. 1.000 R²)
mean 3.530 ms (2.794 ms .. 4.123 ms)
std dev 1.059 ms (786.0 μs .. 1.469 ms)
variance introduced by outliers: 92% (severely inflated)
#include "smmintrin.h"
#include "malloc.h"
#include "stdlib.h"
#include "stdio.h"
#define MAX(x,y) (x > y ? x : y)
#define MIN(x,y) (x > y ? y : x)
unsigned int tower(unsigned int length, unsigned int *towers)
{
unsigned int *maxt = malloc(sizeof(unsigned int)*length);
int i, mx;
unsigned int sum;
mx = 0;
for(i = 0; i < length; i++)
{
mx = MAX(mx, towers[i]);
maxt[i] = mx;
}
mx = 0;
sum = 0;
for(i = length-1; i>=0; i--)
{
mx = MAX(mx, towers[i]);
sum += MIN(maxt[i], mx) - towers[i];
}
free(maxt);
return sum;
}
unsigned int tower2(unsigned int length, unsigned int *towers)
{
unsigned char *maxt, *towers8;
int i, length8;
__m128i vec1, vec2, vec3, sum;
length8 = ((length+15)/16);
towers8 = memalign(sizeof(__m128i), length8*16);
maxt = memalign(sizeof(__m128i), length8*16);
if(maxt == NULL || towers8 == NULL)
return 0;
/* copy from int32[] to uint8[] */
for(i = 0; i < length8-2; i++) {
vec1 = ((__m128i*)towers)[i*4];
vec2 = ((__m128i*)towers)[i*4+1];
vec1 = _mm_packus_epi32(vec1, vec2);
vec2 = ((__m128i*)towers)[i*4+2];
vec3 = ((__m128i*)towers)[i*4+3];
vec2 = _mm_packus_epi32(vec2, vec3);
vec1 = _mm_packus_epi16(vec1, vec2);
((__m128i*)towers8)[i] = vec1;
}
/* copy from int32[] to uint8[] */
for(i = i*16; i < length; i++) {
towers8[i] = (unsigned char)towers[i];
}
/* pad with zeros */
for(; i < length8*16; i++)
towers8[i] = 0;
vec2 = _mm_setzero_si128();
for(i = 0; i < length8; i++) {
vec1 = _mm_load_si128(((__m128i*)towers8) + i);
vec1 = _mm_max_epu8(vec1, vec2);
vec2 = _mm_shuffle_epi8(vec1, _mm_setr_epi8(0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14));
vec1 = _mm_max_epu8(vec1, vec2);
vec2 = _mm_shuffle_epi8(vec1 ,_mm_setr_epi8(0,1,1,1,4,5,5,5,8,9,9,9,12,13,13,13));
vec1 = _mm_max_epu8(vec1, vec2);
vec2 = _mm_shuffle_epi8(vec1 ,_mm_setr_epi8(0,1,2,3,3,3,3,3,8,9,10,11,11,11,11,11));
vec1 = _mm_max_epu8(vec1, vec2);
vec2 = _mm_shuffle_epi8(vec1 ,_mm_setr_epi8(0,1,2,3,4,5,6,7,7,7,7,7,7,7,7,7));
vec1 = _mm_max_epu8(vec1, vec2);
((__m128i*)maxt)[i] = vec1;
vec2 = _mm_shuffle_epi8(vec1, _mm_setr_epi8(15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15));
}
vec2 = _mm_setzero_si128();
sum = _mm_setzero_si128();
for(i = length8-1; i >= 0; i--) {
vec3 = _mm_load_si128(((__m128i*)towers8) + i);
vec1 = _mm_max_epu8(vec3, vec2);
vec2 = _mm_shuffle_epi8(vec1 ,_mm_setr_epi8(1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15));
vec1 = _mm_max_epu8(vec1, vec2);
vec2 = _mm_shuffle_epi8(vec1 ,_mm_setr_epi8(2,2,2,3,6,6,6,7,10,10,10,11,14,14,14,15));
vec1 = _mm_max_epu8(vec1, vec2);
vec2 = _mm_shuffle_epi8(vec1 ,_mm_setr_epi8(4,4,4,4,4,5,6,7,12,12,12,12,12,13,14,15));
vec1 = _mm_max_epu8(vec1, vec2);
vec2 = _mm_shuffle_epi8(vec1 ,_mm_setr_epi8(8,8,8,8,8,8,8,8,8,9,10,11,12,13,14,15));
vec1 = _mm_max_epu8(vec1, vec2);
vec2 = _mm_subs_epu8(_mm_min_epu8(((__m128i*)maxt)[i], vec1), vec3);
vec3 = _mm_cvtepu8_epi32(vec2);
sum = _mm_add_epi32(sum, vec3);
vec2 = _mm_shuffle_epi32(vec2, _MM_SHUFFLE(0, 3, 2, 1));
vec3 = _mm_cvtepu8_epi32(vec2);
sum = _mm_add_epi32(sum, vec3);
vec2 = _mm_shuffle_epi32(vec2, _MM_SHUFFLE(0, 3, 2, 1));
vec3 = _mm_cvtepu8_epi32(vec2);
sum = _mm_add_epi32(sum, vec3);
vec2 = _mm_shuffle_epi32(vec2, _MM_SHUFFLE(0, 3, 2, 1));
vec3 = _mm_cvtepu8_epi32(vec2);
sum = _mm_add_epi32(sum, vec3);
vec2 = _mm_shuffle_epi8(vec1, _mm_setzero_si128());
}
vec2 = _mm_shuffle_epi32(sum, _MM_SHUFFLE(1, 0, 3, 2));
sum = _mm_add_epi32(vec2, sum);
vec2 = _mm_shuffle_epi32(sum, _MM_SHUFFLE(2, 3, 0, 1));
sum = _mm_add_epi32(vec2, sum);
free(towers8);
free(maxt);
return _mm_cvtsi128_si32(sum);
}
import Criterion.Main
import System.Random
import Data.List
import qualified Data.Vector.Unboxed as U
import qualified Data.Vector.Storable as S
import Foreign
import Foreign.C
import Foreign.C.Types
import System.IO.Unsafe
foreign import ccall "tower" tower :: CUInt -> Ptr CUInt -> IO CUInt
foreign import ccall "tower2" tower2 :: CUInt -> Ptr CUInt -> IO CUInt
rainfallC1 :: S.Vector CUInt -> CUInt
rainfallC1 l =
let (fptr, len) = S.unsafeToForeignPtr0 l
in unsafePerformIO $
withForeignPtr fptr $
tower (fromIntegral len)
rainfallC2 :: S.Vector CUInt -> CUInt
rainfallC2 l =
let (fptr, len) = S.unsafeToForeignPtr0 l
in unsafePerformIO $
withForeignPtr fptr $
tower2 (fromIntegral len)
rainfall :: U.Vector Int -> Int
rainfall xs = U.sum (U.zipWith (-) mins xs)
where
mins = U.zipWith min maxl maxr
maxl = U.scanl1' max xs
maxr = U.scanr1' max xs
rainfallL :: [Int] -> Int
rainfallL xs = sum (zipWith (-) mins xs)
where
mins = zipWith min maxl maxr
maxl = scanl1 max xs
maxr = scanr1 max xs
-- Our benchmark harness.
main = do
g <- getStdGen
let hs = take 10000000 (randomRs (0, 200) g :: [Int])
hv = U.fromList hs
cv = S.fromList $ map fromIntegral hs
defaultMain [
bgroup "rainfall" [
bench "list" $ whnf rainfallL hs,
bench "unboxed vector" $ whnf rainfall hv,
bench "C" $ whnf rainfallC1 cv,
bench "C sse" $ whnf rainfallC2 cv
]
]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment