Last active
November 18, 2022 11:23
-
-
Save Mark1626/f4b7d4bb8cb37035a35b54eb82a56925 to your computer and use it in GitHub Desktop.
Min Max with x86 SIMD Intrinsics
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash | |
set -e | |
set -x | |
declare -a SIZE=(1024 2048 4096 8192 16384 32768) | |
# declare -a SIZE=(1024 2048) | |
rm -f runs.txt | |
for sz in "${SIZE[@]}" | |
do | |
echo "min-max $sz" | |
./min-max $sz >> runs.txt | |
done | |
cat runs.txt | awk ' \ | |
/Matrix dim/ { \ | |
size = $NF; \ | |
} \ | |
/Elapsed time golden/ { \ | |
golden = $(NF-1); \ | |
} \ | |
/Elapsed time SIMD SSE/ { \ | |
sse = $(NF-1); \ | |
} \ | |
/Elapsed time SIMD AVX/ { \ | |
avx = $(NF-1); \ | |
printf("%s, %s, %s, %s\n", size, golden, sse, avx); \ | |
} \ | |
' > stats.csv | |
echo " \ | |
reset; \ | |
set terminal png enhanced large font \"Helvetica,10\"; \ | |
\ | |
set title \"minMax Benchmark\"; \ | |
set xlabel \"Matrix Dim\"; \ | |
set ylabel \"Execution time(us)\"; \ | |
set key left top; \ | |
set logscale x; \ | |
set logscale y; \ | |
\ | |
plot \"stats.csv\" using 1:2 with linespoint title \"Golden\", \ | |
\"stats.csv\" using 1:3 with linespoint title \"SSE\", \ | |
\"stats.csv\" using 1:4 with linespoint title \"AVX\"; \ | |
" | gnuplot > performance.png |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
g++ -o min-max min-max.cc -Wall -Wextra -O3 -march=native | |
./min-max <N> | |
*/ | |
#include <algorithm> | |
#include <assert.h> | |
#include <climits> | |
#include <cstddef> | |
#include <ctime> | |
#include <emmintrin.h> | |
#include <float.h> | |
#include <immintrin.h> | |
#include <iostream> | |
#include <math.h> | |
#include <smmintrin.h> | |
#include <xmmintrin.h> | |
#include <limits.h> | |
#include <chrono> | |
class WallClock { | |
std::chrono::high_resolution_clock::time_point begin; | |
public: | |
WallClock() { begin = std::chrono::high_resolution_clock::now(); } | |
void tick() { begin = std::chrono::high_resolution_clock::now(); } | |
void reset() { begin = std::chrono::high_resolution_clock::now(); } | |
double elapsedTime() { | |
std::chrono::high_resolution_clock::time_point end = | |
std::chrono::high_resolution_clock::now(); | |
return std::chrono::duration_cast<std::chrono::microseconds>(end - begin) | |
.count(); | |
} | |
}; | |
void assert_int(size_t expected, size_t actual, std::string str) { | |
if (expected != actual) { | |
std::cerr << str << " expected: " << expected << " actual: " << actual | |
<< std::endl; | |
assert(expected == actual); | |
} | |
} | |
void assert_float(float expected, float actual, std::string str) { | |
float diff = fabs(expected - actual); | |
float loss = (diff / expected) * 100; | |
if (loss > 0.1) { | |
std::cerr << str << " expected: " << expected << " actual: " << actual | |
<< " loss: " << loss << std::endl; | |
assert(loss < 0.1); | |
} | |
} | |
void golden(float *arr, size_t N, float &min, size_t &minPos, float &max, | |
size_t &maxPos) { | |
min = FLT_MAX; | |
max = FLT_MIN; | |
#pragma omp parallel for | |
for (size_t i = 0; i < N; i++) { | |
if (min > arr[i]) { | |
#pragma omp critical | |
{ | |
min = arr[i]; | |
minPos = i; | |
} | |
} | |
if (max < arr[i]) { | |
#pragma omp critical | |
{ | |
max = arr[i]; | |
maxPos = i; | |
} | |
} | |
} | |
} | |
void print_vector(const __m128i v) { | |
uint32_t temp[4]; | |
_mm_storeu_si128((__m128i_u *)temp, v); | |
for (int i = 0; i < 4; i++) { | |
std::cout << temp[i] << " "; | |
} | |
std::cout << std::endl; | |
} | |
void print_vector(const __m128 v) { | |
float temp[4]; | |
_mm_storeu_ps(temp, v); | |
for (int i = 0; i < 4; i++) { | |
std::cout << temp[i] << " "; | |
} | |
std::cout << std::endl; | |
} | |
void simd_sse(float *arr, size_t N, float &min, size_t &minPos, float &max, | |
size_t &maxPos) { | |
assert(N < (size_t)INT_MAX); | |
const int simd_width = 4; | |
__m128 arr_r = _mm_loadu_ps(arr); | |
__m128 max_r = arr_r; | |
__m128 min_r = arr_r; | |
__m128i idx_r = _mm_setr_epi32(0, 1, 2, 3); | |
__m128i min_pos_r = idx_r; | |
__m128i max_pos_r = idx_r; | |
__m128i inc = _mm_set1_epi32(simd_width); | |
for (size_t i = simd_width; i < N; i += simd_width) { | |
idx_r = _mm_add_epi32(idx_r, inc); | |
arr_r = _mm_loadu_ps(arr + i); | |
__m128i min_mask = _mm_castps_si128(_mm_cmplt_ps(arr_r, min_r)); | |
__m128i max_mask = _mm_castps_si128(_mm_cmpgt_ps(arr_r, max_r)); | |
min_r = _mm_min_ps(min_r, arr_r); | |
min_pos_r = _mm_blendv_epi8(min_pos_r, idx_r, min_mask); | |
max_r = _mm_max_ps(max_r, arr_r); | |
max_pos_r = _mm_blendv_epi8(max_pos_r, idx_r, max_mask); | |
} | |
// Unload from vector register and reduce | |
float max_tmp[simd_width]; | |
float min_tmp[simd_width]; | |
uint32_t max_pos_temp[simd_width]; | |
uint32_t min_pos_temp[simd_width]; | |
_mm_storeu_ps(min_tmp, min_r); | |
_mm_storeu_si128((__m128i_u *)min_pos_temp, min_pos_r); | |
_mm_storeu_ps(max_tmp, max_r); | |
_mm_storeu_si128((__m128i_u *)max_pos_temp, max_pos_r); | |
max = max_tmp[0]; | |
maxPos = max_pos_temp[0]; | |
min = min_tmp[0]; | |
minPos = min_pos_temp[0]; | |
for (int i = 1; i < simd_width; i++) { | |
if (max_tmp[i] > max) { | |
max = max_tmp[i]; | |
maxPos = max_pos_temp[i]; | |
} else if (max_tmp[i] == max) { | |
maxPos = std::min(maxPos, size_t(max_pos_temp[i])); | |
} | |
if (min_tmp[i] < min) { | |
min = min_tmp[i]; | |
minPos = min_pos_temp[i]; | |
} else if (min_tmp[i] == min) { | |
minPos = std::min(minPos, size_t(min_pos_temp[i])); | |
} | |
} | |
} | |
void simd_avx(float *arr, size_t N, float &min, size_t &minPos, float &max, | |
size_t &maxPos) { | |
assert(N < (size_t)INT_MAX); | |
const int simd_width = 8; | |
__m256 arr_r = _mm256_loadu_ps(arr); | |
__m256 max_r = arr_r; | |
__m256 min_r = arr_r; | |
__m256i idx_r = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); | |
__m256i min_pos_r = idx_r; | |
__m256i max_pos_r = idx_r; | |
__m256i inc = _mm256_set1_epi32(simd_width); | |
for (size_t i = simd_width; i < N; i += simd_width) { | |
idx_r = _mm256_add_epi32(idx_r, inc); | |
arr_r = _mm256_loadu_ps(arr + i); | |
// _CMP_LT_OS | |
__m256i min_mask = _mm256_castps_si256(_mm256_cmp_ps(arr_r, min_r, _CMP_LT_OQ)); | |
__m256i max_mask = _mm256_castps_si256(_mm256_cmp_ps(arr_r, max_r, _CMP_GT_OQ)); | |
min_r = _mm256_min_ps(min_r, arr_r); | |
min_pos_r = _mm256_blendv_epi8(min_pos_r, idx_r, min_mask); | |
max_r = _mm256_max_ps(max_r, arr_r); | |
max_pos_r = _mm256_blendv_epi8(max_pos_r, idx_r, max_mask); | |
} | |
// print_vector(max_r); | |
// print_vector(max_pos_r); | |
// Unload from vector register and reduce | |
float max_tmp[simd_width]; | |
float min_tmp[simd_width]; | |
uint32_t max_pos_temp[simd_width]; | |
uint32_t min_pos_temp[simd_width]; | |
_mm256_storeu_ps(min_tmp, min_r); | |
_mm256_storeu_si256((__m256i_u *)min_pos_temp, min_pos_r); | |
_mm256_storeu_ps(max_tmp, max_r); | |
_mm256_storeu_si256((__m256i_u *)max_pos_temp, max_pos_r); | |
max = max_tmp[0]; | |
maxPos = max_pos_temp[0]; | |
min = min_tmp[0]; | |
minPos = min_pos_temp[0]; | |
for (int i = 1; i < simd_width; i++) { | |
if (max_tmp[i] > max) { | |
max = max_tmp[i]; | |
maxPos = max_pos_temp[i]; | |
} else if (max_tmp[i] == max) { | |
maxPos = std::min(maxPos, size_t(max_pos_temp[i])); | |
} | |
if (min_tmp[i] < min) { | |
min = min_tmp[i]; | |
minPos = min_pos_temp[i]; | |
} else if (min_tmp[i] == min) { | |
minPos = std::min(minPos, size_t(min_pos_temp[i])); | |
} | |
} | |
} | |
int main(int argc, char **argv) { | |
if (argc < 2) { | |
std::cerr << " usage: ./min-max <N>" << std::endl; | |
return 1; | |
} | |
size_t x = atoi(argv[1]); | |
size_t N = x * x; | |
float *arr = new float[N]; | |
time_t seed = time(0); | |
// time_t seed = 1668765940; | |
std::cout << "Seed: " << seed << std::endl; | |
srand(seed); | |
float offset = 5.0f; | |
float range = 1000.0f; | |
for (size_t i = 0; i < N; i++) { | |
arr[i] = offset + range * (rand() / (float)RAND_MAX); | |
} | |
std::cout << "Matrix dim: " << x << std::endl; | |
float minExpected = 0.0f, maxExpected = 0.0f; | |
size_t minPosExpected = 0, maxPosExpected = 0; | |
{ | |
WallClock t; | |
golden(arr, N, minExpected, minPosExpected, maxExpected, maxPosExpected); | |
std::cout << "Elapsed time golden: " << t.elapsedTime() << " us" | |
<< std::endl; | |
} | |
std::cout << "min: " << minExpected << " max " << maxExpected << " minPos " | |
<< minPosExpected << " maxPos: " << maxPosExpected << std::endl; | |
float minActual = 0.0f, maxActual = 0.0f; | |
size_t minPosActual = 0, maxPosActual = 0; | |
{ | |
WallClock t; | |
simd_sse(arr, N, minActual, minPosActual, maxActual, maxPosActual); | |
std::cout << "Elapsed time SIMD SSE: " << t.elapsedTime() << " us" | |
<< std::endl; | |
} | |
assert_float(maxExpected, maxActual, "max"); | |
assert_float(minExpected, minActual, "min"); | |
assert_int(minPosExpected, minPosActual, "min_pos"); | |
assert_int(maxPosExpected, maxPosActual, "max_pos"); | |
minActual = 0.0f, maxActual = 0.0f; | |
minPosActual = 0, maxPosActual = 0; | |
{ | |
WallClock t; | |
simd_avx(arr, N, minActual, minPosActual, maxActual, maxPosActual); | |
std::cout << "Elapsed time SIMD AVX: " << t.elapsedTime() << " us" | |
<< std::endl; | |
} | |
assert_float(maxExpected, maxActual, "max"); | |
assert_float(minExpected, minActual, "min"); | |
assert_int(minPosExpected, minPosActual, "min_pos"); | |
assert_int(maxPosExpected, maxPosActual, "max_pos"); | |
delete[] arr; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment