Skip to content

Instantly share code, notes, and snippets.

@kbenzie
Last active September 3, 2021 12:08
Show Gist options
  • Save kbenzie/5323248 to your computer and use it in GitHub Desktop.
Save kbenzie/5323248 to your computer and use it in GitHub Desktop.
Example of using SSE instructions showing the increase in performance.
#include <omp.h>
#include <xmmintrin.h>

#include <cmath>
#include <cstdlib>
#include <iostream>
/*
* void example()
* {
* // Size of float array
* const int size = 4;
*
* #ifndef DYNAMIC
* // Each float array processed by SSE instruction should have
* // 16-byte alignment.
* __declspec(align(16)) float m_floats[size];
* #else
* // Use _aligned_malloc to dynamically allocate the array.
* auto m_floats = (float*)_aligned_malloc(size * sizeof(float), 16);
* #endif
*
* // Note: __m128 types should not be accessed directly; they are
* // aligned on 16-byte boundaries.
*
* #ifdef DYNAMIC
* _aligned_free(m_floats);
* #endif
* }
*/
/*
* Scalar reference implementation.
*
* For every index i in [0, size) stores
*     result[i] = sqrt(array1[i] * array2[i] + array2[i] * array2[i]) + 0.5f
* Nothing is written when size <= 0. The arrays need no particular
* alignment. A negative value under the square root yields NaN.
*/
void compute_array_cpp(float *array1, // [in] first source array
float *array2, // [in] second source array
float *result, // [out] result array
int size) // [in] size of arrays
{
    for (int i = 0; i < size; ++i) {
        const float a = array1[i];
        const float b = array2[i];
        // std::sqrt selects the float overload. An unqualified sqrt may
        // resolve to the C ::sqrt(double), which silently promotes the
        // computation to double precision.
        result[i] = std::sqrt(a * b + b * b) + 0.5f;
    }
}
/*
* SSE implementation.
*
* For every index i in [0, size) stores
*     result[i] = sqrt(array1[i] * array2[i] + array2[i] * array2[i]) + 0.5f
* processing four floats per iteration with packed SSE instructions.
* When size is not a multiple of four, the remaining 1-3 elements are
* handled with the scalar (_ss) forms of the same instructions, so no
* element of result is left unwritten.
*
* Precondition: array1, array2 and result must be 16-byte aligned, because
* the packed loop uses aligned loads and stores. Nothing is written when
* size <= 0.
*/
void compute_array_sse(float *array1, // [in] first source array
float *array2, // [in] second source array
float *result, // [out] result array
int size) // [in] size of arrays
{
    const __m128 half = _mm_set_ps1(0.5f); // half[0,1,2,3] = 0.5

    // Largest multiple of four that does not exceed size (0 for size <= 0).
    const int vector_end = size > 0 ? size - (size % 4) : 0;

    // Packed part: four elements per iteration.
    for (int i = 0; i < vector_end; i += 4) {
        const __m128 a = _mm_load_ps(array1 + i);
        const __m128 b = _mm_load_ps(array2 + i);
        const __m128 sum = _mm_add_ps(_mm_mul_ps(a, b), _mm_mul_ps(b, b));
        _mm_store_ps(result + i, _mm_add_ps(_mm_sqrt_ps(sum), half));
    }

    // Tail: the last size % 4 elements, one at a time. Only the lowest
    // lane is loaded, computed and stored.
    for (int i = vector_end; i < size; ++i) {
        const __m128 a = _mm_load_ss(array1 + i);
        const __m128 b = _mm_load_ss(array2 + i);
        const __m128 sum = _mm_add_ss(_mm_mul_ss(a, b), _mm_mul_ss(b, b));
        _mm_store_ss(result + i, _mm_add_ss(_mm_sqrt_ss(sum), half));
    }
}
/*
* Benchmark driver.
*
* Allocates four 16-byte aligned float arrays, fills the two source arrays
* with pseudo-random integers in [-500, 499], times compute_array_cpp and
* compute_array_sse over the same input with omp_get_wtime, prints both
* timings and their ratio, then releases the arrays.
*
* Returns 0 on success, 1 if any of the aligned allocations fails.
*/
int main()
{
    const int size = 100000;
    const auto bytes = size * sizeof(float);

    std::cout << "Allocating aligned array blocks\n";
    auto source1 = static_cast<float*>(_aligned_malloc(bytes, 16));
    auto source2 = static_cast<float*>(_aligned_malloc(bytes, 16));
    auto result_cpp = static_cast<float*>(_aligned_malloc(bytes, 16));
    auto result_sse = static_cast<float*>(_aligned_malloc(bytes, 16));

    // _aligned_malloc returns a null pointer on failure; bail out rather
    // than dereferencing it. _aligned_free ignores null pointers, so the
    // blocks that did get allocated are still released.
    if (!source1 || !source2 || !result_cpp || !result_sse) {
        std::cerr << "Failed to allocate aligned array blocks\n";
        _aligned_free(source1);
        _aligned_free(source2);
        _aligned_free(result_cpp);
        _aligned_free(result_sse);
        return 1;
    }

    std::cout << "Assigning random numbers to source arrays\n";
    // Seed from the wall clock so each run uses different data.
    std::srand(static_cast<unsigned int>(omp_get_wtime()));
    for (int i = 0; i < size; ++i) {
        source1[i] = float(std::rand() % 1000 - 500);
        source2[i] = float(std::rand() % 1000 - 500);
    }

    std::cout << "Benchmarking standard cpp implementation: ";
    double start = omp_get_wtime();
    compute_array_cpp(source1, source2, result_cpp, size);
    double end = omp_get_wtime();
    const double time_cpp = end - start;
    std::cout << time_cpp << " seconds\n";

    std::cout << "Benchmarking sse implementation: ";
    start = omp_get_wtime();
    compute_array_sse(source1, source2, result_sse, size);
    end = omp_get_wtime();
    const double time_sse = end - start;
    std::cout << time_sse << " seconds\n";

    // Avoid printing inf/nan when the timer reports a zero-length SSE run.
    if (time_sse > 0.0) {
        std::cout << time_cpp / time_sse << "x speedup\n";
    } else {
        std::cout << "sse run was too short to measure a speedup\n";
    }

    std::cout << "Freeing aligned array blocks\n";
    _aligned_free(source1);
    _aligned_free(source2);
    _aligned_free(result_cpp);
    _aligned_free(result_sse);
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment