Skip to content

Instantly share code, notes, and snippets.

@cwfitzgerald
Created October 10, 2016 18:47
Show Gist options
  • Save cwfitzgerald/a161182c46104db3b8caf45c6ae15726 to your computer and use it in GitHub Desktop.
Save cwfitzgerald/a161182c46104db3b8caf45c6ae15726 to your computer and use it in GitHub Desktop.
#include <tmmintrin.h>
#include <immintrin.h>
#include <smmintrin.h>
void Fpt3::FastNormalize() {
__m128 inputs = _mm_setr_ps(x, y, z, 0);
__m128 squared = _mm_mul_ps(inputs, inputs);
__m128 sum;
#ifdef __SSE3__
sum = _mm_hadd_ps(squared, squared);
sum = _mm_hadd_ps(sum, sum);
#else
{
__m128 shuf = _mm_shuffle_ps(squared, squared, _MM_SHUFFLE(2, 3, 0, 1));
__m128 sums = _mm_add_ps(squared, shuf);
shuf = _mm_movehl_ps(shuf, sums);
sums = _mm_add_ss(sums, shuf);
sum = _mm_shuffle_ps(sums, sums, _MM_SHUFFLE(0, 0, 0, 0));
}
#endif
__m128 recip;
#ifdef FPT3_FAST
recip = _mm_rsqrt_ps(sum);
#else
{
__m128 sqrt = _mm_sqrt_ps(sum);
__m128 one = _mm_set1_ps(1.0f);
recip = _mm_div_ps(one, sqrt);
}
#endif
__m128 min = _mm_set1_ps(0.00000001f);
#ifdef __SSE4_1__
recip = _mm_blendv_ps(recip, min, _mm_cmpgt_ps(min, recip));
#else
recip = _mm_add_ps(recip,_mm_mul_ps(min, _mm_and_ps(_mm_set1_ps(1),_mm_cmplt_ps(min, recip))));
#endif
__m128 normalized = _mm_mul_ps(inputs, recip);
float* normalized_ptr = reinterpret_cast<float*>(&normalized);
x = normalized_ptr[0];
y = normalized_ptr[1];
z = normalized_ptr[2];
}
#include <vector>
#include <tmmintrin.h>
#include <immintrin.h>
#include <smmintrin.h>
void normalize(std::vector<Fpt3>& vec) {
static_assert(sizeof(Fpt3) == 12, "");
size_t vec_size = vec.size();
size_t i = 0;
for (; i < vec_size; i += 4) {
auto&& e1 = vec[i + 0];
auto&& e2 = vec[i + 1];
auto&& e3 = vec[i + 2];
auto&& e4 = vec[i + 3];
__m128 xinputs = _mm_setr_ps(e1.x, e2.x, e3.x, e4.x);
__m128 yinputs = _mm_setr_ps(e1.y, e2.y, e3.y, e4.y);
__m128 zinputs = _mm_setr_ps(e1.z, e2.z, e3.z, e4.z);
__m128 xsquared = _mm_mul_ps(xinputs, xinputs);
__m128 ysquared = _mm_mul_ps(yinputs, yinputs);
__m128 zsquared = _mm_mul_ps(zinputs, zinputs);
__m128 sum = _mm_add_ps(xsquared, _mm_add_ps(ysquared, zsquared));
__m128 recip;
#ifdef FPT3_FAST
recip = _mm_rsqrt_ps(sum);
#else
{
__m128 sqrt = _mm_sqrt_ps(sum);
__m128 one = _mm_set1_ps(1.0f);
recip = _mm_div_ps(one, sqrt);
}
#endif
__m128 min = _mm_set1_ps(0.00000001f);
#ifdef __SSE4_1__
recip = _mm_blendv_ps(recip, min, _mm_cmpgt_ps(min, recip));
#else
recip = _mm_add_ps(recip,_mm_mul_ps(min, _mm_and_ps(_mm_set1_ps(1),_mm_cmplt_ps(min, recip))));
#endif
__m128 xnormalized = _mm_mul_ps(recip, xinputs);
__m128 ynormalized = _mm_mul_ps(recip, yinputs);
__m128 znormalized = _mm_mul_ps(recip, zinputs);
float* xarray = reinterpret_cast<float*>(&xnormalized);
float* yarray = reinterpret_cast<float*>(&ynormalized);
float* zarray = reinterpret_cast<float*>(&znormalized);
e1.x = xarray[0]; e2.x = xarray[1]; e3.x = xarray[2]; e4.x = xarray[3];
e1.y = yarray[0]; e2.y = yarray[1]; e3.y = yarray[2]; e4.y = yarray[3];
e1.z = zarray[0]; e2.z = zarray[1]; e3.z = zarray[2]; e4.z = zarray[3];
}
for (; i < vec_size; ++i) {
vec[i].FastNormalize();
}
}
#include <cmath>
#include <immintrin.h>
#include <tmmintrin.h>
#include <vector>
class Fpt3 {
public:
volatile float x;
volatile float y;
volatile float z;
void Normalize();
void FastNormalize();
};
void Fpt3::Normalize() {
float t = sqrt(x * x + y * y + z * z);
if (t < 0.00000001)
t = 0.00000001f; // yea not gonna normalize correctly but at least it won't crash
t = 1 / t;
x *= t;
y *= t;
z *= t;
}
void Fpt3::FastNormalize() {
__m128 inputs = _mm_setr_ps(x, y, z, 0);
__m128 squared = _mm_mul_ps(inputs, inputs);
__m128 sum;
#ifdef __SSE3__
sum = _mm_hadd_ps(squared, squared);
sum = _mm_hadd_ps(sum, sum);
#else
{
__m128 shuf = _mm_shuffle_ps(squared, squared, _MM_SHUFFLE(2, 3, 0, 1));
__m128 sums = _mm_add_ps(squared, shuf);
shuf = _mm_movehl_ps(shuf, sums);
sums = _mm_add_ss(sums, shuf);
sum = _mm_shuffle_ps(sums, sums, _MM_SHUFFLE(0, 0, 0, 0));
}
#endif
__m128 recip;
#ifdef FPT3_FAST
recip = _mm_rsqrt_ps(sum);
#else
{
__m128 sqrt = _mm_sqrt_ps(sum);
__m128 one = _mm_set1_ps(1.0f);
recip = _mm_div_ps(one, sqrt);
}
#endif
__m128 min = _mm_set1_ps(0.00000001f);
#ifdef __SSE4_1__
recip = _mm_blendv_ps(recip, min, _mm_cmpgt_ps(min, recip));
#else
recip = _mm_add_ps(recip,_mm_mul_ps(min, _mm_and_ps(_mm_set1_ps(1),_mm_cmplt_ps(min, recip))));
#endif
__m128 normalized = _mm_mul_ps(inputs, recip);
float* normalized_ptr = reinterpret_cast<float*>(&normalized);
x = normalized_ptr[0];
y = normalized_ptr[1];
z = normalized_ptr[2];
}
void slow_normalize(std::vector<Fpt3>& vec) {
for (size_t i = 0; i < vec.size(); ++i) {
vec[i].FastNormalize();
}
}
void normalize(std::vector<Fpt3>& vec) {
static_assert(sizeof(Fpt3) == 12, "");
size_t vec_size = vec.size();
size_t i = 0;
for (; i < vec_size; i += 4) {
auto&& e1 = vec[i + 0];
auto&& e2 = vec[i + 1];
auto&& e3 = vec[i + 2];
auto&& e4 = vec[i + 3];
__m128 xinputs = _mm_setr_ps(e1.x, e2.x, e3.x, e4.x);
__m128 yinputs = _mm_setr_ps(e1.y, e2.y, e3.y, e4.y);
__m128 zinputs = _mm_setr_ps(e1.z, e2.z, e3.z, e4.z);
__m128 xsquared = _mm_mul_ps(xinputs, xinputs);
__m128 ysquared = _mm_mul_ps(yinputs, yinputs);
__m128 zsquared = _mm_mul_ps(zinputs, zinputs);
__m128 sum = _mm_add_ps(xsquared, _mm_add_ps(ysquared, zsquared));
__m128 recip;
#ifdef FPT3_FAST
recip = _mm_rsqrt_ps(sum);
#else
{
__m128 sqrt = _mm_sqrt_ps(sum);
__m128 one = _mm_set1_ps(1.0f);
recip = _mm_div_ps(one, sqrt);
}
#endif
__m128 min = _mm_set1_ps(0.00000001f);
#ifdef __SSE4_1__
recip = _mm_blendv_ps(recip, min, _mm_cmpgt_ps(min, recip));
#else
recip = _mm_add_ps(recip,_mm_mul_ps(min, _mm_and_ps(_mm_set1_ps(1),_mm_cmplt_ps(min, recip))));
#endif
__m128 xnormalized = _mm_mul_ps(recip, xinputs);
__m128 ynormalized = _mm_mul_ps(recip, yinputs);
__m128 znormalized = _mm_mul_ps(recip, zinputs);
float* xarray = reinterpret_cast<float*>(&xnormalized);
float* yarray = reinterpret_cast<float*>(&ynormalized);
float* zarray = reinterpret_cast<float*>(&znormalized);
e1.x = xarray[0]; e2.x = xarray[1]; e3.x = xarray[2]; e4.x = xarray[3];
e1.y = yarray[0]; e2.y = yarray[1]; e3.y = yarray[2]; e4.y = yarray[3];
e1.z = zarray[0]; e2.z = zarray[1]; e3.z = zarray[2]; e4.z = zarray[3];
}
for (; i < vec_size; ++i) {
vec[i].FastNormalize();
}
}
#include <chrono>
#include <iostream>
template <class Func, class... Args,
typename std::enable_if_t<std::is_same<decltype(std::declval<Func>()(std::declval<Args>()...)), void>::value,
int> = 0>
auto time_function(size_t loops, Func&& f, Args&&... a) -> double {
auto start = std::chrono::high_resolution_clock::now();
for (size_t i = 0; i < loops; ++i) {
f(std::forward<Args>(a)...);
}
auto end = std::chrono::high_resolution_clock::now();
return (end - start).count() / 1'000'000.0;
}
template <class Func, class... Args,
typename std::enable_if_t<!std::is_same<decltype(std::declval<Func>()(std::declval<Args>()...)), void>::value,
int> = 0>
auto time_function(size_t loops, Func&& f, Args&&... a) -> std::pair<double, decltype(f(std::forward<Args>(a)...))> {
auto start = std::chrono::high_resolution_clock::now();
for (size_t i = 0; i < loops - 1; ++i) {
f(std::forward<Args>(a)...);
}
auto ans = f(std::forward<Args>(a)...);
auto end = std::chrono::high_resolution_clock::now();
return std::make_pair((end - start).count() / 1'000'000.0, ans);
}
void time_slow(size_t loops) {
Fpt3 f;
for (size_t i = 0; i < loops; ++i) {
f = Fpt3{1.0, 2.0, 3.0};
f.Normalize();
}
}
void time_fast(size_t loops) {
Fpt3 f;
for (size_t i = 0; i < loops; ++i) {
f = Fpt3{1.0, 2.0, 3.0};
f.FastNormalize();
}
}
#include <algorithm>
#include <cassert>
int main() {
Fpt3 f;
f = Fpt3{1.0, 2.0, 3.0};
f.Normalize();
std::cerr << f.x << ", " << f.y << ", " << f.z << '\n';
f = Fpt3{1.0, 2.0, 3.0};
f.FastNormalize();
std::cerr << f.x << ", " << f.y << ", " << f.z << '\n';
auto slow_time = time_function(1, time_slow, 100'000'000);
std::cerr << slow_time << "ms\n";
auto fast_time = time_function(1, time_fast, 100'000'000);
std::cerr << fast_time << "ms " << slow_time/fast_time << "x faster per element\n";
std::vector<Fpt3> vec_test(4, Fpt3{1.0, 2.0, 3.0});
normalize(vec_test);
assert(vec_test[0].x == f.x && vec_test[0].y == f.y && vec_test[0].z == f.z);
assert(vec_test[1].x == f.x && vec_test[1].y == f.y && vec_test[1].z == f.z);
assert(vec_test[2].x == f.x && vec_test[2].y == f.y && vec_test[2].z == f.z);
assert(vec_test[3].x == f.x && vec_test[3].y == f.y && vec_test[3].z == f.z);
std::vector<Fpt3> array_test(100'000'000, Fpt3{1.0, 2.0, 3.0});
auto slow_array = time_function(1, slow_normalize, array_test);
std::cerr << slow_array << "ms\n";
std::fill(array_test.begin(), array_test.end(), Fpt3{1.0, 2.0, 3.0});
auto fast_array = time_function(1, normalize, array_test);
std::cerr << fast_array << "ms " << slow_time/fast_array << "x faster per element\n";
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment