// Gist by @cwfitzgerald, created October 11, 2016 19:55
#include <cmath>
#include <vector>
#include <immintrin.h> // umbrella x86 intrinsics header; provides the SSE, SSE3, SSE4.1 and AVX intrinsics used below
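// Build note (an assumption, not part of the original gist): with GCC or Clang,
// something like `g++ -O3 -std=c++14 -msse4.1 main.cpp` exercises the SSE paths,
// `-mavx` additionally enables the AVX normalize(Fpt3Container&) overload, and
// `-DFPT3_FAST` switches the exact sqrt+div to the approximate _mm_rsqrt_ps /
// _mm256_rsqrt_ps reciprocal square root.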
class Fpt3 {
public:
float x;
float y;
float z;
void Normalize();
void FastNormalize();
};
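// Normalize() is the scalar reference implementation; FastNormalize() is the
// single-point SSE version. The batch routines below fall back to FastNormalize()
// for the tail elements that do not fill a whole SIMD group.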
class Fpt3Container {
private:
std::vector<float> x;
std::vector<float> y;
std::vector<float> z;
public:
Fpt3Container(const std::vector<Fpt3>& val) {
x.reserve(val.size());
y.reserve(val.size());
z.reserve(val.size());
for (size_t i = 0; i < val.size(); ++i) {
x.push_back(val[i].x);
y.push_back(val[i].y);
z.push_back(val[i].z);
}
}
explicit operator std::vector<Fpt3>() {
std::vector<Fpt3> ret;
ret.reserve(x.size());
for (size_t i = 0; i < x.size(); ++i) {
ret.push_back(Fpt3{x[i], y[i], z[i]});
}
return ret;
};
void push_back(const Fpt3& val) {
x.push_back(val.x);
y.push_back(val.y);
z.push_back(val.z);
}
void clear() {
x.clear();
y.clear();
z.clear();
}
void pop_back() {
x.pop_back();
y.pop_back();
z.pop_back();
}
size_t size() {
return x.size();
}
float* getXptr() {
return x.data();
}
float* getYptr() {
return y.data();
}
float* getZptr() {
return z.data();
}
};
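// Fpt3Container stores the points in structure-of-arrays form: three parallel
// float vectors instead of an array of {x, y, z} structs. getXptr()/getYptr()/
// getZptr() expose the contiguous component arrays so the SIMD loops can load
// four (SSE) or eight (AVX) components of the same kind with one vector load.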
void Fpt3::Normalize() {
float t = std::sqrt(x * x + y * y + z * z);
if (t < 0.00000001f)
t = 0.00000001f; // not a correct normalization, but avoids dividing by zero
t = 1.0f / t;
x *= t;
y *= t;
z *= t;
}
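// FastNormalize(): square the components, horizontally add them so every lane
// holds the squared length, take the (reciprocal) square root, clamp it away
// from zero, and scale the original inputs by the result.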
void Fpt3::FastNormalize() {
__m128 inputs = _mm_setr_ps(x, y, z, 0);
__m128 squared = _mm_mul_ps(inputs, inputs);
__m128 sum;
#ifdef __SSE3__
sum = _mm_hadd_ps(squared, squared);
sum = _mm_hadd_ps(sum, sum);
#else
{
__m128 shuf = _mm_shuffle_ps(squared, squared, _MM_SHUFFLE(2, 3, 0, 1));
__m128 sums = _mm_add_ps(squared, shuf);
shuf = _mm_movehl_ps(shuf, sums);
sums = _mm_add_ss(sums, shuf);
sum = _mm_shuffle_ps(sums, sums, _MM_SHUFFLE(0, 0, 0, 0));
}
#endif
__m128 recip;
#ifdef FPT3_FAST
recip = _mm_rsqrt_ps(sum);
#else
{
__m128 sqrt = _mm_sqrt_ps(sum);
__m128 one = _mm_set1_ps(1.0f);
recip = _mm_div_ps(one, sqrt);
}
#endif
__m128 min = _mm_set1_ps(0.00000001f);
#ifdef __SSE4_1__
// where recip dropped below min, substitute min
recip = _mm_blendv_ps(recip, min, _mm_cmpgt_ps(min, recip));
#else
{
// emulate blendv with a mask select: (mask & min) | (~mask & recip)
__m128 mask = _mm_cmpgt_ps(min, recip);
recip = _mm_or_ps(_mm_and_ps(mask, min), _mm_andnot_ps(mask, recip));
}
#endif
__m128 normalized = _mm_mul_ps(inputs, recip);
float* normalized_ptr = reinterpret_cast<float*>(&normalized);
x = normalized_ptr[0];
y = normalized_ptr[1];
z = normalized_ptr[2];
}
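// slow_normalize() is the per-element baseline for the batched versions below:
// it still uses the SSE FastNormalize(), just one point at a time.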
void slow_normalize(std::vector<Fpt3>& vec) {
for (size_t i = 0; i < vec.size(); ++i) {
vec[i].FastNormalize();
}
}
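// normalize(std::vector<Fpt3>&) works on four points per iteration: it gathers
// the x, y and z components of four consecutive points into three SSE registers,
// normalizes them together, and scatters the results back into the structs.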
void normalize(std::vector<Fpt3>& vec) {
static_assert(sizeof(Fpt3) == 12, "Fpt3 must be three tightly packed floats");
size_t vec_size = vec.size();
size_t parent_size = vec_size & -4ULL;
size_t i = 0;
for (; i < parent_size; i += 4) {
auto&& e1 = vec[i + 0];
auto&& e2 = vec[i + 1];
auto&& e3 = vec[i + 2];
auto&& e4 = vec[i + 3];
__m128 xinputs = _mm_setr_ps(e1.x, e2.x, e3.x, e4.x);
__m128 yinputs = _mm_setr_ps(e1.y, e2.y, e3.y, e4.y);
__m128 zinputs = _mm_setr_ps(e1.z, e2.z, e3.z, e4.z);
__m128 xsquared = _mm_mul_ps(xinputs, xinputs);
__m128 ysquared = _mm_mul_ps(yinputs, yinputs);
__m128 zsquared = _mm_mul_ps(zinputs, zinputs);
__m128 sum = _mm_add_ps(xsquared, _mm_add_ps(ysquared, zsquared));
__m128 recip;
#ifdef FPT3_FAST
recip = _mm_rsqrt_ps(sum);
#else
{
__m128 sqrt = _mm_sqrt_ps(sum);
__m128 one = _mm_set1_ps(1.0f);
recip = _mm_div_ps(one, sqrt);
}
#endif
__m128 min = _mm_set1_ps(0.00000001f);
#ifdef __SSE4_1__
// where recip dropped below min, substitute min
recip = _mm_blendv_ps(recip, min, _mm_cmpgt_ps(min, recip));
#else
{
// emulate blendv with a mask select: (mask & min) | (~mask & recip)
__m128 mask = _mm_cmpgt_ps(min, recip);
recip = _mm_or_ps(_mm_and_ps(mask, min), _mm_andnot_ps(mask, recip));
}
#endif
__m128 xnormalized = _mm_mul_ps(recip, xinputs);
__m128 ynormalized = _mm_mul_ps(recip, yinputs);
__m128 znormalized = _mm_mul_ps(recip, zinputs);
float* xarray = reinterpret_cast<float*>(&xnormalized);
float* yarray = reinterpret_cast<float*>(&ynormalized);
float* zarray = reinterpret_cast<float*>(&znormalized);
e1.x = xarray[0];
e2.x = xarray[1];
e3.x = xarray[2];
e4.x = xarray[3];
e1.y = yarray[0];
e2.y = yarray[1];
e3.y = yarray[2];
e4.y = yarray[3];
e1.z = zarray[0];
e2.z = zarray[1];
e3.z = zarray[2];
e4.z = zarray[3];
}
for (; i < vec_size; ++i) {
vec[i].FastNormalize();
}
}
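// With the array-of-structs layout above, much of the work is the scalar gather
// (_mm_setr_ps) and the scatter of components. The Fpt3Container overloads below
// avoid that by loading whole component vectors straight from the SoA arrays;
// the AVX version is compiled when __AVX__ is defined, the SSE version otherwise.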
#ifdef __AVX__ // __AVX__
void normalize(Fpt3Container& vec) {
size_t vec_size = vec.size();
size_t parent_size = vec_size & -8ULL; // largest multiple of 8 that fits in vec_size
size_t i = 0;
for (; i < parent_size; i += 8) {
// std::vector storage is not guaranteed to be 32-byte aligned, so use unaligned loads
__m256 xinputs = _mm256_loadu_ps(vec.getXptr() + i);
__m256 yinputs = _mm256_loadu_ps(vec.getYptr() + i);
__m256 zinputs = _mm256_loadu_ps(vec.getZptr() + i);
__m256 xsquared = _mm256_mul_ps(xinputs, xinputs);
__m256 ysquared = _mm256_mul_ps(yinputs, yinputs);
__m256 zsquared = _mm256_mul_ps(zinputs, zinputs);
__m256 sum = _mm256_add_ps(xsquared, _mm256_add_ps(ysquared, zsquared));
__m256 recip;
#ifdef FPT3_FAST
recip = _mm256_rsqrt_ps(sum);
#else
{
__m256 sqrt = _mm256_sqrt_ps(sum);
__m256 one = _mm256_set1_ps(1.0f);
recip = _mm256_div_ps(one, sqrt);
}
#endif
__m256 min = _mm256_set1_ps(0.00000001f);
// _mm256_blendv_ps is part of AVX itself, so no SSE4.1 fallback is needed here:
// where recip dropped below min, substitute min
recip = _mm256_blendv_ps(recip, min, _mm256_cmp_ps(min, recip, _CMP_GT_OS));
__m256 xnormalized = _mm256_mul_ps(recip, xinputs);
__m256 ynormalized = _mm256_mul_ps(recip, yinputs);
__m256 znormalized = _mm256_mul_ps(recip, zinputs);
_mm256_storeu_ps(vec.getXptr() + i, xnormalized);
_mm256_storeu_ps(vec.getYptr() + i, ynormalized);
_mm256_storeu_ps(vec.getZptr() + i, znormalized);
}
for (; i < vec_size; ++i) {
auto f = Fpt3{vec.getXptr()[i], vec.getYptr()[i], vec.getZptr()[i]};
f.FastNormalize();
*(vec.getXptr() + i) = f.x;
*(vec.getYptr() + i) = f.y;
*(vec.getZptr() + i) = f.z;
}
}
#else //! __AVX__
void normalize(Fpt3Container& vec) {
size_t vec_size = vec.size();
size_t parent_size = vec_size & -4ULL;
size_t i = 0;
for (; i < parent_size; i += 4) { // four floats per __m128, so advance four points per iteration
__m128 xinputs = _mm_load_ps(vec.getXptr() + i);
__m128 yinputs = _mm_load_ps(vec.getYptr() + i);
__m128 zinputs = _mm_load_ps(vec.getZptr() + i);
__m128 xsquared = _mm_mul_ps(xinputs, xinputs);
__m128 ysquared = _mm_mul_ps(yinputs, yinputs);
__m128 zsquared = _mm_mul_ps(zinputs, zinputs);
__m128 sum = _mm_add_ps(xsquared, _mm_add_ps(ysquared, zsquared));
__m128 recip;
#ifdef FPT3_FAST
recip = _mm_rsqrt_ps(sum);
#else
{
__m128 sqrt = _mm_sqrt_ps(sum);
__m128 one = _mm_set1_ps(1.0f);
recip = _mm_div_ps(one, sqrt);
}
#endif
__m128 min = _mm_set1_ps(0.00000001f);
#ifdef __SSE4_1__
// where recip dropped below min, substitute min
recip = _mm_blendv_ps(recip, min, _mm_cmpgt_ps(min, recip));
#else
{
// emulate blendv with a mask select: (mask & min) | (~mask & recip)
__m128 mask = _mm_cmpgt_ps(min, recip);
recip = _mm_or_ps(_mm_and_ps(mask, min), _mm_andnot_ps(mask, recip));
}
#endif
__m128 xnormalized = _mm_mul_ps(recip, xinputs);
__m128 ynormalized = _mm_mul_ps(recip, yinputs);
__m128 znormalized = _mm_mul_ps(recip, zinputs);
_mm_store_ps(vec.getXptr() + i, xnormalized);
_mm_store_ps(vec.getYptr() + i, ynormalized);
_mm_store_ps(vec.getZptr() + i, znormalized);
}
for (; i < vec_size; ++i) {
auto f = Fpt3{vec.getXptr()[i], vec.getYptr()[i], vec.getZptr()[i]};
f.FastNormalize();
*(vec.getXptr() + i) = f.x;
*(vec.getYptr() + i) = f.y;
*(vec.getZptr() + i) = f.z;
}
}
#endif // __AVX__
#include <chrono>
#include <iostream>
template <class Func, class... Args,
typename std::enable_if_t<
std::is_same<decltype(std::declval<Func>()(std::declval<Args>()...)), void>::value,
int> = 0>
auto time_function(size_t loops, Func&& f, Args&&... a) -> double {
auto start = std::chrono::high_resolution_clock::now();
for (size_t i = 0; i < loops; ++i) {
f(std::forward<Args>(a)...);
}
auto end = std::chrono::high_resolution_clock::now();
return std::chrono::duration<double, std::milli>(end - start).count();
}
template <class Func, class... Args,
typename std::enable_if_t<
!std::is_same<decltype(std::declval<Func>()(std::declval<Args>()...)), void>::value,
int> = 0>
auto time_function(size_t loops, Func&& f, Args&&... a)
-> std::pair<double, decltype(f(std::forward<Args>(a)...))> {
auto start = std::chrono::high_resolution_clock::now();
for (size_t i = 0; i < loops - 1; ++i) {
f(std::forward<Args>(a)...);
}
auto ans = f(std::forward<Args>(a)...);
auto end = std::chrono::high_resolution_clock::now();
return std::make_pair(std::chrono::duration<double, std::milli>(end - start).count(), ans);
}
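// Example use of the two time_function overloads (hypothetical calls, only to
// illustrate the return types): a void callable yields the elapsed time in
// milliseconds, a non-void callable yields a pair of the time and the last result.
//   double ms = time_function(10, [] { /* void work */ });
//   auto timed = time_function(10, [] { return 42; }); // timed.first = ms, timed.second = 42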
void time_slow(size_t loops) {
Fpt3 f;
volatile int x = 0;
for (size_t i = 0; i < loops; ++i) {
f = Fpt3{static_cast<float>(i), static_cast<float>(i + 1), static_cast<float>(i + 2)};
f.Normalize();
x = f.x;
}
}
void time_fast(size_t loops) {
Fpt3 f;
volatile int x = 0;
for (size_t i = 0; i < loops; ++i) {
f = Fpt3{static_cast<float>(i), static_cast<float>(i + 1), static_cast<float>(i + 2)};
f.FastNormalize();
x = f.x;
}
}
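// time_slow()/time_fast() store each result into a volatile so the compiler
// cannot optimize the normalization away; main() drives them through time_function.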
#include <algorithm>
#include <cassert>
#include <iomanip>
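// main() prints the scalar and SSE results for one point, times the single-point
// loops, checks that the container path matches FastNormalize(), and then times
// the three batch variants on the same data.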
int main() {
Fpt3 f;
f = Fpt3{1.0, 2.0, 3.0};
f.Normalize();
std::cerr << f.x << ", " << f.y << ", " << f.z << '\n';
f = Fpt3{1.0, 2.0, 3.0};
f.FastNormalize();
std::cerr << f.x << ", " << f.y << ", " << f.z << '\n';
constexpr size_t iterations = 100'000'000;
auto slow_time = time_function(1, time_slow, iterations);
std::cerr << std::fixed << std::setprecision(2) << slow_time << "ms "
<< slow_time / iterations * 1'000'000 << "ns per element\n";
auto fast_time = time_function(1, time_fast, iterations);
std::cerr << std::fixed << std::setprecision(2) << fast_time << "ms "
<< fast_time / iterations * 1'000'000 << "ns per element\n";
std::vector<Fpt3> vec_test(4, Fpt3{1.0, 2.0, 3.0});
auto vec_test2 = Fpt3Container{vec_test};
normalize(vec_test2);
vec_test = std::vector<Fpt3>{vec_test2};
assert(vec_test[0].x == f.x && vec_test[0].y == f.y && vec_test[0].z == f.z);
assert(vec_test[1].x == f.x && vec_test[1].y == f.y && vec_test[1].z == f.z);
assert(vec_test[2].x == f.x && vec_test[2].y == f.y && vec_test[2].z == f.z);
assert(vec_test[3].x == f.x && vec_test[3].y == f.y && vec_test[3].z == f.z);
std::vector<Fpt3> array_test(iterations, Fpt3{1.0, 2.0, 3.0});
auto slow_array = time_function(1, slow_normalize, array_test);
std::cerr << std::fixed << std::setprecision(2) << slow_array << "ms "
<< slow_array / iterations * 1'000'000 << "ns per element\n";
std::fill(array_test.begin(), array_test.end(), Fpt3{1.0, 2.0, 3.0});
auto fast_array = time_function(1, [&]() { normalize(array_test); });
std::cerr << std::fixed << std::setprecision(2) << fast_array << "ms "
<< fast_array / iterations * 1'000'000 << "ns per element\n";
std::fill(array_test.begin(), array_test.end(), Fpt3{1.0, 2.0, 3.0});
Fpt3Container fpt3c{array_test};
auto inline_array = time_function(1, [&]() { normalize(fpt3c); });
std::cerr << std::fixed << std::setprecision(2) << inline_array << "ms "
<< inline_array / iterations * 1'000'000 << "ns per element\n";
}