Created
October 11, 2016 19:55
-
-
Save cwfitzgerald/97c7773422bdb1251d68ef6eb77e2046 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class Fpt3Container { | |
private: | |
std::vector<float> x; | |
std::vector<float> y; | |
std::vector<float> z; | |
public: | |
Fpt3Container(std::vector<Fpt3>& val) { | |
x.reserve(val.size()); | |
y.reserve(val.size()); | |
z.reserve(val.size()); | |
for (size_t i = 0; i < val.size(); ++i) { | |
x.push_back(val[i].x); | |
y.push_back(val[i].y); | |
z.push_back(val[i].z); | |
} | |
} | |
explicit operator std::vector<Fpt3>() { | |
std::vector<Fpt3> ret; | |
ret.reserve(x.size()); | |
for (size_t i = 0; i < x.size(); ++i) { | |
ret.push_back(Fpt3{x[i], y[i], z[i]}); | |
} | |
return ret; | |
}; | |
void push_back(const Fpt3& val) { | |
x.push_back(val.x); | |
y.push_back(val.y); | |
z.push_back(val.z); | |
} | |
void clear() { | |
x.clear(); | |
y.clear(); | |
z.clear(); | |
} | |
void pop_back() { | |
x.pop_back(); | |
y.pop_back(); | |
z.pop_back(); | |
} | |
size_t size() { | |
return x.size(); | |
} | |
float* getXptr() { | |
return &x[0]; | |
} | |
float* getYptr() { | |
return &y[0]; | |
} | |
float* getZptr() { | |
return &z[0]; | |
} | |
}; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <cmath> | |
#include <vector> | |
#include <immintrin.h> | |
#include <tmmintrin.h> | |
#ifdef __AVX__ | |
#include <xmmintrin.h> | |
#endif | |
class Fpt3 { | |
public: | |
float x; | |
float y; | |
float z; | |
void Normalize(); | |
void FastNormalize(); | |
}; | |
class Fpt3Container { | |
private: | |
std::vector<float> x; | |
std::vector<float> y; | |
std::vector<float> z; | |
public: | |
Fpt3Container(std::vector<Fpt3>& val) { | |
x.reserve(val.size()); | |
y.reserve(val.size()); | |
z.reserve(val.size()); | |
for (size_t i = 0; i < val.size(); ++i) { | |
x.push_back(val[i].x); | |
y.push_back(val[i].y); | |
z.push_back(val[i].z); | |
} | |
} | |
explicit operator std::vector<Fpt3>() { | |
std::vector<Fpt3> ret; | |
ret.reserve(x.size()); | |
for (size_t i = 0; i < x.size(); ++i) { | |
ret.push_back(Fpt3{x[i], y[i], z[i]}); | |
} | |
return ret; | |
}; | |
void push_back(const Fpt3& val) { | |
x.push_back(val.x); | |
y.push_back(val.y); | |
z.push_back(val.z); | |
} | |
void clear() { | |
x.clear(); | |
y.clear(); | |
z.clear(); | |
} | |
void pop_back() { | |
x.pop_back(); | |
y.pop_back(); | |
z.pop_back(); | |
} | |
size_t size() { | |
return x.size(); | |
} | |
float* getXptr() { | |
return &x[0]; | |
} | |
float* getYptr() { | |
return &y[0]; | |
} | |
float* getZptr() { | |
return &z[0]; | |
} | |
}; | |
void Fpt3::Normalize() { | |
float t = sqrt(x * x + y * y + z * z); | |
if (t < 0.00000001) | |
t = 0.00000001f; // yea not gonna normalize correctly but at least it won't crash | |
t = 1 / t; | |
x *= t; | |
y *= t; | |
z *= t; | |
} | |
void Fpt3::FastNormalize() { | |
__m128 inputs = _mm_setr_ps(x, y, z, 0); | |
__m128 squared = _mm_mul_ps(inputs, inputs); | |
__m128 sum; | |
#ifdef __SSE3__ | |
sum = _mm_hadd_ps(squared, squared); | |
sum = _mm_hadd_ps(sum, sum); | |
#else | |
{ | |
__m128 shuf = _mm_shuffle_ps(squared, squared, _MM_SHUFFLE(2, 3, 0, 1)); | |
__m128 sums = _mm_add_ps(squared, shuf); | |
shuf = _mm_movehl_ps(shuf, sums); | |
sums = _mm_add_ss(sums, shuf); | |
sum = _mm_shuffle_ps(sums, sums, _MM_SHUFFLE(0, 0, 0, 0)); | |
} | |
#endif | |
__m128 recip; | |
#ifdef FPT3_FAST | |
recip = _mm_rsqrt_ps(sum); | |
#else | |
{ | |
__m128 sqrt = _mm_sqrt_ps(sum); | |
__m128 one = _mm_set1_ps(1.0f); | |
recip = _mm_div_ps(one, sqrt); | |
} | |
#endif | |
__m128 min = _mm_set1_ps(0.00000001f); | |
#ifdef __SSE4_1__ | |
recip = _mm_blendv_ps(recip, min, _mm_cmpgt_ps(min, recip)); | |
#else | |
recip = | |
_mm_add_ps(recip, _mm_mul_ps(min, _mm_and_ps(_mm_set1_ps(1), _mm_cmplt_ps(min, recip)))); | |
#endif | |
__m128 normalized = _mm_mul_ps(inputs, recip); | |
float* normalized_ptr = reinterpret_cast<float*>(&normalized); | |
x = normalized_ptr[0]; | |
y = normalized_ptr[1]; | |
z = normalized_ptr[2]; | |
} | |
void slow_normalize(std::vector<Fpt3>& vec) { | |
for (size_t i = 0; i < vec.size(); ++i) { | |
vec[i].FastNormalize(); | |
} | |
} | |
void normalize(std::vector<Fpt3>& vec) { | |
static_assert(sizeof(Fpt3) == 12, ""); | |
size_t vec_size = vec.size(); | |
size_t parent_size = vec_size & -4ULL; | |
size_t i = 0; | |
for (; i < parent_size; i += 4) { | |
auto&& e1 = vec[i + 0]; | |
auto&& e2 = vec[i + 1]; | |
auto&& e3 = vec[i + 2]; | |
auto&& e4 = vec[i + 3]; | |
__m128 xinputs = _mm_setr_ps(e1.x, e2.x, e3.x, e4.x); | |
__m128 yinputs = _mm_setr_ps(e1.y, e2.y, e3.y, e4.y); | |
__m128 zinputs = _mm_setr_ps(e1.z, e2.z, e3.z, e4.z); | |
__m128 xsquared = _mm_mul_ps(xinputs, xinputs); | |
__m128 ysquared = _mm_mul_ps(yinputs, yinputs); | |
__m128 zsquared = _mm_mul_ps(zinputs, zinputs); | |
__m128 sum = _mm_add_ps(xsquared, _mm_add_ps(ysquared, zsquared)); | |
__m128 recip; | |
#ifdef FPT3_FAST | |
recip = _mm_rsqrt_ps(sum); | |
#else | |
{ | |
__m128 sqrt = _mm_sqrt_ps(sum); | |
__m128 one = _mm_set1_ps(1.0f); | |
recip = _mm_div_ps(one, sqrt); | |
} | |
#endif | |
__m128 min = _mm_set1_ps(0.00000001f); | |
#ifdef __SSE4_1__ | |
recip = _mm_blendv_ps(recip, min, _mm_cmpgt_ps(min, recip)); | |
#else | |
recip = _mm_add_ps(recip, | |
_mm_mul_ps(min, _mm_and_ps(_mm_set1_ps(1), _mm_cmplt_ps(min, recip)))); | |
#endif | |
__m128 xnormalized = _mm_mul_ps(recip, xinputs); | |
__m128 ynormalized = _mm_mul_ps(recip, yinputs); | |
__m128 znormalized = _mm_mul_ps(recip, zinputs); | |
float* xarray = reinterpret_cast<float*>(&xnormalized); | |
float* yarray = reinterpret_cast<float*>(&ynormalized); | |
float* zarray = reinterpret_cast<float*>(&znormalized); | |
e1.x = xarray[0]; | |
e2.x = xarray[1]; | |
e3.x = xarray[2]; | |
e4.x = xarray[3]; | |
e1.y = yarray[0]; | |
e2.y = yarray[1]; | |
e3.y = yarray[2]; | |
e4.y = yarray[3]; | |
e1.z = zarray[0]; | |
e2.z = zarray[1]; | |
e3.z = zarray[2]; | |
e4.z = zarray[3]; | |
} | |
for (; i < vec_size; ++i) { | |
vec[i].FastNormalize(); | |
} | |
} | |
#ifdef __AVX__ // __AVX__ | |
void normalize(Fpt3Container& vec) { | |
size_t vec_size = vec.size(); | |
size_t parent_size = vec_size & -9ULL; | |
size_t i = 0; | |
for (; i < parent_size; i += 8) { | |
__m256 xinputs = _mm256_load_ps(vec.getXptr() + i); | |
__m256 yinputs = _mm256_load_ps(vec.getYptr() + i); | |
__m256 zinputs = _mm256_load_ps(vec.getZptr() + i); | |
__m256 xsquared = _mm256_mul_ps(xinputs, xinputs); | |
__m256 ysquared = _mm256_mul_ps(yinputs, yinputs); | |
__m256 zsquared = _mm256_mul_ps(zinputs, zinputs); | |
__m256 sum = _mm256_add_ps(xsquared, _mm256_add_ps(ysquared, zsquared)); | |
__m256 recip; | |
#ifdef FPT3_FAST | |
recip = _mm256_rsqrt_ps(sum); | |
#else | |
{ | |
__m256 sqrt = _mm256_sqrt_ps(sum); | |
__m256 one = _mm256_set1_ps(1.0f); | |
recip = _mm256_div_ps(one, sqrt); | |
} | |
#endif | |
__m256 min = _mm256_set1_ps(0.00000001f); | |
#ifdef __SSE4_1__ | |
recip = _mm256_blendv_ps(recip, min, _mm256_cmp_ps(min, recip, _CMP_GT_OS)); | |
#else | |
recip = _mm256_add_ps( | |
recip, _mm256_mul_ps(min, _mm256_and_ps(_mm256_set1_ps(1), | |
_mm256_cmp_ps(min, recip, _CMP_LE_OS)))); | |
#endif | |
__m256 xnormalized = _mm256_mul_ps(recip, xinputs); | |
__m256 ynormalized = _mm256_mul_ps(recip, yinputs); | |
__m256 znormalized = _mm256_mul_ps(recip, zinputs); | |
_mm256_store_ps(vec.getXptr() + i, xnormalized); | |
_mm256_store_ps(vec.getYptr() + i, ynormalized); | |
_mm256_store_ps(vec.getZptr() + i, znormalized); | |
} | |
for (; i < vec_size; ++i) { | |
auto f = Fpt3{vec.getXptr()[i], vec.getYptr()[i], vec.getZptr()[i]}; | |
f.FastNormalize(); | |
*(vec.getXptr() + i) = f.x; | |
*(vec.getYptr() + i) = f.y; | |
*(vec.getZptr() + i) = f.z; | |
} | |
} | |
#else //! __AVX__ | |
void normalize(Fpt3Container& vec) { | |
size_t vec_size = vec.size(); | |
size_t parent_size = vec_size & -4ULL; | |
size_t i = 0; | |
for (; i < parent_size; i += 8) { | |
__m128 xinputs = _mm_load_ps(vec.getXptr() + i); | |
__m128 yinputs = _mm_load_ps(vec.getYptr() + i); | |
__m128 zinputs = _mm_load_ps(vec.getZptr() + i); | |
__m128 xsquared = _mm_mul_ps(xinputs, xinputs); | |
__m128 ysquared = _mm_mul_ps(yinputs, yinputs); | |
__m128 zsquared = _mm_mul_ps(zinputs, zinputs); | |
__m128 sum = _mm_add_ps(xsquared, _mm_add_ps(ysquared, zsquared)); | |
__m128 recip; | |
#ifdef FPT3_FAST | |
recip = _mm_rsqrt_ps(sum); | |
#else | |
{ | |
__m128 sqrt = _mm_sqrt_ps(sum); | |
__m128 one = _mm_set1_ps(1.0f); | |
recip = _mm_div_ps(one, sqrt); | |
} | |
#endif | |
__m128 min = _mm_set1_ps(0.00000001f); | |
#ifdef __SSE4_1__ | |
recip = _mm_blendv_ps(recip, min, _mm_cmpgt_ps(min, recip)); | |
#else | |
recip = _mm_add_ps(recip, | |
_mm_mul_ps(min, _mm_and_ps(_mm_set1_ps(1), _mm_cmplt_ps(min, recip)))); | |
#endif | |
__m128 xnormalized = _mm_mul_ps(recip, xinputs); | |
__m128 ynormalized = _mm_mul_ps(recip, yinputs); | |
__m128 znormalized = _mm_mul_ps(recip, zinputs); | |
_mm_store_ps(vec.getXptr() + i, xnormalized); | |
_mm_store_ps(vec.getYptr() + i, ynormalized); | |
_mm_store_ps(vec.getZptr() + i, znormalized); | |
} | |
for (; i < vec_size; ++i) { | |
auto f = Fpt3{vec.getXptr()[i], vec.getYptr()[i], vec.getZptr()[i]}; | |
f.FastNormalize(); | |
*(vec.getXptr() + i) = f.x; | |
*(vec.getYptr() + i) = f.y; | |
*(vec.getZptr() + i) = f.z; | |
} | |
} | |
#endif // __AVX__ | |
#include <chrono> | |
#include <iostream> | |
template <class Func, class... Args, | |
typename std::enable_if_t< | |
std::is_same<decltype(std::declval<Func>()(std::declval<Args>()...)), void>::value, | |
int> = 0> | |
auto time_function(size_t loops, Func&& f, Args&&... a) -> double { | |
auto start = std::chrono::high_resolution_clock::now(); | |
for (size_t i = 0; i < loops; ++i) { | |
f(std::forward<Args>(a)...); | |
} | |
auto end = std::chrono::high_resolution_clock::now(); | |
return (end - start).count() / 1'000'000.0; | |
} | |
template <class Func, class... Args, | |
typename std::enable_if_t< | |
!std::is_same<decltype(std::declval<Func>()(std::declval<Args>()...)), void>::value, | |
int> = 0> | |
auto time_function(size_t loops, Func&& f, Args&&... a) | |
-> std::pair<double, decltype(f(std::forward<Args>(a)...))> { | |
auto start = std::chrono::high_resolution_clock::now(); | |
for (size_t i = 0; i < loops - 1; ++i) { | |
f(std::forward<Args>(a)...); | |
} | |
auto ans = f(std::forward<Args>(a)...); | |
auto end = std::chrono::high_resolution_clock::now(); | |
return std::make_pair((end - start).count() / 1'000'000.0, ans); | |
} | |
void time_slow(size_t loops) { | |
Fpt3 f; | |
volatile int x = 0; | |
for (size_t i = 0; i < loops; ++i) { | |
f = Fpt3{static_cast<float>(i), static_cast<float>(i + 1), static_cast<float>(i + 2)}; | |
f.Normalize(); | |
x = f.x; | |
} | |
} | |
void time_fast(size_t loops) { | |
Fpt3 f; | |
volatile int x = 0; | |
for (size_t i = 0; i < loops; ++i) { | |
f = Fpt3{static_cast<float>(i), static_cast<float>(i + 1), static_cast<float>(i + 2)}; | |
f.FastNormalize(); | |
x = f.x; | |
} | |
} | |
#include <algorithm> | |
#include <cassert> | |
#include <iomanip> | |
int main() { | |
Fpt3 f; | |
f = Fpt3{1.0, 2.0, 3.0}; | |
f.Normalize(); | |
std::cerr << f.x << ", " << f.y << ", " << f.z << '\n'; | |
f = Fpt3{1.0, 2.0, 3.0}; | |
f.FastNormalize(); | |
std::cerr << f.x << ", " << f.y << ", " << f.z << '\n'; | |
constexpr size_t iterations = 100'000'000; | |
auto slow_time = time_function(1, time_slow, iterations); | |
std::cerr << std::fixed << std::setprecision(2) << slow_time << "ms " | |
<< slow_time / iterations * 1'000'000 << "ns per element\n"; | |
auto fast_time = time_function(1, time_fast, iterations); | |
std::cerr << std::fixed << std::setprecision(2) << fast_time << "ms " | |
<< fast_time / iterations * 1'000'000 << "ns per element\n"; | |
std::vector<Fpt3> vec_test(4, Fpt3{1.0, 2.0, 3.0}); | |
auto vec_test2 = Fpt3Container{vec_test}; | |
normalize(vec_test2); | |
vec_test = std::vector<Fpt3>{vec_test2}; | |
assert(vec_test[0].x == f.x && vec_test[0].y == f.y && vec_test[0].z == f.z); | |
assert(vec_test[1].x == f.x && vec_test[1].y == f.y && vec_test[1].z == f.z); | |
assert(vec_test[2].x == f.x && vec_test[2].y == f.y && vec_test[2].z == f.z); | |
assert(vec_test[3].x == f.x && vec_test[3].y == f.y && vec_test[3].z == f.z); | |
std::vector<Fpt3> array_test(iterations, Fpt3{1.0, 2.0, 3.0}); | |
auto slow_array = time_function(1, slow_normalize, array_test); | |
std::cerr << std::fixed << std::setprecision(2) << slow_array << "ms " | |
<< slow_array / iterations * 1'000'000 << "ns per element\n"; | |
std::fill(array_test.begin(), array_test.end(), Fpt3{1.0, 2.0, 3.0}); | |
auto fast_array = time_function(1, [&]() { normalize(array_test); }); | |
std::cerr << std::fixed << std::setprecision(2) << fast_array << "ms " | |
<< fast_array / iterations * 1'000'000 << "ns per element\n"; | |
std::fill(array_test.begin(), array_test.end(), Fpt3{1.0, 2.0, 3.0}); | |
Fpt3Container fpt3c{array_test}; | |
auto inline_array = time_function(1, [&]() { normalize(fpt3c); }); | |
std::cerr << std::fixed << std::setprecision(2) << inline_array << "ms " | |
<< inline_array / iterations * 1'000'000 << "ns per element\n"; | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#ifdef __AVX__ // __AVX__ | |
void normalize(Fpt3Container& vec) { | |
size_t vec_size = vec.size(); | |
size_t parent_size = vec_size & -9ULL; | |
size_t i = 0; | |
for (; i < parent_size; i += 8) { | |
__m256 xinputs = _mm256_load_ps(vec.getXptr() + i); | |
__m256 yinputs = _mm256_load_ps(vec.getYptr() + i); | |
__m256 zinputs = _mm256_load_ps(vec.getZptr() + i); | |
__m256 xsquared = _mm256_mul_ps(xinputs, xinputs); | |
__m256 ysquared = _mm256_mul_ps(yinputs, yinputs); | |
__m256 zsquared = _mm256_mul_ps(zinputs, zinputs); | |
__m256 sum = _mm256_add_ps(xsquared, _mm256_add_ps(ysquared, zsquared)); | |
__m256 recip; | |
#ifdef FPT3_FAST | |
recip = _mm256_rsqrt_ps(sum); | |
#else | |
{ | |
__m256 sqrt = _mm256_sqrt_ps(sum); | |
__m256 one = _mm256_set1_ps(1.0f); | |
recip = _mm256_div_ps(one, sqrt); | |
} | |
#endif | |
__m256 min = _mm256_set1_ps(0.00000001f); | |
#ifdef __SSE4_1__ | |
recip = _mm256_blendv_ps(recip, min, _mm256_cmp_ps(min, recip, _CMP_GT_OS)); | |
#else | |
recip = _mm256_add_ps( | |
recip, _mm256_mul_ps(min, _mm256_and_ps(_mm256_set1_ps(1), | |
_mm256_cmp_ps(min, recip, _CMP_LE_OS)))); | |
#endif | |
__m256 xnormalized = _mm256_mul_ps(recip, xinputs); | |
__m256 ynormalized = _mm256_mul_ps(recip, yinputs); | |
__m256 znormalized = _mm256_mul_ps(recip, zinputs); | |
_mm256_store_ps(vec.getXptr() + i, xnormalized); | |
_mm256_store_ps(vec.getYptr() + i, ynormalized); | |
_mm256_store_ps(vec.getZptr() + i, znormalized); | |
} | |
for (; i < vec_size; ++i) { | |
auto f = Fpt3{vec.getXptr()[i], vec.getYptr()[i], vec.getZptr()[i]}; | |
f.FastNormalize(); | |
*(vec.getXptr() + i) = f.x; | |
*(vec.getYptr() + i) = f.y; | |
*(vec.getZptr() + i) = f.z; | |
} | |
} | |
#else //! __AVX__ | |
void normalize(Fpt3Container& vec) { | |
size_t vec_size = vec.size(); | |
size_t parent_size = vec_size & -4ULL; | |
size_t i = 0; | |
for (; i < parent_size; i += 8) { | |
__m128 xinputs = _mm_load_ps(vec.getXptr() + i); | |
__m128 yinputs = _mm_load_ps(vec.getYptr() + i); | |
__m128 zinputs = _mm_load_ps(vec.getZptr() + i); | |
__m128 xsquared = _mm_mul_ps(xinputs, xinputs); | |
__m128 ysquared = _mm_mul_ps(yinputs, yinputs); | |
__m128 zsquared = _mm_mul_ps(zinputs, zinputs); | |
__m128 sum = _mm_add_ps(xsquared, _mm_add_ps(ysquared, zsquared)); | |
__m128 recip; | |
#ifdef FPT3_FAST | |
recip = _mm_rsqrt_ps(sum); | |
#else | |
{ | |
__m128 sqrt = _mm_sqrt_ps(sum); | |
__m128 one = _mm_set1_ps(1.0f); | |
recip = _mm_div_ps(one, sqrt); | |
} | |
#endif | |
__m128 min = _mm_set1_ps(0.00000001f); | |
#ifdef __SSE4_1__ | |
recip = _mm_blendv_ps(recip, min, _mm_cmpgt_ps(min, recip)); | |
#else | |
recip = _mm_add_ps(recip, | |
_mm_mul_ps(min, _mm_and_ps(_mm_set1_ps(1), _mm_cmplt_ps(min, recip)))); | |
#endif | |
__m128 xnormalized = _mm_mul_ps(recip, xinputs); | |
__m128 ynormalized = _mm_mul_ps(recip, yinputs); | |
__m128 znormalized = _mm_mul_ps(recip, zinputs); | |
_mm_store_ps(vec.getXptr() + i, xnormalized); | |
_mm_store_ps(vec.getYptr() + i, ynormalized); | |
_mm_store_ps(vec.getZptr() + i, znormalized); | |
} | |
for (; i < vec_size; ++i) { | |
auto f = Fpt3{vec.getXptr()[i], vec.getYptr()[i], vec.getZptr()[i]}; | |
f.FastNormalize(); | |
*(vec.getXptr() + i) = f.x; | |
*(vec.getYptr() + i) = f.y; | |
*(vec.getZptr() + i) = f.z; | |
} | |
} | |
#endif // __AVX__ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment