Created
October 10, 2016 18:47
-
-
Save cwfitzgerald/a161182c46104db3b8caf45c6ae15726 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <tmmintrin.h> | |
#include <immintrin.h> | |
#include <smmintrin.h> | |
void Fpt3::FastNormalize() { | |
__m128 inputs = _mm_setr_ps(x, y, z, 0); | |
__m128 squared = _mm_mul_ps(inputs, inputs); | |
__m128 sum; | |
#ifdef __SSE3__ | |
sum = _mm_hadd_ps(squared, squared); | |
sum = _mm_hadd_ps(sum, sum); | |
#else | |
{ | |
__m128 shuf = _mm_shuffle_ps(squared, squared, _MM_SHUFFLE(2, 3, 0, 1)); | |
__m128 sums = _mm_add_ps(squared, shuf); | |
shuf = _mm_movehl_ps(shuf, sums); | |
sums = _mm_add_ss(sums, shuf); | |
sum = _mm_shuffle_ps(sums, sums, _MM_SHUFFLE(0, 0, 0, 0)); | |
} | |
#endif | |
__m128 recip; | |
#ifdef FPT3_FAST | |
recip = _mm_rsqrt_ps(sum); | |
#else | |
{ | |
__m128 sqrt = _mm_sqrt_ps(sum); | |
__m128 one = _mm_set1_ps(1.0f); | |
recip = _mm_div_ps(one, sqrt); | |
} | |
#endif | |
__m128 min = _mm_set1_ps(0.00000001f); | |
#ifdef __SSE4_1__ | |
recip = _mm_blendv_ps(recip, min, _mm_cmpgt_ps(min, recip)); | |
#else | |
recip = _mm_add_ps(recip,_mm_mul_ps(min, _mm_and_ps(_mm_set1_ps(1),_mm_cmplt_ps(min, recip)))); | |
#endif | |
__m128 normalized = _mm_mul_ps(inputs, recip); | |
float* normalized_ptr = reinterpret_cast<float*>(&normalized); | |
x = normalized_ptr[0]; | |
y = normalized_ptr[1]; | |
z = normalized_ptr[2]; | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <vector> | |
#include <tmmintrin.h> | |
#include <immintrin.h> | |
#include <smmintrin.h> | |
void normalize(std::vector<Fpt3>& vec) { | |
static_assert(sizeof(Fpt3) == 12, ""); | |
size_t vec_size = vec.size(); | |
size_t i = 0; | |
for (; i < vec_size; i += 4) { | |
auto&& e1 = vec[i + 0]; | |
auto&& e2 = vec[i + 1]; | |
auto&& e3 = vec[i + 2]; | |
auto&& e4 = vec[i + 3]; | |
__m128 xinputs = _mm_setr_ps(e1.x, e2.x, e3.x, e4.x); | |
__m128 yinputs = _mm_setr_ps(e1.y, e2.y, e3.y, e4.y); | |
__m128 zinputs = _mm_setr_ps(e1.z, e2.z, e3.z, e4.z); | |
__m128 xsquared = _mm_mul_ps(xinputs, xinputs); | |
__m128 ysquared = _mm_mul_ps(yinputs, yinputs); | |
__m128 zsquared = _mm_mul_ps(zinputs, zinputs); | |
__m128 sum = _mm_add_ps(xsquared, _mm_add_ps(ysquared, zsquared)); | |
__m128 recip; | |
#ifdef FPT3_FAST | |
recip = _mm_rsqrt_ps(sum); | |
#else | |
{ | |
__m128 sqrt = _mm_sqrt_ps(sum); | |
__m128 one = _mm_set1_ps(1.0f); | |
recip = _mm_div_ps(one, sqrt); | |
} | |
#endif | |
__m128 min = _mm_set1_ps(0.00000001f); | |
#ifdef __SSE4_1__ | |
recip = _mm_blendv_ps(recip, min, _mm_cmpgt_ps(min, recip)); | |
#else | |
recip = _mm_add_ps(recip,_mm_mul_ps(min, _mm_and_ps(_mm_set1_ps(1),_mm_cmplt_ps(min, recip)))); | |
#endif | |
__m128 xnormalized = _mm_mul_ps(recip, xinputs); | |
__m128 ynormalized = _mm_mul_ps(recip, yinputs); | |
__m128 znormalized = _mm_mul_ps(recip, zinputs); | |
float* xarray = reinterpret_cast<float*>(&xnormalized); | |
float* yarray = reinterpret_cast<float*>(&ynormalized); | |
float* zarray = reinterpret_cast<float*>(&znormalized); | |
e1.x = xarray[0]; e2.x = xarray[1]; e3.x = xarray[2]; e4.x = xarray[3]; | |
e1.y = yarray[0]; e2.y = yarray[1]; e3.y = yarray[2]; e4.y = yarray[3]; | |
e1.z = zarray[0]; e2.z = zarray[1]; e3.z = zarray[2]; e4.z = zarray[3]; | |
} | |
for (; i < vec_size; ++i) { | |
vec[i].FastNormalize(); | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <cmath> | |
#include <immintrin.h> | |
#include <tmmintrin.h> | |
#include <vector> | |
class Fpt3 { | |
public: | |
volatile float x; | |
volatile float y; | |
volatile float z; | |
void Normalize(); | |
void FastNormalize(); | |
}; | |
void Fpt3::Normalize() { | |
float t = sqrt(x * x + y * y + z * z); | |
if (t < 0.00000001) | |
t = 0.00000001f; // yea not gonna normalize correctly but at least it won't crash | |
t = 1 / t; | |
x *= t; | |
y *= t; | |
z *= t; | |
} | |
void Fpt3::FastNormalize() { | |
__m128 inputs = _mm_setr_ps(x, y, z, 0); | |
__m128 squared = _mm_mul_ps(inputs, inputs); | |
__m128 sum; | |
#ifdef __SSE3__ | |
sum = _mm_hadd_ps(squared, squared); | |
sum = _mm_hadd_ps(sum, sum); | |
#else | |
{ | |
__m128 shuf = _mm_shuffle_ps(squared, squared, _MM_SHUFFLE(2, 3, 0, 1)); | |
__m128 sums = _mm_add_ps(squared, shuf); | |
shuf = _mm_movehl_ps(shuf, sums); | |
sums = _mm_add_ss(sums, shuf); | |
sum = _mm_shuffle_ps(sums, sums, _MM_SHUFFLE(0, 0, 0, 0)); | |
} | |
#endif | |
__m128 recip; | |
#ifdef FPT3_FAST | |
recip = _mm_rsqrt_ps(sum); | |
#else | |
{ | |
__m128 sqrt = _mm_sqrt_ps(sum); | |
__m128 one = _mm_set1_ps(1.0f); | |
recip = _mm_div_ps(one, sqrt); | |
} | |
#endif | |
__m128 min = _mm_set1_ps(0.00000001f); | |
#ifdef __SSE4_1__ | |
recip = _mm_blendv_ps(recip, min, _mm_cmpgt_ps(min, recip)); | |
#else | |
recip = _mm_add_ps(recip,_mm_mul_ps(min, _mm_and_ps(_mm_set1_ps(1),_mm_cmplt_ps(min, recip)))); | |
#endif | |
__m128 normalized = _mm_mul_ps(inputs, recip); | |
float* normalized_ptr = reinterpret_cast<float*>(&normalized); | |
x = normalized_ptr[0]; | |
y = normalized_ptr[1]; | |
z = normalized_ptr[2]; | |
} | |
void slow_normalize(std::vector<Fpt3>& vec) { | |
for (size_t i = 0; i < vec.size(); ++i) { | |
vec[i].FastNormalize(); | |
} | |
} | |
void normalize(std::vector<Fpt3>& vec) { | |
static_assert(sizeof(Fpt3) == 12, ""); | |
size_t vec_size = vec.size(); | |
size_t i = 0; | |
for (; i < vec_size; i += 4) { | |
auto&& e1 = vec[i + 0]; | |
auto&& e2 = vec[i + 1]; | |
auto&& e3 = vec[i + 2]; | |
auto&& e4 = vec[i + 3]; | |
__m128 xinputs = _mm_setr_ps(e1.x, e2.x, e3.x, e4.x); | |
__m128 yinputs = _mm_setr_ps(e1.y, e2.y, e3.y, e4.y); | |
__m128 zinputs = _mm_setr_ps(e1.z, e2.z, e3.z, e4.z); | |
__m128 xsquared = _mm_mul_ps(xinputs, xinputs); | |
__m128 ysquared = _mm_mul_ps(yinputs, yinputs); | |
__m128 zsquared = _mm_mul_ps(zinputs, zinputs); | |
__m128 sum = _mm_add_ps(xsquared, _mm_add_ps(ysquared, zsquared)); | |
__m128 recip; | |
#ifdef FPT3_FAST | |
recip = _mm_rsqrt_ps(sum); | |
#else | |
{ | |
__m128 sqrt = _mm_sqrt_ps(sum); | |
__m128 one = _mm_set1_ps(1.0f); | |
recip = _mm_div_ps(one, sqrt); | |
} | |
#endif | |
__m128 min = _mm_set1_ps(0.00000001f); | |
#ifdef __SSE4_1__ | |
recip = _mm_blendv_ps(recip, min, _mm_cmpgt_ps(min, recip)); | |
#else | |
recip = _mm_add_ps(recip,_mm_mul_ps(min, _mm_and_ps(_mm_set1_ps(1),_mm_cmplt_ps(min, recip)))); | |
#endif | |
__m128 xnormalized = _mm_mul_ps(recip, xinputs); | |
__m128 ynormalized = _mm_mul_ps(recip, yinputs); | |
__m128 znormalized = _mm_mul_ps(recip, zinputs); | |
float* xarray = reinterpret_cast<float*>(&xnormalized); | |
float* yarray = reinterpret_cast<float*>(&ynormalized); | |
float* zarray = reinterpret_cast<float*>(&znormalized); | |
e1.x = xarray[0]; e2.x = xarray[1]; e3.x = xarray[2]; e4.x = xarray[3]; | |
e1.y = yarray[0]; e2.y = yarray[1]; e3.y = yarray[2]; e4.y = yarray[3]; | |
e1.z = zarray[0]; e2.z = zarray[1]; e3.z = zarray[2]; e4.z = zarray[3]; | |
} | |
for (; i < vec_size; ++i) { | |
vec[i].FastNormalize(); | |
} | |
} | |
#include <chrono> | |
#include <iostream> | |
template <class Func, class... Args, | |
typename std::enable_if_t<std::is_same<decltype(std::declval<Func>()(std::declval<Args>()...)), void>::value, | |
int> = 0> | |
auto time_function(size_t loops, Func&& f, Args&&... a) -> double { | |
auto start = std::chrono::high_resolution_clock::now(); | |
for (size_t i = 0; i < loops; ++i) { | |
f(std::forward<Args>(a)...); | |
} | |
auto end = std::chrono::high_resolution_clock::now(); | |
return (end - start).count() / 1'000'000.0; | |
} | |
template <class Func, class... Args, | |
typename std::enable_if_t<!std::is_same<decltype(std::declval<Func>()(std::declval<Args>()...)), void>::value, | |
int> = 0> | |
auto time_function(size_t loops, Func&& f, Args&&... a) -> std::pair<double, decltype(f(std::forward<Args>(a)...))> { | |
auto start = std::chrono::high_resolution_clock::now(); | |
for (size_t i = 0; i < loops - 1; ++i) { | |
f(std::forward<Args>(a)...); | |
} | |
auto ans = f(std::forward<Args>(a)...); | |
auto end = std::chrono::high_resolution_clock::now(); | |
return std::make_pair((end - start).count() / 1'000'000.0, ans); | |
} | |
void time_slow(size_t loops) { | |
Fpt3 f; | |
for (size_t i = 0; i < loops; ++i) { | |
f = Fpt3{1.0, 2.0, 3.0}; | |
f.Normalize(); | |
} | |
} | |
void time_fast(size_t loops) { | |
Fpt3 f; | |
for (size_t i = 0; i < loops; ++i) { | |
f = Fpt3{1.0, 2.0, 3.0}; | |
f.FastNormalize(); | |
} | |
} | |
#include <algorithm> | |
#include <cassert> | |
int main() { | |
Fpt3 f; | |
f = Fpt3{1.0, 2.0, 3.0}; | |
f.Normalize(); | |
std::cerr << f.x << ", " << f.y << ", " << f.z << '\n'; | |
f = Fpt3{1.0, 2.0, 3.0}; | |
f.FastNormalize(); | |
std::cerr << f.x << ", " << f.y << ", " << f.z << '\n'; | |
auto slow_time = time_function(1, time_slow, 100'000'000); | |
std::cerr << slow_time << "ms\n"; | |
auto fast_time = time_function(1, time_fast, 100'000'000); | |
std::cerr << fast_time << "ms " << slow_time/fast_time << "x faster per element\n"; | |
std::vector<Fpt3> vec_test(4, Fpt3{1.0, 2.0, 3.0}); | |
normalize(vec_test); | |
assert(vec_test[0].x == f.x && vec_test[0].y == f.y && vec_test[0].z == f.z); | |
assert(vec_test[1].x == f.x && vec_test[1].y == f.y && vec_test[1].z == f.z); | |
assert(vec_test[2].x == f.x && vec_test[2].y == f.y && vec_test[2].z == f.z); | |
assert(vec_test[3].x == f.x && vec_test[3].y == f.y && vec_test[3].z == f.z); | |
std::vector<Fpt3> array_test(100'000'000, Fpt3{1.0, 2.0, 3.0}); | |
auto slow_array = time_function(1, slow_normalize, array_test); | |
std::cerr << slow_array << "ms\n"; | |
std::fill(array_test.begin(), array_test.end(), Fpt3{1.0, 2.0, 3.0}); | |
auto fast_array = time_function(1, normalize, array_test); | |
std::cerr << fast_array << "ms " << slow_time/fast_array << "x faster per element\n"; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment