Last active
November 25, 2021 11:04
-
-
Save bitonic/1ea0142a485ccdf5b3cc50b5b9e31ce1 to your computer and use it in GitHub Desktop.
Vectorized acos and atan2
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Vectorized (SSE and AVX, through Eigen) atan2 and acos. | |
// | |
// Care has been taken to ensure that the vectorized results are identical to the non-vectorized | |
// ones. | |
// | |
// Original functions from <https://developer.download.nvidia.com/cg/index_stdlib.html>, adapted | |
// to work in bulk. Both seem to be very accurate (only tested that error < 0.01 | |
// degrees, which is what I need). | |
#include <Eigen/Core> | |
using namespace Eigen; | |
using namespace std; | |
#if defined(EIGEN_VECTORIZE_SSE) || defined(EIGEN_VECTORIZE_AVX) | |
#define TRIG_APPROX_VECTORIZED | |
#endif | |
namespace trig_approx_utils { | |
// XOR'ing with 0x80000000 negates floats. XOR'ing with 0x0 is identity. | |
template<typename Packet> | |
inline Packet negate_mask(); | |
#ifdef EIGEN_VECTORIZE_SSE | |
template<> | |
inline __m128 negate_mask() { | |
return _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x80000000,0x80000000,0x80000000)); | |
} | |
#endif | |
#ifdef EIGEN_VECTORIZE_AVX | |
template<> | |
inline __m256 negate_mask() { | |
return _mm256_castsi256_ps( | |
_mm256_setr_epi32(0x80000000,0x80000000,0x80000000,0x80000000,0x80000000,0x80000000,0x80000000,0x80000000) | |
); | |
} | |
#endif | |
template<typename Packet> | |
inline Packet cmp_gt(const Packet& a, const Packet& b); | |
#ifdef EIGEN_VECTORIZE_SSE | |
template<> | |
inline __m128 cmp_gt(const __m128& a, const __m128& b) { | |
return _mm_cmpgt_ps(a, b); | |
} | |
#endif | |
#ifdef EIGEN_VECTORIZE_AVX | |
template<> | |
inline __m256 cmp_gt(const __m256& a, const __m256& b) { | |
return _mm256_cmp_ps(a, b, _CMP_GT_OQ); | |
} | |
#endif | |
template<typename Packet> | |
inline Packet cmp_lt(const Packet& a, const Packet& b); | |
#ifdef EIGEN_VECTORIZE_SSE | |
template<> | |
inline __m128 cmp_lt(const __m128& a, const __m128& b) { | |
return _mm_cmplt_ps(a, b); | |
} | |
#endif | |
#ifdef EIGEN_VECTORIZE_AVX | |
template<> | |
inline __m256 cmp_lt(const __m256& a, const __m256& b) { | |
return _mm256_cmp_ps(a, b, _CMP_LT_OQ); | |
} | |
#endif | |
// We need floats, M_PI will not be float since it's lacking the trailing f | |
constexpr float pi = 3.14159265359f; | |
constexpr float pi_halved = 1.57079632679f; | |
} | |
// Handbook of Mathematical Functions | |
// M. Abramowitz and I.A. Stegun, Ed. | |
// Absolute error <= 6.7e-5 | |
struct approx_acos_op { | |
inline float operator()(float x) const { | |
using namespace trig_approx_utils; | |
float negate = static_cast<float>(x < 0); | |
x = abs(x); | |
float ret; | |
ret = -0.0187293f; | |
ret = fma(ret, x, 0.0742610f); | |
ret = fma(ret, x, -0.2121144f); | |
ret = fma(ret, x, 1.5707288f); | |
ret = ret * sqrt(1.0f - x); | |
ret = ret - 2.0f * negate * ret; | |
return fma(negate, pi, ret); | |
} | |
#ifdef TRIG_APPROX_VECTORIZED | |
template<typename Packet> | |
inline const Packet packetOp(const Packet& x0) const { | |
using namespace internal; | |
using namespace trig_approx_utils; | |
// float negate = static_cast<float>(x < 0); | |
Packet negate = cmp_lt(x0, pset1<Packet>(0.0f)); | |
negate = pand(negate, pset1<Packet>(1.0f)); | |
Packet x = pabs(x0); // x = abs(x); | |
Packet ret = pset1<Packet>(-0.0187293f); // float ret = -0.0187293f; | |
// ret = fma(ret, x, 0.0742610f); | |
ret = pmadd(ret, x, pset1<Packet>(0.0742610f)); | |
// ret = fma(ret, x, -0.2121144f); | |
ret = pmadd(ret, x, pset1<Packet>(-0.2121144f)); | |
// ret = fma(ret, x, 1.5707288f); | |
ret = pmadd(ret, x, pset1<Packet>(1.5707288f)); | |
// ret = ret * sqrt(1.0f-x); | |
ret = pmul(ret, psqrt(psub(pset1<Packet>(1.0f), x))); | |
// ret = ret - 2.0f * negate * ret; | |
ret = psub(ret, pmul(pmul(pset1<Packet>(2.0f), negate), ret)); | |
// return fma(negate, pi, ret); | |
return pmadd(negate, pset1<Packet>(pi), ret); | |
} | |
}; | |
#endif | |
namespace Eigen { | |
namespace internal { | |
template<> | |
struct functor_traits<approx_acos_op> { | |
enum { | |
// More or less... | |
Cost = NumTraits<float>::MulCost * 5, | |
#ifdef TRIG_APPROX_VECTORIZED | |
// Vectorize please! Note that we instruct to vectorize regardless of what the packet | |
// type might be on the platform we compile this, because we want to make sure to get | |
// a compile error later on if we do not have code for that platform. | |
PacketAccess = 1 | |
#else | |
PacketAccess = 0 | |
#endif | |
}; | |
}; | |
} | |
} | |
template<typename Derived> | |
CwiseUnaryOp<approx_acos_op, const Derived> approxAcos(const ArrayBase<Derived>& xs) { | |
return xs.template unaryExpr<approx_acos_op>(); | |
} | |
struct approx_atan2_op { | |
inline float operator()(float y, float x) const { | |
using namespace trig_approx_utils; | |
// The original code had an unused `float t2`, no idea why. | |
float t0, t1, t3, t4; | |
t3 = abs(x); | |
t1 = abs(y); | |
t0 = fmax(t3, t1); | |
t1 = fmin(t3, t1); | |
t3 = 1.0f / t0; | |
t3 = t1 * t3; | |
t4 = t3 * t3; | |
t0 = -0.013480470f; | |
t0 = fma(t0, t4, 0.057477314f); | |
t0 = fma(t0, t4, -0.121239071f); | |
t0 = fma(t0, t4, 0.195635925f); | |
t0 = fma(t0, t4, -0.332994597f); | |
t0 = fma(t0, t4, 0.999995630f); | |
t3 = t0 * t3; | |
// slightly rearrange from the original code to better match the vectorized code | |
t3 = (abs(y) > abs(x)) ? -t3 + pi_halved : t3; | |
t3 = (x < 0) ? -t3 + pi : t3; | |
t3 = (y < 0) ? -t3 : t3; | |
return t3; | |
} | |
#ifdef TRIG_APPROX_VECTORIZED | |
template<typename Packet> | |
inline const Packet packetOp(const Packet& y, const Packet& x) const { | |
using namespace internal; | |
using namespace trig_approx_utils; | |
Packet t0, t1, t3, t4; | |
t3 = pabs(x); // t3 = abs(x); | |
t1 = pabs(y); // t1 = abs(y); | |
t0 = pmax(t3, t1); // t0 = fmax(t3, t1); | |
t1 = pmin(t3, t1); // t1 = fmin(t3, t1) | |
t3 = pdiv(pset1<Packet>(1.0f), t0); // t3 = 1.0f / t0; | |
t3 = pmul(t1, t3); // t3 = t1 * t3; | |
t4 = pmul(t3, t3); // t4 = t3 * t3 | |
t0 = pset1<Packet>(-0.013480470f); // t0 = -0.013480470f; | |
t0 = pmadd(t0, t4, pset1<Packet>( 0.057477314f)); // t0 = fma(t0, t4, 0.057477314f); | |
t0 = pmadd(t0, t4, pset1<Packet>(-0.121239071f)); // t0 = fma(t0, t4, -0.121239071f); | |
t0 = pmadd(t0, t4, pset1<Packet>( 0.195635925f)); // t0 = fma(t0, t4, 0.195635925f); | |
t0 = pmadd(t0, t4, pset1<Packet>(-0.332994597f)); // t0 = fma(t0, t4, -0.332994597f); | |
t0 = pmadd(t0, t4, pset1<Packet>( 0.999995630f)); // t0 = fma(t0, t4, 0.999995630f); | |
t3 = pmul(t0, t3); // t3 = t0 * t3 | |
// XOR'ing with 0x80000000 negates floats. XOR'ing with 0x0 is identity. | |
Packet negate_mask = trig_approx_utils::negate_mask<Packet>(); | |
Packet mask; | |
// t3 = (abs(y) > abs(x)) ? -t3 + pi_halved : t3; | |
mask = trig_approx_utils::cmp_gt(pabs(y), pabs(x)); | |
t3 = padd( | |
pxor(t3, pand(negate_mask, mask)), | |
pand(pset1<Packet>(pi_halved), mask) | |
); | |
// t3 = (x < 0) ? -t3 + pi : t3; | |
mask = cmp_lt(x, pset1<Packet>(0.0f)); | |
t3 = padd( | |
pxor(t3, pand(negate_mask, mask)), | |
pand(pset1<Packet>(pi), mask) | |
); | |
// t3 = (y < 0) ? -t3 : t3; | |
mask = cmp_lt(y, pset1<Packet>(0.0f)); | |
t3 = pxor(t3, pand(negate_mask, mask)); | |
return t3; | |
} | |
#endif | |
}; | |
namespace Eigen { | |
namespace internal { | |
template<> | |
struct functor_traits<approx_atan2_op> { | |
enum { | |
// More or less... | |
Cost = NumTraits<float>::MulCost * 15, | |
#ifdef TRIG_APPROX_VECTORIZED | |
// Vectorize please! Note that we instruct to vectorize regardless of what the packet | |
// type might be on the platform we compile this, because we want to make sure to get | |
// a compile error later on if we do not have code for that platform. | |
PacketAccess = 1 | |
#else | |
PacketAccess = 0 | |
#endif | |
}; | |
}; | |
} | |
} | |
template<typename Derived1, typename Derived2> | |
CwiseBinaryOp<approx_atan2_op, const Derived1, const Derived2> approxAtan2( | |
const ArrayBase<Derived1>& xs, const ArrayBase<Derived2>& ys | |
) { | |
return xs.template binaryExpr<approx_atan2_op, Derived2>(ys); | |
} | |
#endif | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment