Skip to content

Instantly share code, notes, and snippets.

@bitonic
Last active November 25, 2021 11:04
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bitonic/1ea0142a485ccdf5b3cc50b5b9e31ce1 to your computer and use it in GitHub Desktop.
Save bitonic/1ea0142a485ccdf5b3cc50b5b9e31ce1 to your computer and use it in GitHub Desktop.
Vectorized acos and atan2
// Vectorized (SSE and AVX, through Eigen) atan2 and acos.
//
// Care has been taken to ensure that the vectorized results are identical to the non-vectorized
// ones.
//
// Original functions from <https://developer.download.nvidia.com/cg/index_stdlib.html>, adapted
// to work in bulk. Both seem to be very accurate (only tested that error < 0.01
// degrees, which is what I need).
#include <Eigen/Core>
using namespace Eigen;
using namespace std;
#if defined(EIGEN_VECTORIZE_SSE) || defined(EIGEN_VECTORIZE_AVX)
#define TRIG_APPROX_VECTORIZED
#endif
namespace trig_approx_utils {
// XOR'ing with 0x80000000 negates floats. XOR'ing with 0x0 is identity.
template<typename Packet>
inline Packet negate_mask();
#ifdef EIGEN_VECTORIZE_SSE
template<>
inline __m128 negate_mask() {
return _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x80000000,0x80000000,0x80000000));
}
#endif
#ifdef EIGEN_VECTORIZE_AVX
template<>
inline __m256 negate_mask() {
return _mm256_castsi256_ps(
_mm256_setr_epi32(0x80000000,0x80000000,0x80000000,0x80000000,0x80000000,0x80000000,0x80000000,0x80000000)
);
}
#endif
template<typename Packet>
inline Packet cmp_gt(const Packet& a, const Packet& b);
#ifdef EIGEN_VECTORIZE_SSE
template<>
inline __m128 cmp_gt(const __m128& a, const __m128& b) {
return _mm_cmpgt_ps(a, b);
}
#endif
#ifdef EIGEN_VECTORIZE_AVX
template<>
inline __m256 cmp_gt(const __m256& a, const __m256& b) {
return _mm256_cmp_ps(a, b, _CMP_GT_OQ);
}
#endif
template<typename Packet>
inline Packet cmp_lt(const Packet& a, const Packet& b);
#ifdef EIGEN_VECTORIZE_SSE
template<>
inline __m128 cmp_lt(const __m128& a, const __m128& b) {
return _mm_cmplt_ps(a, b);
}
#endif
#ifdef EIGEN_VECTORIZE_AVX
template<>
inline __m256 cmp_lt(const __m256& a, const __m256& b) {
return _mm256_cmp_ps(a, b, _CMP_LT_OQ);
}
#endif
// We need floats, M_PI will not be float since it's lacking the trailing f
constexpr float pi = 3.14159265359f;
constexpr float pi_halved = 1.57079632679f;
}
// Handbook of Mathematical Functions
// M. Abramowitz and I.A. Stegun, Ed.
// Absolute error <= 6.7e-5
struct approx_acos_op {
inline float operator()(float x) const {
using namespace trig_approx_utils;
float negate = static_cast<float>(x < 0);
x = abs(x);
float ret;
ret = -0.0187293f;
ret = fma(ret, x, 0.0742610f);
ret = fma(ret, x, -0.2121144f);
ret = fma(ret, x, 1.5707288f);
ret = ret * sqrt(1.0f - x);
ret = ret - 2.0f * negate * ret;
return fma(negate, pi, ret);
}
#ifdef TRIG_APPROX_VECTORIZED
template<typename Packet>
inline const Packet packetOp(const Packet& x0) const {
using namespace internal;
using namespace trig_approx_utils;
// float negate = static_cast<float>(x < 0);
Packet negate = cmp_lt(x0, pset1<Packet>(0.0f));
negate = pand(negate, pset1<Packet>(1.0f));
Packet x = pabs(x0); // x = abs(x);
Packet ret = pset1<Packet>(-0.0187293f); // float ret = -0.0187293f;
// ret = fma(ret, x, 0.0742610f);
ret = pmadd(ret, x, pset1<Packet>(0.0742610f));
// ret = fma(ret, x, -0.2121144f);
ret = pmadd(ret, x, pset1<Packet>(-0.2121144f));
// ret = fma(ret, x, 1.5707288f);
ret = pmadd(ret, x, pset1<Packet>(1.5707288f));
// ret = ret * sqrt(1.0f-x);
ret = pmul(ret, psqrt(psub(pset1<Packet>(1.0f), x)));
// ret = ret - 2.0f * negate * ret;
ret = psub(ret, pmul(pmul(pset1<Packet>(2.0f), negate), ret));
// return fma(negate, pi, ret);
return pmadd(negate, pset1<Packet>(pi), ret);
}
};
#endif
namespace Eigen {
namespace internal {
template<>
struct functor_traits<approx_acos_op> {
enum {
// More or less...
Cost = NumTraits<float>::MulCost * 5,
#ifdef TRIG_APPROX_VECTORIZED
// Vectorize please! Note that we instruct to vectorize regardless of what the packet
// type might be on the platform we compile this, because we want to make sure to get
// a compile error later on if we do not have code for that platform.
PacketAccess = 1
#else
PacketAccess = 0
#endif
};
};
}
}
template<typename Derived>
CwiseUnaryOp<approx_acos_op, const Derived> approxAcos(const ArrayBase<Derived>& xs) {
return xs.template unaryExpr<approx_acos_op>();
}
struct approx_atan2_op {
inline float operator()(float y, float x) const {
using namespace trig_approx_utils;
// The original code had an unused `float t2`, no idea why.
float t0, t1, t3, t4;
t3 = abs(x);
t1 = abs(y);
t0 = fmax(t3, t1);
t1 = fmin(t3, t1);
t3 = 1.0f / t0;
t3 = t1 * t3;
t4 = t3 * t3;
t0 = -0.013480470f;
t0 = fma(t0, t4, 0.057477314f);
t0 = fma(t0, t4, -0.121239071f);
t0 = fma(t0, t4, 0.195635925f);
t0 = fma(t0, t4, -0.332994597f);
t0 = fma(t0, t4, 0.999995630f);
t3 = t0 * t3;
// slightly rearrange from the original code to better match the vectorized code
t3 = (abs(y) > abs(x)) ? -t3 + pi_halved : t3;
t3 = (x < 0) ? -t3 + pi : t3;
t3 = (y < 0) ? -t3 : t3;
return t3;
}
#ifdef TRIG_APPROX_VECTORIZED
template<typename Packet>
inline const Packet packetOp(const Packet& y, const Packet& x) const {
using namespace internal;
using namespace trig_approx_utils;
Packet t0, t1, t3, t4;
t3 = pabs(x); // t3 = abs(x);
t1 = pabs(y); // t1 = abs(y);
t0 = pmax(t3, t1); // t0 = fmax(t3, t1);
t1 = pmin(t3, t1); // t1 = fmin(t3, t1)
t3 = pdiv(pset1<Packet>(1.0f), t0); // t3 = 1.0f / t0;
t3 = pmul(t1, t3); // t3 = t1 * t3;
t4 = pmul(t3, t3); // t4 = t3 * t3
t0 = pset1<Packet>(-0.013480470f); // t0 = -0.013480470f;
t0 = pmadd(t0, t4, pset1<Packet>( 0.057477314f)); // t0 = fma(t0, t4, 0.057477314f);
t0 = pmadd(t0, t4, pset1<Packet>(-0.121239071f)); // t0 = fma(t0, t4, -0.121239071f);
t0 = pmadd(t0, t4, pset1<Packet>( 0.195635925f)); // t0 = fma(t0, t4, 0.195635925f);
t0 = pmadd(t0, t4, pset1<Packet>(-0.332994597f)); // t0 = fma(t0, t4, -0.332994597f);
t0 = pmadd(t0, t4, pset1<Packet>( 0.999995630f)); // t0 = fma(t0, t4, 0.999995630f);
t3 = pmul(t0, t3); // t3 = t0 * t3
// XOR'ing with 0x80000000 negates floats. XOR'ing with 0x0 is identity.
Packet negate_mask = trig_approx_utils::negate_mask<Packet>();
Packet mask;
// t3 = (abs(y) > abs(x)) ? -t3 + pi_halved : t3;
mask = trig_approx_utils::cmp_gt(pabs(y), pabs(x));
t3 = padd(
pxor(t3, pand(negate_mask, mask)),
pand(pset1<Packet>(pi_halved), mask)
);
// t3 = (x < 0) ? -t3 + pi : t3;
mask = cmp_lt(x, pset1<Packet>(0.0f));
t3 = padd(
pxor(t3, pand(negate_mask, mask)),
pand(pset1<Packet>(pi), mask)
);
// t3 = (y < 0) ? -t3 : t3;
mask = cmp_lt(y, pset1<Packet>(0.0f));
t3 = pxor(t3, pand(negate_mask, mask));
return t3;
}
#endif
};
namespace Eigen {
namespace internal {
template<>
struct functor_traits<approx_atan2_op> {
enum {
// More or less...
Cost = NumTraits<float>::MulCost * 15,
#ifdef TRIG_APPROX_VECTORIZED
// Vectorize please! Note that we instruct to vectorize regardless of what the packet
// type might be on the platform we compile this, because we want to make sure to get
// a compile error later on if we do not have code for that platform.
PacketAccess = 1
#else
PacketAccess = 0
#endif
};
};
}
}
template<typename Derived1, typename Derived2>
CwiseBinaryOp<approx_atan2_op, const Derived1, const Derived2> approxAtan2(
const ArrayBase<Derived1>& xs, const ArrayBase<Derived2>& ys
) {
return xs.template binaryExpr<approx_atan2_op, Derived2>(ys);
}
#endif
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment