bitonic/TrigApprox.h

## TrigApprox.h
// Vectorized (SSE and AVX, through Eigen) atan2 and acos.
//
// Care has been taken to ensure that the vectorized results are identical to the non-vectorized
// ones.
//
// Original functions from <https://developer.download.nvidia.com/cg/index_stdlib.html>, adapted
// to work in bulk. Both seem to be very accurate (only tested that error < 0.01
// degrees, which is what I need).

#include <Eigen/Core>

using namespace Eigen;
using namespace std;

#if defined(EIGEN_VECTORIZE_SSE) || defined(EIGEN_VECTORIZE_AVX)
#define TRIG_APPROX_VECTORIZED
#endif

namespace trig_approx_utils {

// XOR'ing with 0x80000000 negates floats. XOR'ing with 0x0 is identity.
template<typename Packet>
inline Packet negate_mask();

#ifdef EIGEN_VECTORIZE_SSE
template<>
inline __m128 negate_mask() {
  return _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x80000000,0x80000000,0x80000000));
}
#endif

#ifdef EIGEN_VECTORIZE_AVX
template<>
inline __m256 negate_mask() {
  return _mm256_castsi256_ps(
    _mm256_setr_epi32(0x80000000,0x80000000,0x80000000,0x80000000,0x80000000,0x80000000,0x80000000,0x80000000)
  );
}
#endif

template<typename Packet>
inline Packet cmp_gt(const Packet& a, const Packet& b);

#ifdef EIGEN_VECTORIZE_SSE
template<>
inline __m128 cmp_gt(const __m128& a, const __m128& b) {
  return _mm_cmpgt_ps(a, b);
}
#endif

#ifdef EIGEN_VECTORIZE_AVX
template<>
inline __m256 cmp_gt(const __m256& a, const __m256& b) {
  return _mm256_cmp_ps(a, b, _CMP_GT_OQ);
}
#endif

template<typename Packet>
inline Packet cmp_lt(const Packet& a, const Packet& b);

#ifdef EIGEN_VECTORIZE_SSE
template<>
inline __m128 cmp_lt(const __m128& a, const __m128& b) {
  return _mm_cmplt_ps(a, b);
}
#endif

#ifdef EIGEN_VECTORIZE_AVX
template<>
inline __m256 cmp_lt(const __m256& a, const __m256& b) {
  return _mm256_cmp_ps(a, b, _CMP_LT_OQ);
}
#endif

// We need floats, M_PI will not be float since it's lacking the trailing f
constexpr float pi = 3.14159265359f;
constexpr float pi_halved = 1.57079632679f;

}

// Handbook of Mathematical Functions
// M. Abramowitz and I.A. Stegun, Ed.
// Absolute error <= 6.7e-5
struct approx_acos_op {
  inline float operator()(float x) const {
    using namespace trig_approx_utils;

    float negate = static_cast<float>(x < 0);
    x = abs(x);
    float ret;
    ret =             -0.0187293f;
    ret = fma(ret, x,  0.0742610f);
    ret = fma(ret, x, -0.2121144f);
    ret = fma(ret, x,  1.5707288f);
    ret = ret * sqrt(1.0f - x);
    ret = ret - 2.0f * negate * ret;
    return fma(negate, pi, ret);
  }

#ifdef TRIG_APPROX_VECTORIZED
  template<typename Packet>
  inline const Packet packetOp(const Packet& x0) const {
    using namespace internal;
    using namespace trig_approx_utils;

    // float negate = static_cast<float>(x < 0);
    Packet negate = cmp_lt(x0, pset1<Packet>(0.0f));
    negate = pand(negate, pset1<Packet>(1.0f));

    Packet x = pabs(x0); // x = abs(x);
    Packet ret = pset1<Packet>(-0.0187293f); // float ret = -0.0187293f;
    // ret = fma(ret, x, 0.0742610f);
    ret = pmadd(ret, x, pset1<Packet>(0.0742610f));
    // ret = fma(ret, x, -0.2121144f);
    ret = pmadd(ret, x, pset1<Packet>(-0.2121144f));
    // ret = fma(ret, x, 1.5707288f);
    ret = pmadd(ret, x, pset1<Packet>(1.5707288f));
    // ret = ret * sqrt(1.0f-x);
    ret = pmul(ret, psqrt(psub(pset1<Packet>(1.0f), x)));
    // ret = ret - 2.0f * negate * ret;
    ret = psub(ret, pmul(pmul(pset1<Packet>(2.0f), negate), ret));
    // return fma(negate, pi, ret);
    return pmadd(negate, pset1<Packet>(pi), ret);
  }
};
#endif

namespace Eigen {
namespace internal {

template<>
struct functor_traits<approx_acos_op> {
  enum {
    // More or less...
    Cost = NumTraits<float>::MulCost * 5,
#ifdef TRIG_APPROX_VECTORIZED
    // Vectorize please! Note that we instruct to vectorize regardless of what the packet
    // type might be on the platform we compile this, because we want to make sure to get
    // a compile error later on if we do not have code for that platform.
    PacketAccess = 1
#else
    PacketAccess = 0
#endif
  };
};

}
}

template<typename Derived>
CwiseUnaryOp<approx_acos_op, const Derived> approxAcos(const ArrayBase<Derived>& xs) {
  return xs.template unaryExpr<approx_acos_op>();
}

struct approx_atan2_op {
  inline float operator()(float y, float x) const {
    using namespace trig_approx_utils;

    // The original code had an unused `float t2`, no idea why.
    float t0, t1, t3, t4;

    t3 = abs(x);
    t1 = abs(y);
    t0 = fmax(t3, t1);
    t1 = fmin(t3, t1);
    t3 = 1.0f / t0;
    t3 = t1 * t3;

    t4 = t3 * t3;
    t0 =             -0.013480470f;
    t0 = fma(t0, t4,  0.057477314f);
    t0 = fma(t0, t4, -0.121239071f);
    t0 = fma(t0, t4,  0.195635925f);
    t0 = fma(t0, t4, -0.332994597f);
    t0 = fma(t0, t4,  0.999995630f);
    t3 = t0 * t3;

    // slightly rearrange from the original code to better match the vectorized code
    t3 = (abs(y) > abs(x)) ? -t3 + pi_halved : t3;
    t3 = (x < 0) ?  -t3 + pi : t3;
    t3 = (y < 0) ? -t3 : t3;

    return t3;
  }

#ifdef TRIG_APPROX_VECTORIZED
  template<typename Packet>
  inline const Packet packetOp(const Packet& y, const Packet& x) const {
    using namespace internal;
    using namespace trig_approx_utils;

    Packet t0, t1, t3, t4;

    t3 = pabs(x); // t3 = abs(x);
    t1 = pabs(y); // t1 = abs(y);
    t0 = pmax(t3, t1); // t0 = fmax(t3, t1);
    t1 = pmin(t3, t1); // t1 = fmin(t3, t1)
    t3 = pdiv(pset1<Packet>(1.0f), t0); // t3 = 1.0f / t0;
    t3 = pmul(t1, t3); // t3 = t1 * t3;

    t4 = pmul(t3, t3); // t4 = t3 * t3
    t0 =               pset1<Packet>(-0.013480470f);  // t0 = -0.013480470f;
    t0 = pmadd(t0, t4, pset1<Packet>( 0.057477314f)); // t0 = fma(t0, t4,  0.057477314f);
    t0 = pmadd(t0, t4, pset1<Packet>(-0.121239071f)); // t0 = fma(t0, t4, -0.121239071f);
    t0 = pmadd(t0, t4, pset1<Packet>( 0.195635925f)); // t0 = fma(t0, t4,  0.195635925f);
    t0 = pmadd(t0, t4, pset1<Packet>(-0.332994597f)); // t0 = fma(t0, t4, -0.332994597f);
    t0 = pmadd(t0, t4, pset1<Packet>( 0.999995630f)); // t0 = fma(t0, t4,  0.999995630f);
    t3 = pmul(t0, t3); // t3 = t0 * t3

    // XOR'ing with 0x80000000 negates floats. XOR'ing with 0x0 is identity.
    Packet negate_mask = trig_approx_utils::negate_mask<Packet>();

    Packet mask;

    // t3 = (abs(y) > abs(x)) ? -t3 + pi_halved : t3;
    mask = trig_approx_utils::cmp_gt(pabs(y), pabs(x));
    t3 = padd(
      pxor(t3, pand(negate_mask, mask)),
      pand(pset1<Packet>(pi_halved), mask)
    );

    // t3 = (x < 0) ?  -t3 + pi : t3;
    mask = cmp_lt(x, pset1<Packet>(0.0f));
    t3 = padd(
      pxor(t3, pand(negate_mask, mask)),
      pand(pset1<Packet>(pi), mask)
    );

    // t3 = (y < 0) ? -t3 : t3;
    mask = cmp_lt(y, pset1<Packet>(0.0f));
    t3 = pxor(t3, pand(negate_mask, mask));

    return t3;
  }
#endif
};

namespace Eigen {
namespace internal {

template<>
struct functor_traits<approx_atan2_op> {
  enum {
    // More or less...
    Cost = NumTraits<float>::MulCost * 15,
#ifdef TRIG_APPROX_VECTORIZED
    // Vectorize please! Note that we instruct to vectorize regardless of what the packet
    // type might be on the platform we compile this, because we want to make sure to get
    // a compile error later on if we do not have code for that platform.
    PacketAccess = 1
#else
    PacketAccess = 0
#endif
  };
};

}
}

template<typename Derived1, typename Derived2>
CwiseBinaryOp<approx_atan2_op, const Derived1, const Derived2> approxAtan2(
  const ArrayBase<Derived1>& xs, const ArrayBase<Derived2>& ys
) {
  return xs.template binaryExpr<approx_atan2_op, Derived2>(ys);
}

#endif
	// Vectorized (SSE and AVX, through Eigen) atan2 and acos.
	//
	// Care has been taken to ensure that the vectorized results are identical to the non-vectorized
	// ones.
	//
	// Original functions from <https://developer.download.nvidia.com/cg/index_stdlib.html>, adapted
	// to work in bulk. Both seem to be very accurate (only tested that error < 0.01
	// degrees, which is what I need).

	#include <Eigen/Core>

	using namespace Eigen;
	using namespace std;

	#if defined(EIGEN_VECTORIZE_SSE) \|\| defined(EIGEN_VECTORIZE_AVX)
	#define TRIG_APPROX_VECTORIZED
	#endif

	namespace trig_approx_utils {

	// XOR'ing with 0x80000000 negates floats. XOR'ing with 0x0 is identity.
	template<typename Packet>
	inline Packet negate_mask();

	#ifdef EIGEN_VECTORIZE_SSE
	template<>
	inline __m128 negate_mask() {
	return _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x80000000,0x80000000,0x80000000));
	}
	#endif

	#ifdef EIGEN_VECTORIZE_AVX
	template<>
	inline __m256 negate_mask() {
	return _mm256_castsi256_ps(
	_mm256_setr_epi32(0x80000000,0x80000000,0x80000000,0x80000000,0x80000000,0x80000000,0x80000000,0x80000000)
	);
	}
	#endif

	template<typename Packet>
	inline Packet cmp_gt(const Packet& a, const Packet& b);

	#ifdef EIGEN_VECTORIZE_SSE
	template<>
	inline __m128 cmp_gt(const __m128& a, const __m128& b) {
	return _mm_cmpgt_ps(a, b);
	}
	#endif

	#ifdef EIGEN_VECTORIZE_AVX
	template<>
	inline __m256 cmp_gt(const __m256& a, const __m256& b) {
	return _mm256_cmp_ps(a, b, _CMP_GT_OQ);
	}
	#endif

	template<typename Packet>
	inline Packet cmp_lt(const Packet& a, const Packet& b);

	#ifdef EIGEN_VECTORIZE_SSE
	template<>
	inline __m128 cmp_lt(const __m128& a, const __m128& b) {
	return _mm_cmplt_ps(a, b);
	}
	#endif

	#ifdef EIGEN_VECTORIZE_AVX
	template<>
	inline __m256 cmp_lt(const __m256& a, const __m256& b) {
	return _mm256_cmp_ps(a, b, _CMP_LT_OQ);
	}
	#endif

	// We need floats, M_PI will not be float since it's lacking the trailing f
	constexpr float pi = 3.14159265359f;
	constexpr float pi_halved = 1.57079632679f;

	}

	// Handbook of Mathematical Functions
	// M. Abramowitz and I.A. Stegun, Ed.
	// Absolute error <= 6.7e-5
	struct approx_acos_op {
	inline float operator()(float x) const {
	using namespace trig_approx_utils;

	float negate = static_cast<float>(x < 0);
	x = abs(x);
	float ret;
	ret = -0.0187293f;
	ret = fma(ret, x, 0.0742610f);
	ret = fma(ret, x, -0.2121144f);
	ret = fma(ret, x, 1.5707288f);
	ret = ret * sqrt(1.0f - x);
	ret = ret - 2.0f * negate * ret;
	return fma(negate, pi, ret);
	}

	#ifdef TRIG_APPROX_VECTORIZED
	template<typename Packet>
	inline const Packet packetOp(const Packet& x0) const {
	using namespace internal;
	using namespace trig_approx_utils;

	// float negate = static_cast<float>(x < 0);
	Packet negate = cmp_lt(x0, pset1<Packet>(0.0f));
	negate = pand(negate, pset1<Packet>(1.0f));

	Packet x = pabs(x0); // x = abs(x);
	Packet ret = pset1<Packet>(-0.0187293f); // float ret = -0.0187293f;
	// ret = fma(ret, x, 0.0742610f);
	ret = pmadd(ret, x, pset1<Packet>(0.0742610f));
	// ret = fma(ret, x, -0.2121144f);
	ret = pmadd(ret, x, pset1<Packet>(-0.2121144f));
	// ret = fma(ret, x, 1.5707288f);
	ret = pmadd(ret, x, pset1<Packet>(1.5707288f));
	// ret = ret * sqrt(1.0f-x);
	ret = pmul(ret, psqrt(psub(pset1<Packet>(1.0f), x)));
	// ret = ret - 2.0f * negate * ret;
	ret = psub(ret, pmul(pmul(pset1<Packet>(2.0f), negate), ret));
	// return fma(negate, pi, ret);
	return pmadd(negate, pset1<Packet>(pi), ret);
	}
	};
	#endif

	namespace Eigen {
	namespace internal {

	template<>
	struct functor_traits<approx_acos_op> {
	enum {
	// More or less...
	Cost = NumTraits<float>::MulCost * 5,
	#ifdef TRIG_APPROX_VECTORIZED
	// Vectorize please! Note that we instruct to vectorize regardless of what the packet
	// type might be on the platform we compile this, because we want to make sure to get
	// a compile error later on if we do not have code for that platform.
	PacketAccess = 1
	#else
	PacketAccess = 0
	#endif
	};
	};

	}
	}

	template<typename Derived>
	CwiseUnaryOp<approx_acos_op, const Derived> approxAcos(const ArrayBase<Derived>& xs) {
	return xs.template unaryExpr<approx_acos_op>();
	}

	struct approx_atan2_op {
	inline float operator()(float y, float x) const {
	using namespace trig_approx_utils;

	// The original code had an unused `float t2`, no idea why.
	float t0, t1, t3, t4;

	t3 = abs(x);
	t1 = abs(y);
	t0 = fmax(t3, t1);
	t1 = fmin(t3, t1);
	t3 = 1.0f / t0;
	t3 = t1 * t3;

	t4 = t3 * t3;
	t0 = -0.013480470f;
	t0 = fma(t0, t4, 0.057477314f);
	t0 = fma(t0, t4, -0.121239071f);
	t0 = fma(t0, t4, 0.195635925f);
	t0 = fma(t0, t4, -0.332994597f);
	t0 = fma(t0, t4, 0.999995630f);
	t3 = t0 * t3;

	// slightly rearrange from the original code to better match the vectorized code
	t3 = (abs(y) > abs(x)) ? -t3 + pi_halved : t3;
	t3 = (x < 0) ? -t3 + pi : t3;
	t3 = (y < 0) ? -t3 : t3;

	return t3;
	}

	#ifdef TRIG_APPROX_VECTORIZED
	template<typename Packet>
	inline const Packet packetOp(const Packet& y, const Packet& x) const {
	using namespace internal;
	using namespace trig_approx_utils;

	Packet t0, t1, t3, t4;

	t3 = pabs(x); // t3 = abs(x);
	t1 = pabs(y); // t1 = abs(y);
	t0 = pmax(t3, t1); // t0 = fmax(t3, t1);
	t1 = pmin(t3, t1); // t1 = fmin(t3, t1)
	t3 = pdiv(pset1<Packet>(1.0f), t0); // t3 = 1.0f / t0;
	t3 = pmul(t1, t3); // t3 = t1 * t3;

	t4 = pmul(t3, t3); // t4 = t3 * t3
	t0 = pset1<Packet>(-0.013480470f); // t0 = -0.013480470f;
	t0 = pmadd(t0, t4, pset1<Packet>( 0.057477314f)); // t0 = fma(t0, t4, 0.057477314f);
	t0 = pmadd(t0, t4, pset1<Packet>(-0.121239071f)); // t0 = fma(t0, t4, -0.121239071f);
	t0 = pmadd(t0, t4, pset1<Packet>( 0.195635925f)); // t0 = fma(t0, t4, 0.195635925f);
	t0 = pmadd(t0, t4, pset1<Packet>(-0.332994597f)); // t0 = fma(t0, t4, -0.332994597f);
	t0 = pmadd(t0, t4, pset1<Packet>( 0.999995630f)); // t0 = fma(t0, t4, 0.999995630f);
	t3 = pmul(t0, t3); // t3 = t0 * t3

	// XOR'ing with 0x80000000 negates floats. XOR'ing with 0x0 is identity.
	Packet negate_mask = trig_approx_utils::negate_mask<Packet>();

	Packet mask;

	// t3 = (abs(y) > abs(x)) ? -t3 + pi_halved : t3;
	mask = trig_approx_utils::cmp_gt(pabs(y), pabs(x));
	t3 = padd(
	pxor(t3, pand(negate_mask, mask)),
	pand(pset1<Packet>(pi_halved), mask)
	);

	// t3 = (x < 0) ? -t3 + pi : t3;
	mask = cmp_lt(x, pset1<Packet>(0.0f));
	t3 = padd(
	pxor(t3, pand(negate_mask, mask)),
	pand(pset1<Packet>(pi), mask)
	);

	// t3 = (y < 0) ? -t3 : t3;
	mask = cmp_lt(y, pset1<Packet>(0.0f));
	t3 = pxor(t3, pand(negate_mask, mask));

	return t3;
	}
	#endif
	};

	namespace Eigen {
	namespace internal {

	template<>
	struct functor_traits<approx_atan2_op> {
	enum {
	// More or less...
	Cost = NumTraits<float>::MulCost * 15,
	#ifdef TRIG_APPROX_VECTORIZED
	// Vectorize please! Note that we instruct to vectorize regardless of what the packet
	// type might be on the platform we compile this, because we want to make sure to get
	// a compile error later on if we do not have code for that platform.
	PacketAccess = 1
	#else
	PacketAccess = 0
	#endif
	};
	};

	}
	}

	template<typename Derived1, typename Derived2>
	CwiseBinaryOp<approx_atan2_op, const Derived1, const Derived2> approxAtan2(
	const ArrayBase<Derived1>& xs, const ArrayBase<Derived2>& ys
	) {
	return xs.template binaryExpr<approx_atan2_op, Derived2>(ys);
	}

	#endif