Sam-Belliveau/fast_trig.h

## fast_trig.h
// Copyright (c) 2024 Sam Belliveau. All rights reserved.
//
// This work is licensed under the terms of the MIT license.
// For a copy, see <https://opensource.org/licenses/MIT>.

/**
 * Below is a benchmark run on my M2 Max MacBook Pro. The results are not
 * guaranteed to be the same on other systems, but should be similar.
 *
 * This code is intended to be run on embedded devices where the
 * performance gains are likely to be larger due to the lack of
 * branches, divisions, or memory accesses.
 *
 * $ g++ -O2 test.cpp -o ./test
 * $ ./test
 * std::sin<double>(): ~3.44ns
 *    __builtin_sin(): ~3.41ns
 *         fast_sin(): ~1.79ns
 *       faster_sin(): ~1.64ns
 *
 *  std::sin<float>(): ~3.41ns
 *   __builtin_sinf(): ~3.42ns
 *        fast_sinf(): ~1.62ns
 *      faster_sinf(): ~1.34ns
 *
 * std::cos<double>(): ~3.45ns
 *    __builtin_cos(): ~3.41ns
 *         fast_cos(): ~1.93ns
 *       faster_cos(): ~1.72ns
 *
 *  std::cos<float>(): ~3.48ns
 *   __builtin_cosf(): ~3.44ns
 *        fast_cosf(): ~1.81ns
 *      faster_cosf(): ~1.45ns
 */

/***** Function Declarations *****/

// Fast, simple approximation of sin(x); double and float variants.
// ∀ (x ∈ (-∞, ∞)) → (|fast_sin(x) - sin(x)| < 0.000051)
// 8 Multiplies, 5 Additions, 3 Absolute Values, 1 Floor.
// (Source-level count, subtractions counted as additions; the range
// reduction uses floor(x + 0.5) rather than a round call.)
double fast_sin(double x);
float fast_sinf(float x);

// Faster, coarser approximation of sin(x); double and float variants.
// ∀ (x ∈ (-∞, ∞)) → (|faster_sin(x) - sin(x)| < 0.0025)
// 6 Multiplies, 4 Additions, 2 Absolute Values, 1 Floor.
// (Source-level count, subtractions counted as additions; the range
// reduction uses floor(x + 0.5) rather than a round call.)
double faster_sin(double x);
float faster_sinf(float x);

// Fast, simple approximation of cos(x); double and float variants.
// ∀ (x ∈ (-∞, ∞)) → (|fast_cos(x) - cos(x)| < 0.000051)
// 8 Multiplies, 6 Additions, 3 Absolute Values, 1 Floor.
// (Source-level count, subtractions counted as additions.)
double fast_cos(double x);
float fast_cosf(float x);

// Faster, coarser approximation of cos(x); double and float variants.
// ∀ (x ∈ (-∞, ∞)) → (|faster_cos(x) - cos(x)| < 0.0025)
// 6 Multiplies, 5 Additions, 2 Absolute Values, 1 Floor.
// (Source-level count, subtractions counted as additions.)
double faster_cos(double x);
float faster_cosf(float x);

/***** Helper Functions *****/

// Internal building blocks. These forward declarations carry the `static`,
// `inline` and always_inline specifiers; the definitions further down in the
// file repeat only the plain signatures.
// NOTE(review): file-scope identifiers that begin with an underscore are
// reserved in C; consider renaming if this header is reworked.

// Range reduction used by the sine entry points: scale x by ~1/(2*pi),
// subtract floor(x + 0.5), then scale back by ~2*pi.
static inline __attribute__((always_inline)) double _normalize_sin(double x);
static inline __attribute__((always_inline)) float _normalize_sinf(float x);

// Range reduction used by the cosine entry points: same scaling, but the
// subtracted term is floor(x + 0.75) - 0.25, i.e. the angle is wrapped after
// a quarter-turn shift.
static inline __attribute__((always_inline)) double _normalize_cos(double x);
static inline __attribute__((always_inline)) float _normalize_cosf(float x);

// Three chained `x += k * x * |x|` steps; used by fast_sin / fast_cos.
static inline __attribute__((always_inline)) double _fast_unnormed(double x);
static inline __attribute__((always_inline)) float _fast_unnormedf(float x);

// Two chained `x += k * x * |x|` steps; used by faster_sin / faster_cos.
static inline __attribute__((always_inline)) double _faster_unnormed(double x);
static inline __attribute__((always_inline)) float _faster_unnormedf(float x);

/***** Trig Function Implementations *****/

// sin(x) approximation (double): wrap the angle with _normalize_sin, then
// apply the three-step correction of _fast_unnormed to the wrapped value.
double fast_sin(double x)
{
    const double wrapped = _normalize_sin(x);
    return _fast_unnormed(wrapped);
}
// sin(x) approximation (float): wrap the angle with _normalize_sinf, then
// apply the three-step correction of _fast_unnormedf to the wrapped value.
float fast_sinf(float x)
{
    const float wrapped = _normalize_sinf(x);
    return _fast_unnormedf(wrapped);
}

// Coarser sin(x) approximation (double): wrap the angle with _normalize_sin,
// then apply the two-step correction of _faster_unnormed.
double faster_sin(double x)
{
    const double wrapped = _normalize_sin(x);
    return _faster_unnormed(wrapped);
}
// Coarser sin(x) approximation (float): wrap the angle with _normalize_sinf,
// then apply the two-step correction of _faster_unnormedf.
float faster_sinf(float x)
{
    const float wrapped = _normalize_sinf(x);
    return _faster_unnormedf(wrapped);
}

// cos(x) approximation (double): _normalize_cos wraps the angle with a
// quarter-turn shift, so the same three-step correction used for sine
// (_fast_unnormed) can be applied to the result.
double fast_cos(double x)
{
    const double shifted = _normalize_cos(x);
    return _fast_unnormed(shifted);
}
// cos(x) approximation (float): _normalize_cosf wraps the angle with a
// quarter-turn shift, so the same three-step correction used for sine
// (_fast_unnormedf) can be applied to the result.
float fast_cosf(float x)
{
    const float shifted = _normalize_cosf(x);
    return _fast_unnormedf(shifted);
}

// Coarser cos(x) approximation (double): quarter-turn-shifted wrap via
// _normalize_cos followed by the two-step correction of _faster_unnormed.
double faster_cos(double x)
{
    const double shifted = _normalize_cos(x);
    return _faster_unnormed(shifted);
}
// Coarser cos(x) approximation (float): quarter-turn-shifted wrap via
// _normalize_cosf followed by the two-step correction of _faster_unnormedf.
float faster_cosf(float x)
{
    const float shifted = _normalize_cosf(x);
    return _faster_unnormedf(shifted);
}

/***** Helper Function Implementations *****/

// Wraps an angle in radians onto the period centred on zero (about [-pi, pi)).
// The angle is expressed in turns, floor(turns + 0.5) whole turns are
// dropped, and the remainder is converted back to radians.
double _normalize_sin(double x)
{
    const double turns = x * 0.1591549430919;                    // ~ x / (2*pi)
    const double centred = turns - __builtin_floor(turns + 0.5); // about [-0.5, 0.5)
    return centred * 6.2831853071796;                            // ~ centred * 2*pi
}

// Single-precision wrap of an angle in radians onto the period centred on
// zero (about [-pi, pi)): convert to turns, drop floor(turns + 0.5) whole
// turns, convert back to radians.
float _normalize_sinf(float x)
{
    const float turns = x * 0.1591549430919f;                      // ~ x / (2*pi)
    const float centred = turns - __builtin_floorf(turns + 0.5f);  // about [-0.5, 0.5)
    return centred * 6.2831853071796f;                             // ~ centred * 2*pi
}

// Range reduction for the cosine entry points. Working in turns, it removes
// (floor(turns + 0.75) - 0.25), which equals wrapping (turns + 0.25) onto
// about [-0.5, 0.5): the angle is advanced by a quarter turn and then wrapped,
// so a sine approximation applied to the result evaluates the cosine.
double _normalize_cos(double x)
{
    const double turns = x * 0.1591549430919;                    // ~ x / (2*pi)
    const double whole = __builtin_floor(turns + 0.75) - 0.25;   // amount to drop
    const double centred = turns - whole;
    return centred * 6.2831853071796;                            // ~ centred * 2*pi
}

// Single-precision range reduction for the cosine entry points. Working in
// turns, it removes (floor(turns + 0.75) - 0.25), i.e. the angle is advanced
// by a quarter turn and wrapped onto about [-0.5, 0.5) turns before being
// converted back to radians.
float _normalize_cosf(float x)
{
    const float turns = x * 0.1591549430919f;                      // ~ x / (2*pi)
    const float whole = __builtin_floorf(turns + 0.75f) - 0.25f;   // amount to drop
    const float centred = turns - whole;
    return centred * 6.2831853071796f;                             // ~ centred * 2*pi
}

// Three-step sine approximation for an already-wrapped angle.
// Every step has the form y + (k * y) * |y|, which keeps the function odd.
// Each multiply-add stays inside one expression, exactly as in the compound
// assignments it replaces, so floating-point contraction is unaffected.
double _fast_unnormed(double x)
{
    // First step, k ~ -1/pi: x * (1 - |x| / pi), zero at 0 and at +/-pi.
    double y = x + (-0.3183098861838 * x) * __builtin_fabs(x);
    // Second and third steps refine that curve.
    y = y + (0.2500000000000 * y) * __builtin_fabs(y);
    y = y + (0.0684571845286 * y) * __builtin_fabs(y);
    return y;
}

// Single-precision three-step sine approximation for an already-wrapped
// angle. Every step has the form y + (k * y) * |y|, which keeps the function
// odd; each multiply-add stays inside one expression.
float _fast_unnormedf(float x)
{
    // First step, k ~ -1/pi: x * (1 - |x| / pi), zero at 0 and at +/-pi.
    float y = x + (-0.3183098861838f * x) * __builtin_fabsf(x);
    // Second and third steps refine that curve.
    y = y + (0.2500000000000f * y) * __builtin_fabsf(y);
    y = y + (0.0684571845286f * y) * __builtin_fabsf(y);
    return y;
}

// Two-step sine approximation for an already-wrapped angle; one step fewer
// than _fast_unnormed. Both steps have the form y + (k * y) * |y|, and each
// multiply-add stays inside one expression.
double _faster_unnormed(double x)
{
    // First step, k ~ -1/pi: x * (1 - |x| / pi), zero at 0 and at +/-pi.
    double y = x + (-0.3183098861838 * x) * __builtin_fabs(x);
    // Single refinement step.
    y = y + (0.3451140202480 * y) * __builtin_fabs(y);
    return y;
}

// Single-precision two-step sine approximation for an already-wrapped angle.
// Both steps have the form y + (k * y) * |y|, and each multiply-add stays
// inside one expression.
float _faster_unnormedf(float x)
{
    // First step, k ~ -1/pi: x * (1 - |x| / pi), zero at 0 and at +/-pi.
    float y = x + (-0.3183098861838f * x) * __builtin_fabsf(x);
    // Single refinement step.
    y = y + (0.3451140202480f * y) * __builtin_fabsf(y);
    return y;
}
	// Copyright (c) 2024 Sam Belliveau. All rights reserved.
	//
	// This work is licensed under the terms of the MIT license.
	// For a copy, see <https://opensource.org/licenses/MIT>.

	/**
	* Below is a benchmark run on my M2 Max MacBook Pro. The results are not
	* guaranteed to be the same on other systems, but should be similar.
	*
	* This code is intended to be run on embedded devices where the
	* performance gains are likely to be larger due to the lack of
	* branches, divisions, or memory accesses.
	*
	* $ g++ -O2 test.cpp -o ./test
	* $ ./test
	* std::sin<double>(): ~3.44ns
	* __builtin_sin(): ~3.41ns
	* fast_sin(): ~1.79ns
	* faster_sin(): ~1.64ns
	*
	* std::sin<float>(): ~3.41ns
	* __builtin_sinf(): ~3.42ns
	* fast_sinf(): ~1.62ns
	* faster_sinf(): ~1.34ns
	*
	* std::cos<double>(): ~3.45ns
	* __builtin_cos(): ~3.41ns
	* fast_cos(): ~1.93ns
	* faster_cos(): ~1.72ns
	*
	* std::cos<float>(): ~3.48ns
	* __builtin_cosf(): ~3.44ns
	* fast_cosf(): ~1.81ns
	* faster_cosf(): ~1.45ns
	*/

	/*** Function Declarations ***/

	// Fast, simple approximation of sin(x); double and float variants.
	// ∀ (x ∈ (-∞, ∞)) → (|fast_sin(x) - sin(x)| < 0.000051)
	// 8 Multiplies, 5 Additions, 3 Absolute Values, 1 Floor.
	// (Source-level count, subtractions counted as additions; the range
	// reduction uses floor(x + 0.5) rather than a round call.)
	double fast_sin(double x);
	float fast_sinf(float x);

	// Faster, coarser approximation of sin(x); double and float variants.
	// ∀ (x ∈ (-∞, ∞)) → (|faster_sin(x) - sin(x)| < 0.0025)
	// 6 Multiplies, 4 Additions, 2 Absolute Values, 1 Floor.
	// (Source-level count, subtractions counted as additions; the range
	// reduction uses floor(x + 0.5) rather than a round call.)
	double faster_sin(double x);
	float faster_sinf(float x);

	// Fast, simple approximation of cos(x); double and float variants.
	// ∀ (x ∈ (-∞, ∞)) → (|fast_cos(x) - cos(x)| < 0.000051)
	// 8 Multiplies, 6 Additions, 3 Absolute Values, 1 Floor.
	// (Source-level count, subtractions counted as additions.)
	double fast_cos(double x);
	float fast_cosf(float x);

	// Faster, coarser approximation of cos(x); double and float variants.
	// ∀ (x ∈ (-∞, ∞)) → (|faster_cos(x) - cos(x)| < 0.0025)
	// 6 Multiplies, 5 Additions, 2 Absolute Values, 1 Floor.
	// (Source-level count, subtractions counted as additions.)
	double faster_cos(double x);
	float faster_cosf(float x);

	/*** Helper Functions ***/

	// Internal building blocks. These forward declarations carry the `static`,
	// `inline` and always_inline specifiers; the definitions further down in
	// the file repeat only the plain signatures.
	// NOTE(review): file-scope identifiers that begin with an underscore are
	// reserved in C; consider renaming if this header is reworked.

	// Range reduction used by the sine entry points: scale x by ~1/(2*pi),
	// subtract floor(x + 0.5), then scale back by ~2*pi.
	static inline __attribute__((always_inline)) double _normalize_sin(double x);
	static inline __attribute__((always_inline)) float _normalize_sinf(float x);

	// Range reduction used by the cosine entry points: same scaling, but the
	// subtracted term is floor(x + 0.75) - 0.25, i.e. the angle is wrapped
	// after a quarter-turn shift.
	static inline __attribute__((always_inline)) double _normalize_cos(double x);
	static inline __attribute__((always_inline)) float _normalize_cosf(float x);

	// Three chained `x += k * x * |x|` steps; used by fast_sin / fast_cos.
	static inline __attribute__((always_inline)) double _fast_unnormed(double x);
	static inline __attribute__((always_inline)) float _fast_unnormedf(float x);

	// Two chained `x += k * x * |x|` steps; used by faster_sin / faster_cos.
	static inline __attribute__((always_inline)) double _faster_unnormed(double x);
	static inline __attribute__((always_inline)) float _faster_unnormedf(float x);

	/*** Trig Function Implementations ***/

	// sin(x) approximation (double): wrap the angle with _normalize_sin, then
	// apply the three-step correction of _fast_unnormed to the wrapped value.
	double fast_sin(double x)
	{
		const double wrapped = _normalize_sin(x);
		return _fast_unnormed(wrapped);
	}
	// sin(x) approximation (float): wrap the angle with _normalize_sinf, then
	// apply the three-step correction of _fast_unnormedf to the wrapped value.
	float fast_sinf(float x)
	{
		const float wrapped = _normalize_sinf(x);
		return _fast_unnormedf(wrapped);
	}

	// Coarser sin(x) approximation (double): wrap the angle with
	// _normalize_sin, then apply the two-step correction of _faster_unnormed.
	double faster_sin(double x)
	{
		const double wrapped = _normalize_sin(x);
		return _faster_unnormed(wrapped);
	}
	// Coarser sin(x) approximation (float): wrap the angle with
	// _normalize_sinf, then apply the two-step correction of _faster_unnormedf.
	float faster_sinf(float x)
	{
		const float wrapped = _normalize_sinf(x);
		return _faster_unnormedf(wrapped);
	}

	// cos(x) approximation (double): _normalize_cos wraps the angle with a
	// quarter-turn shift, so the same three-step correction used for sine
	// (_fast_unnormed) can be applied to the result.
	double fast_cos(double x)
	{
		const double shifted = _normalize_cos(x);
		return _fast_unnormed(shifted);
	}
	// cos(x) approximation (float): _normalize_cosf wraps the angle with a
	// quarter-turn shift, so the same three-step correction used for sine
	// (_fast_unnormedf) can be applied to the result.
	float fast_cosf(float x)
	{
		const float shifted = _normalize_cosf(x);
		return _fast_unnormedf(shifted);
	}

	// Coarser cos(x) approximation (double): quarter-turn-shifted wrap via
	// _normalize_cos followed by the two-step correction of _faster_unnormed.
	double faster_cos(double x)
	{
		const double shifted = _normalize_cos(x);
		return _faster_unnormed(shifted);
	}
	// Coarser cos(x) approximation (float): quarter-turn-shifted wrap via
	// _normalize_cosf followed by the two-step correction of _faster_unnormedf.
	float faster_cosf(float x)
	{
		const float shifted = _normalize_cosf(x);
		return _faster_unnormedf(shifted);
	}

	/*** Helper Function Implementations ***/

	// Wraps an angle in radians onto the period centred on zero (about
	// [-pi, pi)). The angle is expressed in turns, floor(turns + 0.5) whole
	// turns are dropped, and the remainder is converted back to radians.
	double _normalize_sin(double x)
	{
		const double turns = x * 0.1591549430919;                    // ~ x / (2*pi)
		const double centred = turns - __builtin_floor(turns + 0.5); // about [-0.5, 0.5)
		return centred * 6.2831853071796;                            // ~ centred * 2*pi
	}

	// Single-precision wrap of an angle in radians onto the period centred on
	// zero (about [-pi, pi)): convert to turns, drop floor(turns + 0.5) whole
	// turns, convert back to radians.
	float _normalize_sinf(float x)
	{
		const float turns = x * 0.1591549430919f;                      // ~ x / (2*pi)
		const float centred = turns - __builtin_floorf(turns + 0.5f);  // about [-0.5, 0.5)
		return centred * 6.2831853071796f;                             // ~ centred * 2*pi
	}

	// Range reduction for the cosine entry points. Working in turns, it
	// removes (floor(turns + 0.75) - 0.25), which equals wrapping
	// (turns + 0.25) onto about [-0.5, 0.5): the angle is advanced by a
	// quarter turn and then wrapped, so a sine approximation applied to the
	// result evaluates the cosine.
	double _normalize_cos(double x)
	{
		const double turns = x * 0.1591549430919;                    // ~ x / (2*pi)
		const double whole = __builtin_floor(turns + 0.75) - 0.25;   // amount to drop
		const double centred = turns - whole;
		return centred * 6.2831853071796;                            // ~ centred * 2*pi
	}

	// Single-precision range reduction for the cosine entry points. Working
	// in turns, it removes (floor(turns + 0.75) - 0.25), i.e. the angle is
	// advanced by a quarter turn and wrapped onto about [-0.5, 0.5) turns
	// before being converted back to radians.
	float _normalize_cosf(float x)
	{
		const float turns = x * 0.1591549430919f;                      // ~ x / (2*pi)
		const float whole = __builtin_floorf(turns + 0.75f) - 0.25f;   // amount to drop
		const float centred = turns - whole;
		return centred * 6.2831853071796f;                             // ~ centred * 2*pi
	}

	// Three-step sine approximation for an already-wrapped angle.
	// Every step has the form y + (k * y) * |y|, which keeps the function odd.
	// Each multiply-add stays inside one expression, exactly as in the
	// compound assignments it replaces, so floating-point contraction is
	// unaffected.
	double _fast_unnormed(double x)
	{
		// First step, k ~ -1/pi: x * (1 - |x| / pi), zero at 0 and at +/-pi.
		double y = x + (-0.3183098861838 * x) * __builtin_fabs(x);
		// Second and third steps refine that curve.
		y = y + (0.2500000000000 * y) * __builtin_fabs(y);
		y = y + (0.0684571845286 * y) * __builtin_fabs(y);
		return y;
	}

	// Single-precision three-step sine approximation for an already-wrapped
	// angle. Every step has the form y + (k * y) * |y|, which keeps the
	// function odd; each multiply-add stays inside one expression.
	float _fast_unnormedf(float x)
	{
		// First step, k ~ -1/pi: x * (1 - |x| / pi), zero at 0 and at +/-pi.
		float y = x + (-0.3183098861838f * x) * __builtin_fabsf(x);
		// Second and third steps refine that curve.
		y = y + (0.2500000000000f * y) * __builtin_fabsf(y);
		y = y + (0.0684571845286f * y) * __builtin_fabsf(y);
		return y;
	}

	// Two-step sine approximation for an already-wrapped angle; one step
	// fewer than _fast_unnormed. Both steps have the form y + (k * y) * |y|,
	// and each multiply-add stays inside one expression.
	double _faster_unnormed(double x)
	{
		// First step, k ~ -1/pi: x * (1 - |x| / pi), zero at 0 and at +/-pi.
		double y = x + (-0.3183098861838 * x) * __builtin_fabs(x);
		// Single refinement step.
		y = y + (0.3451140202480 * y) * __builtin_fabs(y);
		return y;
	}

	// Single-precision two-step sine approximation for an already-wrapped
	// angle. Both steps have the form y + (k * y) * |y|, and each
	// multiply-add stays inside one expression.
	float _faster_unnormedf(float x)
	{
		// First step, k ~ -1/pi: x * (1 - |x| / pi), zero at 0 and at +/-pi.
		float y = x + (-0.3183098861838f * x) * __builtin_fabsf(x);
		// Single refinement step.
		y = y + (0.3451140202480f * y) * __builtin_fabsf(y);
		return y;
	}