Calculate a Chebyshev polynomial that will approximate f(x) = sin(kx), where k = π/root. Uses gradient descent.
// Compiling with clang or gcc
// clang main.cpp -o main -march=native -O3 -ffast-math -Wextra -Werror -Wunused-parameter -lm
// Use "-march=native -ffast-math -O3" for faster gradient descent
// Use "-lm" if linking to glibc math libraries on Linux
// Remove "-W*" warning flags if you don't need checks
#define _USE_MATH_DEFINES
#include <stdio.h>
#include <cmath>
#include <vector>
#define USE_SIGNAL_HANDLER 1
#if USE_SIGNAL_HANDLER
static bool is_running = true;
#if _WIN32
#define WIN32_LEAN_AND_MEAN
#define NOMINMAX
#include <windows.h>
BOOL WINAPI sighandler(DWORD signum) {
if (signum == CTRL_C_EVENT) {
fprintf(stderr, "Signal caught, exiting!\n");
is_running = false;
return TRUE;
}
return FALSE;
}
#else
#include <errno.h>
#include <signal.h>
static void sighandler(int signum) {
fprintf(stderr, "Signal caught, exiting! (%d)\n", signum);
is_running = false;
}
#endif
#endif
// Configuration
using grad_t = double;
using coefficient_t = double;
constexpr size_t PRINT_ITER = 10'000'000;
constexpr size_t TOTAL_ITER_ERROR_PLATEAU_THRESHOLD = 10'000;
// Learning rate update rule when minimum error reaches a plateau
// - If updating is enabled then rescale the learning rate
// - Otherwise we exit early
constexpr bool IS_UPDATE_LEARNING_RATE = true;
constexpr grad_t LEARNING_RATE_RESCALER = 1.1;
// We are solving for a[n] in g(x) = sum a[n]*x^(2n)
// Where f(x) = h(x)*g(x), where:
// - f(x) = target function
// - h(x) = modifier function to reduce complexity of function g(x) needs to solve
// - g(x) = polynomial that can be calculated using Horner's method
// https://en.wikipedia.org/wiki/Horner%27s_method
// Gradient descent hyperparameters
// - We are approximating f(x) in the range: x = [X_SAMPLE_START, X_SAMPLE_END] with TOTAL_SAMPLES
// - g(x) = sum a[n]*x^(2n) for n = [0, TOTAL_COEFFICIENTS-1]
// - learning rate: INITIAL_LEARNING_RATE
// - Decrease learning rate if unstable
// - rescaling gradients for de/da[n] by: x^(n*GRAD_RESCALE_MAGNITUDE + GRAD_RESCALE_BIAS)
// - NOTE: This drastically improves training speed, you should tweak it
// - Decrease GRAD_RESCALE_MAGNITUDE if training is unstable
// - NOTE: You should tweak this value before tweaking learning rate
// Sample functions to approximate
#if 1
// y = sin(2*pi*x)
template <typename T>
inline static T get_y_target(T x) {
return std::sin(T(M_PI)*2.0*x);
}
template <typename T>
inline static T get_h(T x) {
// constraint so that roots match sin(2*pi*x)
constexpr T B0 = T(0.5*0.5);
const T z = x*x;
return x*(z-B0);
}
constexpr size_t TOTAL_SAMPLES = 256;
constexpr grad_t X_SAMPLE_START = grad_t(0);
constexpr grad_t X_SAMPLE_END = grad_t(0.5);
constexpr size_t TOTAL_COEFFICIENTS = 6;
constexpr grad_t INITIAL_LEARNING_RATE = grad_t(1e0);
constexpr grad_t GRAD_RESCALE_MAGNITUDE = 4;
constexpr grad_t GRAD_RESCALE_BIAS = 1;
#elif 0
// y = cos(2*pi*x)
template <typename T>
inline static T get_y_target(T x) {
return std::cos(T(M_PI)*2.0*x);
}
template <typename T>
inline static T get_h(T x) {
// constraint so that roots match cos(2*pi*x)
constexpr T B0 = T(0.25*0.25);
constexpr T B1 = T(0.75*0.75);
const T z = x*x;
return (z-B0)*(z-B1);
}
constexpr size_t TOTAL_SAMPLES = 256;
constexpr grad_t X_SAMPLE_START = grad_t(0);
constexpr grad_t X_SAMPLE_END = grad_t(0.5 + 1e-2);
constexpr size_t TOTAL_COEFFICIENTS = 6;
constexpr grad_t INITIAL_LEARNING_RATE = grad_t(1e0);
constexpr grad_t GRAD_RESCALE_MAGNITUDE = 4;
constexpr grad_t GRAD_RESCALE_BIAS = 1;
#else
// y = x^2 * exp(pi*x^2) * sin(2pix)
template <typename T>
inline static T get_y_target(T x) {
const T z = x*x;
return z*std::exp(T(M_PI)*z)*std::sin(2*T(M_PI)*x);
}
template <typename T>
inline static T get_h(T x) {
constexpr T B0 = T(0.5*0.5);
const T z = x*x;
return z*x*(z-B0);
}
constexpr size_t TOTAL_SAMPLES = 256;
constexpr grad_t X_SAMPLE_START = grad_t(0);
constexpr grad_t X_SAMPLE_END = grad_t(0.5 + 1e-2);
constexpr size_t TOTAL_COEFFICIENTS = 8;
constexpr grad_t INITIAL_LEARNING_RATE = grad_t(1e0);
constexpr grad_t GRAD_RESCALE_MAGNITUDE = 3.6; // needed to be decreased for stable gradient descent
constexpr grad_t GRAD_RESCALE_BIAS = 1;
#endif
inline static coefficient_t get_horner_polynomial(coefficient_t x, coefficient_t* A) {
// g(x) = sum a[n]*x^(2n)
// Use Horner's method to calculate this efficiently
// https://en.wikipedia.org/wiki/Horner%27s_method
const coefficient_t z = x*x;
coefficient_t g = A[TOTAL_COEFFICIENTS-1];
for (size_t i = 1; i < TOTAL_COEFFICIENTS; i++) {
size_t j = TOTAL_COEFFICIENTS-i-1;
g = g*z + A[j];
}
return g;
}
int main(int /*argc*/, char** /*argv*/) {
#if USE_SIGNAL_HANDLER
#if _WIN32
SetConsoleCtrlHandler(sighandler, TRUE);
#else
struct sigaction sigact;
sigact.sa_handler = sighandler;
sigemptyset(&sigact.sa_mask);
sigact.sa_flags = 0;
sigaction(SIGINT, &sigact, nullptr);
sigaction(SIGTERM, &sigact, nullptr);
sigaction(SIGQUIT, &sigact, nullptr);
sigaction(SIGPIPE, &sigact, nullptr);
#endif
#endif
coefficient_t poly[TOTAL_COEFFICIENTS] = {coefficient_t(0)};
static_assert(X_SAMPLE_END > X_SAMPLE_START, "Range of evaluation must be from x_min to x_max");
constexpr grad_t EVAL_STEP = (X_SAMPLE_END-X_SAMPLE_START)/grad_t(TOTAL_SAMPLES);
auto X_in = std::vector<grad_t>(TOTAL_SAMPLES);
auto Y_expected = std::vector<grad_t>(TOTAL_SAMPLES);
auto H_in = std::vector<grad_t>(TOTAL_SAMPLES);
for (size_t i = 0; i < TOTAL_SAMPLES; i++) {
const grad_t x = X_SAMPLE_START + grad_t(i)*EVAL_STEP;
const grad_t y = get_y_target(x);
const grad_t h = get_h(x);
X_in[i] = x;
Y_expected[i] = y;
H_in[i] = h;
}
// normalised gradients for each term
// de/da[n] = 2*(f-y)*h*x^2n
// = 2*e(x)*h*x^2n
// Assume the residual e(x) converges to 0 and scales like x^(2n), then
// de/da[n] ~ 2*h*x^(4n)
// In reality the x^4n factor is approximated as x^(an+b)
// where a,b are hyperparameters
grad_t gradient_scale[TOTAL_COEFFICIENTS] = {0.0};
for (size_t i = 0; i < TOTAL_COEFFICIENTS; i++) {
grad_t scale = 0.0;
for (size_t j = 0; j < TOTAL_SAMPLES; j++) {
const grad_t x = grad_t(X_in[j]);
const grad_t h = grad_t(H_in[j]);
const grad_t n = grad_t(i)*GRAD_RESCALE_MAGNITUDE + GRAD_RESCALE_BIAS;
const grad_t x_n = std::pow(x, n);
scale += (h*x_n);
}
scale = std::abs(scale);
scale /= grad_t(TOTAL_SAMPLES);
gradient_scale[i] = grad_t(1) / scale;
}
printf("grad_rescale = [");
for (size_t i = 0; i < TOTAL_COEFFICIENTS; i++) {
printf("%.2e,", gradient_scale[i]);
}
printf("]\n");
printf("is_update_learning_rate: %u\n", IS_UPDATE_LEARNING_RATE);
constexpr grad_t NORM_SAMPLES = 1.0f / grad_t(TOTAL_SAMPLES);
grad_t learning_rate = INITIAL_LEARNING_RATE;
coefficient_t best_poly[TOTAL_COEFFICIENTS] = {coefficient_t(0)};
grad_t error_minimum = grad_t(1e6);
size_t error_minimum_iter = 0;
size_t total_iter_error_plateau = 0;
for (size_t iter = 0; ; iter++) {
grad_t total_error = grad_t(0);
grad_t gradients[TOTAL_COEFFICIENTS] = {grad_t(0)};
for (size_t i = 0; i < TOTAL_SAMPLES; i++) {
const coefficient_t x = X_in[i];
const coefficient_t y_given = coefficient_t(H_in[i])*get_horner_polynomial(coefficient_t(x), poly);
const grad_t y_expected = Y_expected[i];
const grad_t error = grad_t(y_given) - y_expected;
// gradient descent
// g(x) = sum a[n]*x^(2n)
// f(x) = g(x)*h(x)
// e(x) = [f(x) - y]^2
// de/df = 2*(f-y)
// df/dg = h
// de/dg = de/df * df/dg, chain rule
// de/dg = 2*(f-y)*h
// de/da[n] = de/dg * dg/da[n]
// dg/da[n] = x^(2n)
// de/da[n] = 2*(f-y)*h*x^2n
const grad_t z = x*x;
const grad_t h = H_in[i];
grad_t gradient = error*h*grad_t(2);
gradients[0] += gradient;
for (size_t j = 1; j < TOTAL_COEFFICIENTS; j++) {
gradient *= z;
gradients[j] += gradient;
}
total_error += std::abs(error);
}
// normalised error and gradient
total_error *= NORM_SAMPLES;
for (size_t i = 0; i < TOTAL_COEFFICIENTS; i++) {
const grad_t scale = NORM_SAMPLES * gradient_scale[i];
gradients[i] *= scale;
}
// save lowest error polynomial
if (total_error < error_minimum && !std::isnan(total_error)) {
error_minimum = total_error;
error_minimum_iter = iter;
total_iter_error_plateau = 0;
for (size_t i = 0; i < TOTAL_COEFFICIENTS; i++) {
best_poly[i] = poly[i];
}
} else {
total_iter_error_plateau++;
}
if (iter % PRINT_ITER == 0) {
printf("[%zu] error=%.2e, grad=[", iter, total_error);
for (size_t i = 0; i < TOTAL_COEFFICIENTS; i++) {
printf("%.2e,", gradients[i]);
}
printf("]\n");
}
// apply gradients
bool has_nan = false;
for (size_t i = 0; i < TOTAL_COEFFICIENTS; i++) {
const grad_t delta = gradients[i] * learning_rate;
const grad_t coeff = grad_t(poly[i]) - delta;
poly[i] = coefficient_t(coeff);
if (std::abs(coeff) > 1e5f) {
printf("[%zu] [error] Detected exploding coefficient exiting\n", iter);
has_nan = true;
break;
}
if (std::isnan(coeff) || std::isinf(coeff)) {
printf("[%zu] [error] Detected NAN in coefficient exiting\n", iter);
has_nan = true;
break;
}
}
if (has_nan) {
printf("[%zu] [error] Early exit due to bad values, printing them now\n", iter);
printf(" learning_rate: %.2e\n", learning_rate);
printf(" grad: [");
for (size_t i = 0; i < TOTAL_COEFFICIENTS; i++) {
printf("%.2e,", gradients[i]);
}
printf("]\n");
printf(" poly: [");
for (size_t i = 0; i < TOTAL_COEFFICIENTS; i++) {
printf("%.2e,", poly[i]);
}
printf("]\n");
break;
}
if (total_iter_error_plateau >= TOTAL_ITER_ERROR_PLATEAU_THRESHOLD) {
if (IS_UPDATE_LEARNING_RATE) {
total_iter_error_plateau = 0;
learning_rate *= LEARNING_RATE_RESCALER;
} else {
printf("[%zu] [info] Exiting since error has plateaued after %zu iters\n",
iter, TOTAL_ITER_ERROR_PLATEAU_THRESHOLD);
break;
}
}
if (!is_running) break;
}
printf("\n[BEST RESULTS]\n");
printf("step: %zu\n", error_minimum_iter);
printf("mean_absolute_error: %.2e\n", error_minimum);
printf("poly[%zu] = {", TOTAL_COEFFICIENTS);
for (size_t i = 0; i < TOTAL_COEFFICIENTS; i++) {
constexpr size_t total_dp = (sizeof(coefficient_t) == 4) ? 10 : 16;
printf("%.*f", int(total_dp), best_poly[i]);
if (i != TOTAL_COEFFICIENTS-1) {
printf(",");
}
}
printf("}\n");
return 0;
}
// Definition of Chebyshev polynomial
Tn(cos(x)) = cos(nx)
T0(cos(x)) = cos(0x) = 1
T1(cos(x)) = cos(x)
T2(cos(x)) = cos(2x) = 2cos(x)^2 - 1
Let X = cos(x)
- T0(X) = 1
- T1(X) = X
- T2(X) = 2X^2 - 1
// Deriving trigonometric identities
e^j(a+b) = e^ja * e^jb
cis(a+b) = cis(a) * cis(b)
cos(a+b) = cos(a)cos(b) - sin(a)sin(b)
sin(a+b) = cos(a)sin(b) + sin(a)cos(b)
e^j(a-b) = e^ja * e^j(-b)
cis(a-b) = cis(a)*cis(-b)
cos(a-b) = cos(a)cos(b) + sin(a)sin(b)
sin(a-b) = -cos(a)sin(b) + sin(a)cos(b)
cos(a+b) + cos(a-b) = 2cos(a)cos(b)
cos(a)cos(b) = 1/2 * [ cos(a+b) + cos(a-b) ]
cos((n+1)x) = cos(x)cos(nx) - sin(x)sin(nx)
// Derive recursive definition for Chebyshev polynomials using trigonometric identities
T_(n+1)(cos(x)) = cos((n+1)x) = cos(x)cos(nx) - sin(x)sin(nx)
T_(n+1)(X) = X*Tn(X) - sin(x)sin(nx)
T_(n-1)(cos(x)) = cos(-x)cos(nx) - sin(-x)sin(nx)
T_(n-1)(X) = X*T_n(X) + sin(x)sin(nx)
T_(n+1)(X) + T_(n-1)(X) = 2*X*T_n(X)
T_(n+1)(X) = 2*X*T_n(X) - T_(n-1)(X)
T3(X) = 2X*T2(X) - T1(X)
T3(X) = 2X*(2X^2-1) - X
T3(X) = 4X^3 - 3X
// Derivation of closed form solutions to cosine integrals
cos(nx)*cos(mx)
= 1/2 * [ cos((n+m)*x) + cos((n-m)*x) ]
- Need to choose a monotonic slice since we are going to use substitution X = cos(x)
- For this to work with an integral the mapping between X <-> cos(x) must be reversible
int_0^pi cos(kx) dx = 0 for integer k != 0 | Integrals of harmonics are 0
I(x) = int_0^pi cos(nx)cos(mx) dx
= int_0^pi 1/2 * [ cos((n+m)*x) + cos((n-m)*x) ] dx
when n != m, I(x) = int_0^pi 1/2 * [ cos(k1*x) + cos(k2*x) ] dx = 0
when n == m, I(x) = int_0^pi 1/2 * [ cos(k1*x) + cos(0) ] dx = int_0^pi 1/2 dx = pi/2
when n == m == 0, I(x) = int_0^pi 1/2 * [cos(0) + cos(0) ] dx = int_0^pi dx = pi
// Expressing cosine integrals in terms of Chebyshev polynomials
Letting X = cos(x)
dX = -sin(x) dx | X^2 = cos^2(x) = 1-sin^2(x) | sin(x) = sqrt(1-X^2)
dx = -1/sqrt(1-X^2) dX
I(x) = int_1^-1 -Tn(X)*Tm(X)/sqrt(1-X^2) dX
I(x) = int_-1^1 Tn(X)*Tm(X)/sqrt(1-X^2) dX
when n != m, I(x) = 0
when n == m, I(x) = pi/2
when n == m == 0, I(x) = pi
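These three values can be checked numerically by undoing the substitution X = cos(x), which turns the weighted integral back into int_0^pi cos(nx)cos(mx) dx. A small sketch (function name and sample count are arbitrary choices):

```cpp
#define _USE_MATH_DEFINES
#include <cstdio>
#include <cmath>

// Numerically evaluate I = int_-1^1 Tn(X)*Tm(X)/sqrt(1-X^2) dX
// via the substitution X = cos(x), which gives int_0^pi cos(nx)*cos(mx) dx.
static double chebyshev_inner_product(int n, int m, size_t total_samples) {
    const double dx = M_PI/double(total_samples);
    double sum = 0.0;
    for (size_t i = 0; i < total_samples; i++) {
        const double x = (double(i) + 0.5)*dx; // midpoint rule
        sum += std::cos(n*x)*std::cos(m*x);
    }
    return sum*dx;
}

int main() {
    // Expected: 0 when n != m, pi/2 when n == m != 0, pi when n == m == 0
    printf("n=2,m=3: %.6f\n", chebyshev_inner_product(2, 3, 4096));
    printf("n=2,m=2: %.6f (pi/2 = %.6f)\n", chebyshev_inner_product(2, 2, 4096), M_PI*0.5);
    printf("n=0,m=0: %.6f (pi   = %.6f)\n", chebyshev_inner_product(0, 0, 4096), M_PI);
    return 0;
}
```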
// Using closed form integral solutions to represent arbitrary functions as sum of Chebyshev polynomials
- Approximate f(x) as a sum of Chebyshev polynomials
f(x) = sum_n An*Tn(x)
- Expanding the following integral with f(x) shows we can derive An for Tn(x)
I(x) = int_-1^1 f(x)*Tm(x)/sqrt(1-x^2) dx
I(x) = sum_n An int_-1^1 Tm(x)Tn(x)/sqrt(1-x^2) dx | substitute f(x) = sum_n An*Tn(x) | Notice the nested sum
I(x) = sum_n An In(x) | In(x) = int_-1^1 Tm(x)Tn(x)/sqrt(1-x^2) dx
- Substitute our closed form solutions to integrals involving Chebyshev polynomials
when m != n, In(x) = 0 | This property is called orthogonality
when m == n, In(x) = pi/2
when m == n == 0, In(x) = pi
- We can now solve for An for each unique value of n
- Get rid of integrals with Tm(x)Tn(x) where m != n, since this will evaluate to 0
I(x) = An In(x) | In(x) = int_-1^1 Tn(x)Tn(x)/sqrt(1-x^2) dx
- We have In(x) solved and An is what we want
- Solve for I(x) by using initial definition
I(x) = int_-1^1 f(x)*Tn(x)/sqrt(1-x^2) dx where m == n | Initial definition
I(x) = An*In(x) = int_-1^1 f(x)*Tn(x)/sqrt(1-x^2) dx
An = 1/In(x) * int_-1^1 f(x)*Tn(x)/sqrt(1-x^2) dx
where In(x) = pi/2 for n != 0
In(x) = pi for n == 0
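One direct way to evaluate this (different from the piecewise approach described next) is to apply the same substitution X = cos(x), giving An = 1/In * int_0^pi f(cos(x))*cos(nx) dx, which avoids the 1/sqrt(1-x^2) endpoint singularity. A hedged sketch, with an illustrative helper name, midpoint-rule quadrature, and an arbitrary sample count:

```cpp
#define _USE_MATH_DEFINES
#include <cstdio>
#include <cmath>
#include <vector>

// An = 1/In * int_-1^1 f(X)*Tn(X)/sqrt(1-X^2) dX
//    = 1/In * int_0^pi f(cos(x))*cos(nx) dx, with In = pi (n == 0) or pi/2 (n != 0)
template <typename F>
static std::vector<double> chebyshev_coefficients(F&& f, size_t total_coefficients, size_t total_samples) {
    auto A = std::vector<double>(total_coefficients);
    const double dx = M_PI/double(total_samples);
    for (size_t n = 0; n < total_coefficients; n++) {
        double I = 0.0;
        for (size_t i = 0; i < total_samples; i++) {
            const double x = (double(i) + 0.5)*dx; // midpoint rule
            I += f(std::cos(x))*std::cos(double(n)*x);
        }
        I *= dx;
        const double In = (n == 0) ? M_PI : M_PI*0.5;
        A[n] = I/In;
    }
    return A;
}

int main() {
    // Chebyshev coefficients of f(X) = sin(2*pi*X) over X = [-1, 1]
    const auto A = chebyshev_coefficients([](double X) { return std::sin(2.0*M_PI*X); }, 8, 4096);
    for (size_t n = 0; n < A.size(); n++) {
        printf("A[%zu] = %+.6e\n", n, A[n]);
    }
    return 0;
}
```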
// How do we calculate I(x) = int_-1^1 f(x)*Tn(x)/sqrt(1-x^2) dx ?
- Tn(x) is a Chebyshev polynomial, i.e. Tn(x) = sum_i ai*x^i
- Solve the integral as a sum of small integrals
- I(x) = sum_j int_aj^bj fj(x)*Tn(x)/sqrt(1-x^2) dx
- |bj-aj| == very small so we can use approximations for fj(x)
- Use a good approximation for fj(x) in the range x in [aj,bj]
- Zero order hold: fj(x) = [f(aj)+f(bj)]/2
- Linear fit: fj(x) = m0 + m1*x where fj(aj)=f(aj) and fj(bj)=f(bj)
- Quadratic fit: fj(x) = m0 + m1*x + m2*x^2
- Notice all of these approximations are polynomials
- If we approximate fj(x) between [aj,bj] as a polynomial, and Tn(x) is a Chebyshev polynomial
- Then Gj(x) = fj(x)*Tn(x) will also be a polynomial
- Gj(x) = sum_i ai*x^i
I(x) = int_-1^1 f(x)*Tn(x)/sqrt(1-x^2) dx
= sum_j int_aj^bj fj(x)*Tn(x)/sqrt(1-x^2) dx | x = [aj,bj] is our region where the polynomial fit fj(x) is made
= sum_j int_aj^bj Gj(x)/sqrt(1-x^2) dx
= sum_j int_aj^bj sum_i ai*x^i/sqrt(1-x^2) dx
= sum_j sum_i ai * int_aj^bj x^i/sqrt(1-x^2) dx | ai is our polynomial coefficient
// Deriving recursive closed form solution to integral of Jn(x) = x^n/sqrt(1-x^2)
Jn(x) = int x^n / sqrt(1-x^2) dx
= int x^(n-1) * x / sqrt(1-x^2) dx
- Using integral by parts: int u dv = uv - int v du
- u = x^(n-1) | du = (n-1)*x^(n-2)
- dv = x / sqrt(1-x^2) | v = -sqrt(1-x^2)
Jn(x) = - x^(n-1) * sqrt(1-x^2) + int sqrt(1-x^2) * (n-1) * x^(n-2) dx
= -x^(n-1) * sqrt(1-x^2) + (n-1) * int x^(n-2) * sqrt(1-x^2) dx
= -x^(n-1) * sqrt(1-x^2) + (n-1) * int x^(n-2) * (1-x^2) / sqrt(1-x^2) dx
= -x^(n-1) * sqrt(1-x^2) - (n-1) * int x^n / sqrt(1-x^2) dx + (n-1) * int x^(n-2) / sqrt(1-x^2) dx
= -x^(n-1) * sqrt(1-x^2) - (n-1) Jn(x) + (n-1) J_(n-2)(x)
Jn(x) = 1/n * [ -x^(n-1) * sqrt(1-x^2) + (n-1) * J_(n-2)(x) ]
- Get initial solutions for recursive solution
J0(x) = int 1 / sqrt(1-x^2) dx = arcsin(x)
J1(x) = int x / sqrt(1-x^2) dx = -sqrt(1-x^2)
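A minimal sketch of this recursion, evaluating the definite integral int_a^b x^n/sqrt(1-x^2) dx as Jn(b) - Jn(a) (the helper name is illustrative):

```cpp
#include <cstdio>
#include <cmath>

// Antiderivative Jn(x) = int x^n / sqrt(1-x^2) dx, valid for |x| <= 1
// J0(x) = arcsin(x), J1(x) = -sqrt(1-x^2)
// Jn(x) = 1/n * [ -x^(n-1)*sqrt(1-x^2) + (n-1)*J_(n-2)(x) ]
static double J(size_t n, double x) {
    if (n == 0) return std::asin(x);
    if (n == 1) return -std::sqrt(1.0 - x*x);
    return (-std::pow(x, double(n-1))*std::sqrt(1.0 - x*x) + double(n-1)*J(n-2, x))/double(n);
}

int main() {
    // int_-1^1 x^2/sqrt(1-x^2) dx = pi/2 ~ 1.570796
    printf("int_-1^1 x^2/sqrt(1-x^2) dx = %.6f\n", J(2, 1.0) - J(2, -1.0));
    return 0;
}
```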
// Putting all together
An = 1/In(x) * int_-1^1 f(x)*Tn(x)/sqrt(1-x^2) dx
An = 1/In(x) * I(x)
where In(x) = pi/2 for n != 0
In(x) = pi for n == 0
I(x) = int_-1^1 f(x)*Tn(x)/sqrt(1-x^2) dx
= sum_j sum_i ai * int_aj^bj x^i/sqrt(1-x^2) dx
= sum_j sum_i ai * [Ji(bj) - Ji(aj)]
- j = index of x = [aj,bj] where fj(x) approximates f(x)
- i = power of x for the polynomial coefficient ai in Gj(x) = fj(x)*Tn(x) = sum_i ai*x^i
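Putting the pieces together, a hedged sketch that estimates An for an example f(x) using constant (zero order hold) fits on each segment and the Jn recursion above; it uses the segment midpoint f((aj+bj)/2) as the constant, and the names and segment count are illustrative:

```cpp
#define _USE_MATH_DEFINES
#include <cstdio>
#include <cmath>
#include <vector>

// Antiderivative Jn(x) = int x^n/sqrt(1-x^2) dx (see recursion above)
static double J(size_t n, double x) {
    if (n == 0) return std::asin(x);
    if (n == 1) return -std::sqrt(1.0 - x*x);
    return (-std::pow(x, double(n-1))*std::sqrt(1.0 - x*x) + double(n-1)*J(n-2, x))/double(n);
}

// Monomial coefficients of Tn(x) from T_(n+1)(x) = 2x*T_n(x) - T_(n-1)(x)
static std::vector<double> chebyshev_monomial_coefficients(size_t n) {
    std::vector<double> T_prev = {1.0};      // T0
    std::vector<double> T_curr = {0.0, 1.0}; // T1
    if (n == 0) return T_prev;
    for (size_t k = 1; k < n; k++) {
        std::vector<double> T_next(k+2, 0.0);
        for (size_t i = 0; i < T_curr.size(); i++) T_next[i+1] += 2.0*T_curr[i]; // 2x*T_k
        for (size_t i = 0; i < T_prev.size(); i++) T_next[i] -= T_prev[i];       // - T_(k-1)
        T_prev = T_curr;
        T_curr = T_next;
    }
    return T_curr;
}

// An = 1/In * sum_j f((aj+bj)/2) * sum_i ai * [Ji(bj) - Ji(aj)]
// with a constant fit fj(x) = f((aj+bj)/2) on each segment [aj, bj]
template <typename F>
static double chebyshev_An(F&& f, size_t n, size_t total_segments) {
    const auto a = chebyshev_monomial_coefficients(n);
    const double dx = 2.0/double(total_segments);
    double I = 0.0;
    for (size_t j = 0; j < total_segments; j++) {
        const double aj = -1.0 + double(j)*dx;
        const double bj = aj + dx;
        double segment = 0.0;
        for (size_t i = 0; i < a.size(); i++) segment += a[i]*(J(i, bj) - J(i, aj));
        I += f(0.5*(aj+bj))*segment;
    }
    const double In = (n == 0) ? M_PI : M_PI*0.5;
    return I/In;
}

int main() {
    // Chebyshev coefficients for f(x) = sin(2*pi*x) over x = [-1, 1]
    for (size_t n = 0; n < 8; n++) {
        printf("A[%zu] = %+.6e\n", n, chebyshev_An([](double x) { return std::sin(2.0*M_PI*x); }, n, 1024));
    }
    return 0;
}
```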
williamyang98 commented Jan 23, 2024

Gradient descent or direct computation

  • Directly computing the Chebyshev coefficients gives a close result, and the error decreases as the number of Chebyshev polynomials increases.
  • We can improve on the direct computation by using gradient descent, which gives a lower error for the same polynomial order.
  • The analytically derived result and the gradient descent result produce similar polynomials, but the gradient descent version has a lower error.

Using gradient descent solver

  • Use doubles for coefficients and gradients for best approximation.
  • Increasing the number of samples spreads the error more evenly between the roots; however, this has diminishing returns past a certain sample count.
  • Using roots=[-0.5,0,+0.5] means we can use x = x - round(x) to keep f(x) = sin(2πx) within the valid approximation range (see the usage sketch after the coefficient tables below).
    • _mm_round_ps and _mm256_round_ps are x86 SIMD instructions, so we can vectorise the input clamping.
    • Much cheaper than an arbitrary floating point modulus.
  • Converting polynomial coefficients from double to float will introduce error.
    • We can change coefficient_t to float in the solver and use the double solution as our initial poly[] = {}.
    • Keep grad_t as double.
    • Running the solver will give optimised values for float coefficients.

Calculated polynomials for f(x) = sin(2πx) and their mean absolute error

Using f64 for coefficients and gradient with 1024 samples.

| N | MAE | Coefficients |
| - | --- | ------------ |
| 6 | 7.07e-10 | {-25.1327411142213464,64.8358266034100694,-67.0768851968012996,38.4999814590310407,-14.0736995967404166,3.2086237284325541} |
| 5 | 6.01e-8 | {-25.1327327703024501,64.8348800694709126,-67.0481124846729131,38.1547044862267200,-12.3089616248433558} |

Using f32 for coefficients with initial estimate from f64 solution.

| N | MAE | Coefficients |
| - | --- | ------------ |
| 6 | 2.47e-8 | {-25.1327419281,64.8358230591,-67.0768585205,38.5037078857,-14.1102266312,3.2983388901} |
| 5 | 6.32e-8 | {-25.1327323914,64.8348770142,-67.0481109619,38.1546096802,-12.3083391190} |
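
As a usage sketch (not part of the gist), the f32 N=6 coefficients above can be combined with the solver's h(x) = x*(x^2 - 0.25) factor and x = x - round(x) range reduction. The input is in turns, so sin_2pi(x) approximates sin(2*pi*x); the function name is illustrative:

```cpp
#define _USE_MATH_DEFINES
#include <cstdio>
#include <cmath>

// Approximate sin(2*pi*x), x in turns, using the f32 N=6 coefficients from the table above.
// f(x) ~ h(x)*g(x), where h(x) = x*(x^2 - 0.25) and g(x) = sum a[n]*x^(2n) via Horner's method.
static float sin_2pi(float x) {
    x = x - std::round(x); // range reduction to [-0.5, +0.5], roots at -0.5, 0, +0.5
    constexpr float A[6] = {
        -25.1327419281f, 64.8358230591f, -67.0768585205f,
         38.5037078857f, -14.1102266312f,  3.2983388901f,
    };
    const float z = x*x;
    float g = A[5];
    for (int i = 4; i >= 0; i--) {
        g = g*z + A[i];
    }
    return x*(z - 0.25f)*g;
}

int main() {
    for (float x = 0.0f; x <= 1.0f; x += 0.125f) {
        printf("x=%.3f: approx=%+.7f, reference=%+.7f\n", x, sin_2pi(x), float(std::sin(2.0*M_PI*x)));
    }
    return 0;
}
```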

Polynomials from original article

  • https://mooooo.ooo/chebyshev-sine-approximation
  • poly[6] = {-25.1327419281,64.8358230591,-67.0766296387,38.495880127,-14.0496635437,3.16160202026} @ 2.98e-8
  • The original article uses an analytical solution and then brute-forces the coefficients to minimize the error.
