Calculate a Chebyshev polynomial that will approximate f(x) = sin(kx), where k = π/root. Uses gradient descent.
// Compiling with clang or gcc
// clang main.cpp -o main -march=native -O3 -ffast-math -Wextra -Werror -Wunused-parameter -lm
// Use "-march=native -ffast-math -O3" for faster gradient descent
// Use "-lm" if linking to glibc math libraries on Linux
// Remove "-W*" warning flags if you don't need checks
#define _USE_MATH_DEFINES
#include <stdio.h>
#include <cmath>
#include <vector>
#define USE_SIGNAL_HANDLER 1
#if USE_SIGNAL_HANDLER
static bool is_running = true;
#if _WIN32
#define WIN32_LEAN_AND_MEAN
#define NOMINMAX
#include <windows.h>
BOOL WINAPI sighandler(DWORD signum) {
if (signum == CTRL_C_EVENT) {
fprintf(stderr, "Signal caught, exiting!\n");
is_running = false;
return TRUE;
}
return FALSE;
}
#else
#include <errno.h>
#include <signal.h>
static void sighandler(int signum) {
fprintf(stderr, "Signal caught, exiting! (%d)\n", signum);
is_running = false;
}
#endif
#endif
// Configuration
using grad_t = double;
using coefficient_t = double;
constexpr size_t PRINT_ITER = 10'000'000;
constexpr size_t TOTAL_ITER_ERROR_PLATEAU_THRESHOLD = 10'000;
// Learning rate update rule when minimum error reaches a plateau
// - If updating is enabled then rescale the learning rate
// - Otherwise we exit early
constexpr bool IS_UPDATE_LEARNING_RATE = true;
constexpr grad_t LEARNING_RATE_RESCALER = 1.1;
// We are solving for a[n] in g(x) = sum a[n]*x^(2n)
// Where f(x) = h(x)*g(x), where:
// - f(x) = target function
// - h(x) = modifier function to reduce complexity of function g(x) needs to solve
// - g(x) = polynomial that can be calculated using Horner's method
// https://en.wikipedia.org/wiki/Horner%27s_method
// Gradient descent hyperparameters
// - We are approximating f(x) in the range: x = [X_SAMPLE_START, X_SAMPLE_END] with TOTAL_SAMPLES
// - g(x) = sum a[n]*x^(2n) for n = [0, TOTAL_COEFFICIENTS-1]
// - learning rate: INITIAL_LEARNING_RATE
// - Decrease learning rate if unstable
// - rescaling gradients for de/da[n] by: x^(n*GRAD_RESCALE_MAGNITUDE + GRAD_RESCALE_BIAS)
// - NOTE: This drastically improves training speed, you should tweak it
// - Decrease GRAD_RESCALE_MAGNITUDE if training is unstable
// - NOTE: You should tweak this value before tweaking learning rate
// Sample functions to approximate
#if 1
// y = sin(2*pi*x)
template <typename T>
inline static T get_y_target(T x) {
return std::sin(T(M_PI)*2.0*x);
}
template <typename T>
inline static T get_h(T x) {
// constraint so that roots match sin(2*pi*x)
constexpr T B0 = T(0.5*0.5);
const T z = x*x;
return x*(z-B0);
}
constexpr size_t TOTAL_SAMPLES = 256;
constexpr grad_t X_SAMPLE_START = grad_t(0);
constexpr grad_t X_SAMPLE_END = grad_t(0.5);
constexpr size_t TOTAL_COEFFICIENTS = 6;
constexpr grad_t INITIAL_LEARNING_RATE = grad_t(1e0);
constexpr grad_t GRAD_RESCALE_MAGNITUDE = 4;
constexpr grad_t GRAD_RESCALE_BIAS = 1;
#elif 0
// y = cos(2*pi*x)
template <typename T>
inline static T get_y_target(T x) {
return std::cos(T(M_PI)*2.0*x);
}
template <typename T>
inline static T get_h(T x) {
// constraint so that roots match cos(2*pi*x)
constexpr T B0 = T(0.25*0.25);
constexpr T B1 = T(0.75*0.75);
const T z = x*x;
return (z-B0)*(z-B1);
}
constexpr size_t TOTAL_SAMPLES = 256;
constexpr grad_t X_SAMPLE_START = grad_t(0);
constexpr grad_t X_SAMPLE_END = grad_t(0.5 + 1e-2);
constexpr size_t TOTAL_COEFFICIENTS = 6;
constexpr grad_t INITIAL_LEARNING_RATE = grad_t(1e0);
constexpr grad_t GRAD_RESCALE_MAGNITUDE = 4;
constexpr grad_t GRAD_RESCALE_BIAS = 1;
#else
// y = x^2 * exp(pi*x^2) * sin(2pix)
template <typename T>
inline static T get_y_target(T x) {
const T z = x*x;
return z*std::exp(T(M_PI)*z)*std::sin(2*T(M_PI)*x);
}
template <typename T>
inline static T get_h(T x) {
constexpr T B0 = T(0.5*0.5);
const T z = x*x;
return z*x*(z-B0);
}
constexpr size_t TOTAL_SAMPLES = 256;
constexpr grad_t X_SAMPLE_START = grad_t(0);
constexpr grad_t X_SAMPLE_END = grad_t(0.5 + 1e-2);
constexpr size_t TOTAL_COEFFICIENTS = 8;
constexpr grad_t INITIAL_LEARNING_RATE = grad_t(1e0);
constexpr grad_t GRAD_RESCALE_MAGNITUDE = 3.6; // needed to be decreased for stable gradient descent
constexpr grad_t GRAD_RESCALE_BIAS = 1;
#endif
inline static coefficient_t get_horner_polynomial(coefficient_t x, coefficient_t* A) {
// g(x) = sum a[n]*x^(2n)
// Use Horner's method to calculate this efficiently
// https://en.wikipedia.org/wiki/Horner%27s_method
const coefficient_t z = x*x;
coefficient_t g = A[TOTAL_COEFFICIENTS-1];
for (size_t i = 1; i < TOTAL_COEFFICIENTS; i++) {
size_t j = TOTAL_COEFFICIENTS-i-1;
g = g*z + A[j];
}
return g;
}
int main(int /*argc*/, char** /*argv*/) {
#if USE_SIGNAL_HANDLER
#if _WIN32
SetConsoleCtrlHandler(sighandler, TRUE);
#else
struct sigaction sigact;
sigact.sa_handler = sighandler;
sigemptyset(&sigact.sa_mask);
sigact.sa_flags = 0;
sigaction(SIGINT, &sigact, nullptr);
sigaction(SIGTERM, &sigact, nullptr);
sigaction(SIGQUIT, &sigact, nullptr);
sigaction(SIGPIPE, &sigact, nullptr);
#endif
#endif
coefficient_t poly[TOTAL_COEFFICIENTS] = {coefficient_t(0)};
static_assert(X_SAMPLE_END > X_SAMPLE_START, "Range of evaluation must be from x_min to x_max");
constexpr grad_t EVAL_STEP = (X_SAMPLE_END-X_SAMPLE_START)/grad_t(TOTAL_SAMPLES);
auto X_in = std::vector<grad_t>(TOTAL_SAMPLES);
auto Y_expected = std::vector<grad_t>(TOTAL_SAMPLES);
auto H_in = std::vector<grad_t>(TOTAL_SAMPLES);
for (size_t i = 0; i < TOTAL_SAMPLES; i++) {
const grad_t x = X_SAMPLE_START + grad_t(i)*EVAL_STEP;
const grad_t y = get_y_target(x);
const grad_t h = get_h(x);
X_in[i] = x;
Y_expected[i] = y;
H_in[i] = h;
}
// normalised gradients for each term
// de/da[n] = 2*(f-y)*h*x^2n
// = 2*e(x)*h*x^2n
// Assume the residual e(x) converges to 0 and scales like x^(2n), then
// de/da[n] ~ 2*h*x^(4n)
// In reality the x^4n factor is approximated as x^(an+b)
// where a,b are hyperparameters
grad_t gradient_scale[TOTAL_COEFFICIENTS] = {0.0};
for (size_t i = 0; i < TOTAL_COEFFICIENTS; i++) {
grad_t scale = 0.0;
for (size_t j = 0; j < TOTAL_SAMPLES; j++) {
const grad_t x = grad_t(X_in[j]);
const grad_t h = grad_t(H_in[j]);
const grad_t n = grad_t(i)*GRAD_RESCALE_MAGNITUDE + GRAD_RESCALE_BIAS;
const grad_t x_n = std::pow(x, n);
scale += (h*x_n);
}
scale = std::abs(scale);
scale /= grad_t(TOTAL_SAMPLES);
gradient_scale[i] = grad_t(1) / scale;
}
printf("grad_rescale = [");
for (size_t i = 0; i < TOTAL_COEFFICIENTS; i++) {
printf("%.2e,", gradient_scale[i]);
}
printf("]\n");
printf("is_update_learning_rate: %u\n", IS_UPDATE_LEARNING_RATE);
constexpr grad_t NORM_SAMPLES = 1.0f / grad_t(TOTAL_SAMPLES);
grad_t learning_rate = INITIAL_LEARNING_RATE;
coefficient_t best_poly[TOTAL_COEFFICIENTS] = {coefficient_t(0)};
grad_t error_minimum = grad_t(1e6);
size_t error_minimum_iter = 0;
size_t total_iter_error_plateau = 0;
for (size_t iter = 0; ; iter++) {
grad_t total_error = grad_t(0);
grad_t gradients[TOTAL_COEFFICIENTS] = {grad_t(0)};
for (size_t i = 0; i < TOTAL_SAMPLES; i++) {
const coefficient_t x = X_in[i];
const coefficient_t y_given = coefficient_t(H_in[i])*get_horner_polynomial(coefficient_t(x), poly);
const grad_t y_expected = Y_expected[i];
const grad_t error = grad_t(y_given) - y_expected;
// gradient descent
// g(x) = sum a[n]*x^(2n)
// f(x) = g(x)*h(x)
// e(x) = [f(x) - y]^2
// de/df = 2*(f-y)
// df/dg = h
// de/dg = de/df * df/dg, chain rule
// de/dg = 2*(f-y)*h
// de/da[n] = de/dg * dg/da[n]
// dg/da[n] = x^(2n)
// de/da[n] = 2*(f-y)*h*x^2n
const grad_t z = x*x;
const grad_t h = H_in[i];
grad_t gradient = error*h*grad_t(2);
gradients[0] += gradient;
for (size_t j = 1; j < TOTAL_COEFFICIENTS; j++) {
gradient *= z;
gradients[j] += gradient;
}
total_error += std::abs(error);
}
// normalised error and gradient
total_error *= NORM_SAMPLES;
for (size_t i = 0; i < TOTAL_COEFFICIENTS; i++) {
const grad_t scale = NORM_SAMPLES * gradient_scale[i];
gradients[i] *= scale;
}
// save lowest error polynomial
if (total_error < error_minimum && !std::isnan(total_error)) {
error_minimum = total_error;
error_minimum_iter = iter;
total_iter_error_plateau = 0;
for (size_t i = 0; i < TOTAL_COEFFICIENTS; i++) {
best_poly[i] = poly[i];
}
} else {
total_iter_error_plateau++;
}
if (iter % PRINT_ITER == 0) {
printf("[%zu] error=%.2e, grad=[", iter, total_error);
for (size_t i = 0; i < TOTAL_COEFFICIENTS; i++) {
printf("%.2e,", gradients[i]);
}
printf("]\n");
}
// apply gradients
bool has_nan = false;
for (size_t i = 0; i < TOTAL_COEFFICIENTS; i++) {
const grad_t delta = gradients[i] * learning_rate;
const grad_t coeff = grad_t(poly[i]) - delta;
poly[i] = coefficient_t(coeff);
if (std::abs(coeff) > 1e5f) {
printf("[%zu] [error] Detected exploding coefficient exiting\n", iter);
has_nan = true;
break;
}
if (std::isnan(coeff) || std::isinf(coeff)) {
printf("[%zu] [error] Detected NAN in coefficient exiting\n", iter);
has_nan = true;
break;
}
}
if (has_nan) {
printf("[%zu] [error] Early exit due to bad values, printing them now\n", iter);
printf(" learning_rate: %.2e\n", learning_rate);
printf(" grad: [");
for (size_t i = 0; i < TOTAL_COEFFICIENTS; i++) {
printf("%.2e,", gradients[i]);
}
printf("]\n");
printf(" poly: [");
for (size_t i = 0; i < TOTAL_COEFFICIENTS; i++) {
printf("%.2e,", poly[i]);
}
printf("]\n");
break;
}
if (total_iter_error_plateau >= TOTAL_ITER_ERROR_PLATEAU_THRESHOLD) {
if (IS_UPDATE_LEARNING_RATE) {
total_iter_error_plateau = 0;
learning_rate *= LEARNING_RATE_RESCALER;
} else {
printf("[%zu] [info] Exiting since error has plateaued after %zu iters\n",
iter, TOTAL_ITER_ERROR_PLATEAU_THRESHOLD);
break;
}
}
if (!is_running) break;
}
printf("\n[BEST RESULTS]\n");
printf("step: %zu\n", error_minimum_iter);
printf("mean_absolute_error: %.2e\n", error_minimum);
printf("poly[%zu] = {", TOTAL_COEFFICIENTS);
for (size_t i = 0; i < TOTAL_COEFFICIENTS; i++) {
constexpr size_t total_dp = (sizeof(coefficient_t) == 4) ? 10 : 16;
printf("%.*f", int(total_dp), best_poly[i]);
if (i != TOTAL_COEFFICIENTS-1) {
printf(",");
}
}
printf("}\n");
return 0;
}
// Definition of Chebyshev polynomial
Tn(cos(x)) = cos(nx)
T0(cos(x)) = cos(0x) = 1
T1(cos(x)) = cos(x)
T2(cos(x)) = cos(2x) = 2cos(x)^2 - 1
Let X = cos(x)
- T0(X) = 1
- T1(X) = X
- T2(X) = 2X^2 - 1
// Deriving trigonometric identities
e^j(a+b) = e^ja * e^jb
cis(a+b) = cis(a) * cis(b)
cos(a+b) = cos(a)cos(b) - sin(a)sin(b)
sin(a+b) = cos(a)sin(b) + sin(a)cos(b)
e^j(a-b) = e^ja * e^j(-b)
cis(a-b) = cis(a)*cis(-b)
cos(a-b) = cos(a)cos(b) + sin(a)sin(b)
sin(a-b) = -cos(a)sin(b) + sin(a)cos(b)
cos(a+b) + cos(a-b) = 2cos(a)cos(b)
cos(a)cos(b) = 1/2 * [ cos(a+b) + cos(a-b) ]
cos((n+1)x) = cos(x)cos(nx) - sin(x)sin(nx)
// Derive recursive definition for Chebyshev polynomials using trigonometric identities
T_(n+1)(cos(x)) = cos((n+1)x) = cos(x)cos(nx) - sin(x)sin(nx)
T_(n+1)(X) = X*Tn(X) - sin(x)sin(nx)
T_(n-1)(cos(x)) = cos(-x)cos(nx) - sin(-x)sin(nx)
T_(n-1)(X) = X*T_n(X) + sin(x)sin(nx)
T_(n+1)(X) + T_(n-1)(X) = 2*X*T_n(X)
T_(n+1)(X) = 2*X*T_n(X) - T_(n-1)(X)
T3(X) = 2X*T2(X) - T1(X)
T3(X) = 2X*(2X^2-1) - X
T3(X) = 4X^3 - 3X
// Derivation of closed form solutions to cosine integrals
cos(nx)*cos(mx)
= 1/2 * [ cos((n+m)*x) + cos((n-m)*x) ]
- Need to choose a monotonic slice since we are going to use substitution X = cos(x)
- For this to work with an integral the mapping between X <-> cos(x) must be reversible
int_0^pi cos(kx) dx = 0 for integer k != 0 | Integrals of harmonics are 0
I(x) = int_0^pi cos(nx)cos(mx) dx
= int_0^pi 1/2 * [ cos((n+m)*x) + cos((n-m)*x) ] dx
when n != m, I(x) = int_0^pi 1/2 * [ cos(k1*x) + cos(k2*x) ] dx = 0
when n == m, I(x) = int_0^pi 1/2 * [ cos(k1*x) + cos(0) ] dx = int_0^pi 1/2 dx = pi/2
when n == m == 0, I(x) = int_0^pi 1/2 * [cos(0) + cos(0) ] dx = int_0^pi dx = pi
// Expressing cosine integrals in terms of Chebyshev polynomials
Letting X = cos(x)
dX = -sin(x) dx | X^2 = cos^2(x) = 1-sin^2(x) | sin(x) = sqrt(1-X^2)
dx = -1/sqrt(1-X^2) dX
I(x) = int_1^-1 -Tn(X)*Tm(X)/sqrt(1-X^2) dX
I(x) = int_-1^1 Tn(X)*Tm(X)/sqrt(1-X^2) dX
when n != m, I(x) = 0
when n == m, I(x) = pi/2
when n == m == 0, I(x) = pi
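These three values can be checked numerically by undoing the substitution X = cos(x), which turns the weighted integral back into int_0^pi cos(nx)cos(mx) dx. A small sketch (function name and sample count are arbitrary choices):

```cpp
#define _USE_MATH_DEFINES
#include <cstdio>
#include <cmath>

// Numerically evaluate I = int_-1^1 Tn(X)*Tm(X)/sqrt(1-X^2) dX
// via the substitution X = cos(x), which gives int_0^pi cos(nx)*cos(mx) dx.
static double chebyshev_inner_product(int n, int m, size_t total_samples) {
    const double dx = M_PI/double(total_samples);
    double sum = 0.0;
    for (size_t i = 0; i < total_samples; i++) {
        const double x = (double(i) + 0.5)*dx; // midpoint rule
        sum += std::cos(n*x)*std::cos(m*x);
    }
    return sum*dx;
}

int main() {
    // Expected: 0 when n != m, pi/2 when n == m != 0, pi when n == m == 0
    printf("n=2,m=3: %.6f\n", chebyshev_inner_product(2, 3, 4096));
    printf("n=2,m=2: %.6f (pi/2 = %.6f)\n", chebyshev_inner_product(2, 2, 4096), M_PI*0.5);
    printf("n=0,m=0: %.6f (pi   = %.6f)\n", chebyshev_inner_product(0, 0, 4096), M_PI);
    return 0;
}
```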
// Using closed form integral solutions to represent arbitrary functions as sum of Chebyshev polynomials
- Approximate f(x) as a sum of Chebyshev polynomials
f(x) = sum_n An*Tn(x)
- Expanding the following integral with f(x) shows we can derive An for Tn(x)
I(x) = int_-1^1 f(x)*Tm(x)/sqrt(1-x^2) dx
I(x) = sum_n An int_-1^1 Tm(x)Tn(x)/sqrt(1-x^2) dx | substitute f(x) = sum_n An*Tn(x) | Notice the nested sum
I(x) = sum_n An In(x) | In(x) = int_-1^1 Tm(x)Tn(x)/sqrt(1-x^2) dx
- Substitute our closed form solutions to integrals involving Chebyshev polynomials
when m != n, In(x) = 0 | This property is called orthogonality
when m == n, In(x) = pi/2
when m == n == 0, In(x) = pi
- We can now solve for An for each unique value of n
- Get rid of integrals with Tm(x)Tn(x) where m != n, since this will evaluate to 0
I(x) = An In(x) | In(x) = int_-1^1 Tn(x)Tn(x)/sqrt(1-x^2) dx
- We have In(x) solved and An is what we want
- Solve for I(x) by using initial definition
I(x) = int_-1^1 f(x)*Tn(x)/sqrt(1-x^2) dx where m == n | Initial definition
I(x) = An*In(x) = int_-1^1 f(x)*Tn(x)/sqrt(1-x^2) dx
An = 1/In(x) * int_-1^1 f(x)*Tn(x)/sqrt(1-x^2) dx
where In(x) = pi/2 for n != 0
In(x) = pi for n == 0
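One direct way to evaluate this (different from the piecewise approach described next) is to apply the same substitution X = cos(x), giving An = 1/In * int_0^pi f(cos(x))*cos(nx) dx, which avoids the 1/sqrt(1-x^2) endpoint singularity. A hedged sketch, with an illustrative helper name, midpoint-rule quadrature, and an arbitrary sample count:

```cpp
#define _USE_MATH_DEFINES
#include <cstdio>
#include <cmath>
#include <vector>

// An = 1/In * int_-1^1 f(X)*Tn(X)/sqrt(1-X^2) dX
//    = 1/In * int_0^pi f(cos(x))*cos(nx) dx, with In = pi (n == 0) or pi/2 (n != 0)
template <typename F>
static std::vector<double> chebyshev_coefficients(F&& f, size_t total_coefficients, size_t total_samples) {
    auto A = std::vector<double>(total_coefficients);
    const double dx = M_PI/double(total_samples);
    for (size_t n = 0; n < total_coefficients; n++) {
        double I = 0.0;
        for (size_t i = 0; i < total_samples; i++) {
            const double x = (double(i) + 0.5)*dx; // midpoint rule
            I += f(std::cos(x))*std::cos(double(n)*x);
        }
        I *= dx;
        const double In = (n == 0) ? M_PI : M_PI*0.5;
        A[n] = I/In;
    }
    return A;
}

int main() {
    // Chebyshev coefficients of f(X) = sin(2*pi*X) over X = [-1, 1]
    const auto A = chebyshev_coefficients([](double X) { return std::sin(2.0*M_PI*X); }, 8, 4096);
    for (size_t n = 0; n < A.size(); n++) {
        printf("A[%zu] = %+.6e\n", n, A[n]);
    }
    return 0;
}
```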
// How do we calculate I(x) = int_-1^1 f(x)*Tn(x)/sqrt(1-x^2) dx ?
- Tn(x) is a Chebyshev polynomial, i.e. Tn(x) = sum_i ai*x^i
- Solve the integral as a sum of small integrals
- I(x) = sum_j int_aj^bj fj(x)*Tn(x)/sqrt(1-x^2) dx
- |bj-aj| == very small so we can use approximations for fj(x)
- Use a good approximation for fj(x) in the range x in [aj,bj]
- Zero order hold: fj(x) = [f(aj)+f(bj)]/2
- Linear fit: fj(x) = m0 + m1*x where fj(aj)=f(aj) and fj(bj)=f(bj)
- Quadratic fit: fj(x) = m0 + m1*x + m2*x^2
- Notice all of these approximations are polynomials
- If we approximate fj(x) between [aj,bj] as a polynomial, and Tn(x) is a Chebyshev polynomial
- Then Gj(x) = fj(x)*Tn(x) will also be a polynomial
- Gj(x) = sum_i ai*x^i
I(x) = int_-1^1 f(x)*Tn(x)/sqrt(1-x^2) dx
= sum_j int_aj^bj fj(x)*Tn(x)/sqrt(1-x^2) dx | x = [aj,bj] is our region where the polynomial fit fj(x) is made
= sum_j int_aj^bj Gj(x)/sqrt(1-x^2) dx
= sum_j int_aj^bj sum_i ai*x^i/sqrt(1-x^2) dx
= sum_j sum_i ai * int_aj^bj x^i/sqrt(1-x^2) dx | ai is our polynomial coefficient
// Deriving recursive closed form solution to integral of Jn(x) = x^n/sqrt(1-x^2)
Jn(x) = int x^n / sqrt(1-x^2) dx
= int x^(n-1) * x / sqrt(1-x^2) dx
- Using integral by parts: int u dv = uv - int v du
- u = x^(n-1) | du = (n-1)*x^(n-2)
- dv = x / sqrt(1-x^2) | v = -sqrt(1-x^2)
Jn(x) = - x^(n-1) * sqrt(1-x^2) + int sqrt(1-x^2) * (n-1) * x^(n-2) dx
= -x^(n-1) * sqrt(1-x^2) + (n-1) * int x^(n-2) * sqrt(1-x^2) dx
= -x^(n-1) * sqrt(1-x^2) + (n-1) * int x^(n-2) * (1-x^2) / sqrt(1-x^2) dx
= -x^(n-1) * sqrt(1-x^2) - (n-1) * int x^n / sqrt(1-x^2) dx + (n-1) * int x^(n-2) / sqrt(1-x^2) dx
= -x^(n-1) * sqrt(1-x^2) - (n-1) Jn(x) + (n-1) J_(n-2)(x)
Jn(x) = 1/n * [ -x^(n-1) * sqrt(1-x^2) + (n-1) * J_(n-2)(x) ]
- Get initial solutions for recursive solution
J0(x) = int 1 / sqrt(1-x^2) dx = arcsin(x)
J1(x) = int x / sqrt(1-x^2) dx = -sqrt(1-x^2)
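A minimal sketch of this recursion, evaluating the definite integral int_a^b x^n/sqrt(1-x^2) dx as Jn(b) - Jn(a) (the helper name is illustrative):

```cpp
#include <cstdio>
#include <cmath>

// Antiderivative Jn(x) = int x^n / sqrt(1-x^2) dx, valid for |x| <= 1
// J0(x) = arcsin(x), J1(x) = -sqrt(1-x^2)
// Jn(x) = 1/n * [ -x^(n-1)*sqrt(1-x^2) + (n-1)*J_(n-2)(x) ]
static double J(size_t n, double x) {
    if (n == 0) return std::asin(x);
    if (n == 1) return -std::sqrt(1.0 - x*x);
    return (-std::pow(x, double(n-1))*std::sqrt(1.0 - x*x) + double(n-1)*J(n-2, x))/double(n);
}

int main() {
    // int_-1^1 x^2/sqrt(1-x^2) dx = pi/2 ~ 1.570796
    printf("int_-1^1 x^2/sqrt(1-x^2) dx = %.6f\n", J(2, 1.0) - J(2, -1.0));
    return 0;
}
```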
// Putting all together
An = 1/In(x) * int_-1^1 f(x)*Tn(x)/sqrt(1-x^2) dx
An = 1/In(x) * I(x)
where In(x) = pi/2 for n != 0
In(x) = pi for n == 0
I(x) = int_-1^1 f(x)*Tn(x)/sqrt(1-x^2) dx
= sum_j sum_i ai * int_aj^bj x^i/sqrt(1-x^2) dx
= sum_j sum_i ai * [Ji(bj) - Ji(aj)]
- j = index of x = [aj,bj] where fj(x) approximates f(x)
- i = power of x for the polynomial coefficient ai in Gj(x) = fj(x)*Tn(x) = sum_i ai*x^i
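Putting the pieces together, a hedged sketch that estimates An for an example f(x) using constant (zero order hold) fits on each segment and the Jn recursion above; it uses the segment midpoint f((aj+bj)/2) as the constant, and the names and segment count are illustrative:

```cpp
#define _USE_MATH_DEFINES
#include <cstdio>
#include <cmath>
#include <vector>

// Antiderivative Jn(x) = int x^n/sqrt(1-x^2) dx (see recursion above)
static double J(size_t n, double x) {
    if (n == 0) return std::asin(x);
    if (n == 1) return -std::sqrt(1.0 - x*x);
    return (-std::pow(x, double(n-1))*std::sqrt(1.0 - x*x) + double(n-1)*J(n-2, x))/double(n);
}

// Monomial coefficients of Tn(x) from T_(n+1)(x) = 2x*T_n(x) - T_(n-1)(x)
static std::vector<double> chebyshev_monomial_coefficients(size_t n) {
    std::vector<double> T_prev = {1.0};      // T0
    std::vector<double> T_curr = {0.0, 1.0}; // T1
    if (n == 0) return T_prev;
    for (size_t k = 1; k < n; k++) {
        std::vector<double> T_next(k+2, 0.0);
        for (size_t i = 0; i < T_curr.size(); i++) T_next[i+1] += 2.0*T_curr[i]; // 2x*T_k
        for (size_t i = 0; i < T_prev.size(); i++) T_next[i] -= T_prev[i];       // - T_(k-1)
        T_prev = T_curr;
        T_curr = T_next;
    }
    return T_curr;
}

// An = 1/In * sum_j f((aj+bj)/2) * sum_i ai * [Ji(bj) - Ji(aj)]
// with a constant fit fj(x) = f((aj+bj)/2) on each segment [aj, bj]
template <typename F>
static double chebyshev_An(F&& f, size_t n, size_t total_segments) {
    const auto a = chebyshev_monomial_coefficients(n);
    const double dx = 2.0/double(total_segments);
    double I = 0.0;
    for (size_t j = 0; j < total_segments; j++) {
        const double aj = -1.0 + double(j)*dx;
        const double bj = aj + dx;
        double segment = 0.0;
        for (size_t i = 0; i < a.size(); i++) segment += a[i]*(J(i, bj) - J(i, aj));
        I += f(0.5*(aj+bj))*segment;
    }
    const double In = (n == 0) ? M_PI : M_PI*0.5;
    return I/In;
}

int main() {
    // Chebyshev coefficients for f(x) = sin(2*pi*x) over x = [-1, 1]
    for (size_t n = 0; n < 8; n++) {
        printf("A[%zu] = %+.6e\n", n, chebyshev_An([](double x) { return std::sin(2.0*M_PI*x); }, n, 1024));
    }
    return 0;
}
```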
williamyang98 commented Jan 23, 2024

Gradient descent or direct computation

  • Directly computing the Chebyshev coefficients gives a close result, and the error decreases as the number of Chebyshev polynomials increases.
  • We can improve on the direct computation by using gradient descent, which gives a lower error for the same polynomial order.
  • The analytically derived result and the gradient descent result produce similar polynomials, but the gradient descent version has a lower error.

Using gradient descent solver

  • Use doubles for coefficients and gradients for best approximation.
  • Increasing the number of samples spreads the error more evenly between the roots; however, this has diminishing returns past a certain sample count.
  • Using roots=[-0.5,0,+0.5] means we can use x = x - round(x) to keep f(x) = sin(2πx) within the valid approximation range (see the usage sketch after the coefficient tables below).
    • _mm_round_ps and _mm256_round_ps are x86 SIMD instructions, so we can vectorise the input clamping.
    • Much cheaper than an arbitrary floating point modulus.
  • Converting polynomial coefficients from double to float will introduce error.
    • We can change coefficient_t to float in the solver and use the double solution as our initial poly[] = {}.
    • Keep grad_t as double.
    • Running the solver will give optimised values for float coefficients.

Calculated polynomials for f(x) = sin(2πx) and their mean absolute error

Using f64 for coefficients and gradient with 1024 samples.

| N | MAE | Coefficients |
| - | --- | ------------ |
| 6 | 7.07e-10 | {-25.1327411142213464,64.8358266034100694,-67.0768851968012996,38.4999814590310407,-14.0736995967404166,3.2086237284325541} |
| 5 | 6.01e-8 | {-25.1327327703024501,64.8348800694709126,-67.0481124846729131,38.1547044862267200,-12.3089616248433558} |

Using f32 for coefficients with initial estimate from f64 solution.

| N | MAE | Coefficients |
| - | --- | ------------ |
| 6 | 2.47e-8 | {-25.1327419281,64.8358230591,-67.0768585205,38.5037078857,-14.1102266312,3.2983388901} |
| 5 | 6.32e-8 | {-25.1327323914,64.8348770142,-67.0481109619,38.1546096802,-12.3083391190} |
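
As a usage sketch (not part of the gist), the f32 N=6 coefficients above can be combined with the solver's h(x) = x*(x^2 - 0.25) factor and x = x - round(x) range reduction. The input is in turns, so sin_2pi(x) approximates sin(2*pi*x); the function name is illustrative:

```cpp
#define _USE_MATH_DEFINES
#include <cstdio>
#include <cmath>

// Approximate sin(2*pi*x), x in turns, using the f32 N=6 coefficients from the table above.
// f(x) ~ h(x)*g(x), where h(x) = x*(x^2 - 0.25) and g(x) = sum a[n]*x^(2n) via Horner's method.
static float sin_2pi(float x) {
    x = x - std::round(x); // range reduction to [-0.5, +0.5], roots at -0.5, 0, +0.5
    constexpr float A[6] = {
        -25.1327419281f, 64.8358230591f, -67.0768585205f,
         38.5037078857f, -14.1102266312f,  3.2983388901f,
    };
    const float z = x*x;
    float g = A[5];
    for (int i = 4; i >= 0; i--) {
        g = g*z + A[i];
    }
    return x*(z - 0.25f)*g;
}

int main() {
    for (float x = 0.0f; x <= 1.0f; x += 0.125f) {
        printf("x=%.3f: approx=%+.7f, reference=%+.7f\n", x, sin_2pi(x), float(std::sin(2.0*M_PI*x)));
    }
    return 0;
}
```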

Polynomials from original article

  • https://mooooo.ooo/chebyshev-sine-approximation
  • poly[6] = {-25.1327419281,64.8358230591,-67.0766296387,38.495880127,-14.0496635437,3.16160202026} @ 2.98e-8
  • The original article uses an analytical solution and then brute-forces the coefficients to minimize the error.
