Skip to content

Instantly share code, notes, and snippets.

@fredrik-johansson
Created March 7, 2024 19:54
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save fredrik-johansson/c1ab3ce215e1cf80115c422a103cf31f to your computer and use it in GitHub Desktop.
Save fredrik-johansson/c1ab3ce215e1cf80115c422a103cf31f to your computer and use it in GitHub Desktop.
Taylor experiment
#include <math.h>
#include "longlong.h"
#include "ulong_extras.h"
#include "mpn_extras.h"
#include "arf.h"
#include "arb.h"
#include "profiler.h"
/* util: print fixed-point number given by n limbs where fracn limbs are fractional. */
void
flint_mpn_print_fixed(mp_srcptr x, mp_size_t n, mp_size_t fracn)
{
if (n == 0)
{
flint_printf("0");
}
else
{
arf_t y;
fmpz_t t;
fmpz_init(t);
slong d, fracd, dmag;
arf_init(y);
fmpz_init(t);
fmpz_set_ui_array(t, x, n);
arf_set_fmpz(y, t);
arf_mul_2exp_si(y, y, -fracn * FLINT_BITS);
dmag = log10(arf_get_d(y, ARF_RND_DOWN));
fracd = fracn * (FLINT_BITS * 0.3010299956639812);
d = FLINT_MAX(1, dmag + fracd);
arf_printd(y, d);
fmpz_clear(t);
arf_clear(y);
}
flint_printf("\n");
}
/* c_n = (20! / n!) */
ulong cs[] = {
UWORD(2432902008176640000),
UWORD(2432902008176640000),
UWORD(1216451004088320000),
UWORD(405483668029440000),
UWORD(101370917007360000),
UWORD(20274183401472000),
UWORD(3379030566912000),
UWORD(482718652416000),
UWORD(60339831552000),
UWORD(6704425728000),
UWORD(670442572800),
UWORD(60949324800),
UWORD(5079110400),
UWORD(390700800),
UWORD(27907200),
UWORD(1860480),
UWORD(116280),
UWORD(6840),
UWORD(380),
UWORD(20),
UWORD(1),
};
mp_limb_t __gmpn_preinv_divrem_1(mp_ptr qp, mp_size_t fn, mp_srcptr np, mp_size_t nn, mp_limb_t d, mp_limb_t dinv, int cnt);
/* Single-limb inverse of 20! */
ulong c0_preinv1;
int c0_preinv1_norm;
/* Multi-limb inverse of 20! */
mp_limb_t c0inv[100];
void
precompute()
{
c0_preinv1 = n_preinvert_limb(cs[0]);
c0_preinv1_norm = FLINT_BITS - FLINT_BIT_COUNT(cs[0]);
/* 1/cs[0] */
flint_mpn_zero(c0inv, 11);
c0inv[11] = 1;
mpn_divrem_1(c0inv, 0, c0inv, 12, cs[0]);
}
/* a0 + x (a1 + x (a2 + x (a3 + x (a4 + x (a5 + x (a6 + x (a7 + x (a8 + x a9)))))))) */
void
mpn_exp_series_10_horner(mp_ptr res, mp_srcptr c, mp_srcptr x)
{
mp_limb_t t[10];
mp_limb_t s[10];
s[1] = c[9];
s[0] = 0;
flint_mpn_mulhigh_n(t, s, x + 9 - 2, 2);
t[2] = c[8];
flint_mpn_mulhigh_n(s, t, x + 9 - 3, 3);
s[3] = c[7];
flint_mpn_mulhigh_n(t, s, x + 9 - 4, 4);
t[4] = c[6];
flint_mpn_mulhigh_n(s, t, x + 9 - 5, 5);
s[5] = c[5];
flint_mpn_mulhigh_n(t, s, x + 9 - 6, 6);
t[6] = c[4];
flint_mpn_mulhigh_n(s, t, x + 9 - 7, 7);
s[7] = c[3];
flint_mpn_mulhigh_n(t, s, x + 9 - 8, 8);
t[8] = c[2];
flint_mpn_mulhigh_n(s, t, x + 9 - 9, 9);
s[9] = c[1];
flint_mpn_mulhigh_n(res + 1, s + 1, x, 9);
res[10] = c[0];
}
/* (a0 + a1 x + a2 x^2) + x^3 * ((a3 + a4 x + a5 x^2) + x^3*((a6 + a7 x + a8 x^2) + a9 x^3))
todo: consider
a0 + (a1 x + a2 x^2) + a3 x^3 + x^3 * ((a4 x + a5 x^2) + a6 x^3 + x^3*((a7 x + a8 x^2) + a9 x^3))
to reduce the precision of the muls
*/
void
mpn_exp_series_10_rs3b(mp_ptr res, mp_srcptr c, mp_srcptr x)
{
mp_limb_t x2[8];
mp_limb_t x3p[8];
mp_limb_t s[11];
mp_limb_t t[11];
/* zero-pad so that we can fudge the final nx(n-1) mul as an nxn mul */
mp_ptr x3 = x3p + 1;
x3p[0] = 0;
/* x^2 with 8 limbs of precision */
flint_mpn_sqrhigh(x2, x + 1, 8);
/* x^3 with 7 limbs of precision */
flint_mpn_mulhigh_n(x3, x + 2, x2 + 1, 7);
umul_ppmm(t[1], t[0], x3[6], c[9]);
t[2] = mpn_addmul_1(t, x2 + 8 - 2, 2, c[8]);
t[3] = mpn_addmul_1(t, x + 9 - 3, 3, c[7]);
t[4] = c[6];
flint_mpn_mulhigh_n(s, t, x3 + 7 - 5, 5);
s[5] = mpn_addmul_1(s, x2 + 8 - 5, 5, c[5]);
s[6] = mpn_addmul_1(s, x + 9 - 6, 6, c[4]);
s[7] = c[3];
flint_mpn_mulhigh_n(res, s, x3 - 1, 8);
res[8] = mpn_addmul_1(res, x2, 8, c[2]);
__gmpn_preinv_divrem_1(res, 0, res, 9, c[0], c0_preinv1, c0_preinv1_norm);
res[9] = mpn_add_n(res, res, x, 9);
res[10] = 1;
}
void
mpn_exp_series_10_rs3c(mp_ptr res, mp_srcptr c, mp_srcptr x)
{
mp_limb_t x2[8];
mp_limb_t x3p[8];
mp_limb_t s[11];
mp_limb_t t[11];
/* zero-pad so that we can fudge the final nx(n-1) mul as an nxn mul */
mp_ptr x3 = x3p + 1;
x3p[0] = 0;
/* x^2 with 8 limbs of precision */
flint_mpn_sqrhigh(x2, x + 1, 8);
/* x^3 with 7 limbs of precision */
flint_mpn_mulhigh_n(x3, x + 2, x2 + 1, 7);
umul_ppmm(t[1], t[0], x3[6], c[9]);
t[2] = mpn_addmul_1(t, x2 + 8 - 2, 2, c[8]);
t[3] = mpn_addmul_1(t, x + 9 - 3, 3, c[7]);
t[4] = c[6];
flint_mpn_mulhigh_n(s, t, x3 + 7 - 5, 5);
s[5] = mpn_addmul_1(s, x2 + 8 - 5, 5, c[5]);
s[6] = mpn_addmul_1(s, x + 9 - 6, 6, c[4]);
s[7] = c[3];
flint_mpn_mulhigh_n(t, s, x3 - 1, 8);
t[8] = mpn_addmul_1(t, x2, 8, c[2]);
flint_mpn_mulhigh_n(res, t, c0inv + 2, 9);
res[9] = mpn_add_n(res, res, x, 9);
res[10] = 1;
}
void
mpn_exp_series_10_rs3d(mp_ptr res, mp_srcptr c, mp_srcptr x)
{
mp_limb_t x2[8];
mp_limb_t x3d[8];
mp_limb_t s[11];
mp_limb_t t[11];
/* zero-pad x^3 so that we can fudge the final the 8x7 horner mul by x^3
as an 8x8 mulhigh_n */
x3d[0] = 0;
mp_ptr x3 = x3d + 1;
/* x^2 with 8 limbs of precision */
flint_mpn_sqrhigh(x2, x + 1, 8);
/* x^3 with 7 limbs of precision */
flint_mpn_mulhigh_n(x3, x + 2, x2 + 1, 7);
umul_ppmm(t[1], t[0], x3[6], c[9]); /* t[1] = mpn_addmul_1(t, x3 + 8 - 1, 1, c[9]); */
t[2] = mpn_addmul_1(t, x2 + 8 - 2, 2, c[8]);
t[3] = mpn_addmul_1(t, x + 9 - 3, 3, c[7]);
t[4] = c[6];
flint_mpn_mulhigh_n(s, t, x3 + 7 - 5, 5);
s[5] = mpn_addmul_1(s, x2 + 8 - 5, 5, c[5]);
s[6] = mpn_addmul_1(s, x + 9 - 6, 6, c[4]);
s[7] = c[3];
flint_mpn_mulhigh_n(t, s, x3 - 1, 8);
/* divide by factorial */
flint_mpn_mulhigh_n(res, t, c0inv + 3, 8);
/* + x^2 / 2 (9 limbs) */
mpn_rshift(t, x2, 8, 1);
res[8] = mpn_add_n(res, res, t, 8);
/* + x (10 limbs) */
res[9] = mpn_add_n(res, res, x, 9);
/* + 1 */
res[10] = 1;
}
/* (a0 + a1 x + a2 x^2 + a3 x^3) + x^4 * ((a4 + a5 x + a6 x^2 + a7 x^3) + x^4* (a8 + a9 x) */
void
mpn_exp_series_10_rs4(mp_ptr res, mp_srcptr c, mp_srcptr x)
{
mp_limb_t x2[8];
mp_limb_t x3[8];
mp_limb_t x4p[8];
mp_limb_t s[11];
mp_limb_t t[11];
/* zero-pad so that we can fudge the final nx(n-1) mul as an nxn mul */
mp_ptr x4 = x4p + 1;
x4p[0] = 0;
flint_mpn_sqrhigh(x2, x + 1, 8);
flint_mpn_mulhigh_n(x3, x + 2, x2 + 1, 7);
flint_mpn_sqrhigh(x4, x2 + 2, 6);
umul_ppmm(t[1], t[0], x[8], c[9]); /* t[1] = mpn_mul_1(t, x + 9 - 1, 1, c[9]); */
t[2] = c[8];
flint_mpn_mulhigh_n(s, t, x4 + 6 - 3, 3);
s[3] = mpn_addmul_1(s, x3 + 7 - 3, 3, c[7]);
s[4] = mpn_addmul_1(s, x2 + 8 - 4, 4, c[6]);
s[5] = mpn_addmul_1(s, x + 9 - 5, 5, c[5]);
s[6] = c[4];
flint_mpn_mulhigh_n(t, s, x4 + 6 - 7, 7);
t[7] = mpn_addmul_1(t, x3, 7, c[3]);
/* divide by factorial */
flint_mpn_mulhigh_n(res, t, c0inv + 3, 8);
/* + x^2 / 2 (9 limbs) */
mpn_rshift(t, x2, 8, 1);
res[8] = mpn_add_n(res, res, t, 8);
/* + x (10 limbs) */
res[9] = mpn_add_n(res, res, x, 9);
/* + 1 */
res[10] = 1;
}
int main()
{
mp_limb_t X[400], Y[400], Z[400];
precompute();
flint_mpn_zero(X, 100);
flint_mpn_zero(Y, 100);
#if 0
ulong dinv = n_preinvert_limb(123987);
int cnt = 64 - FLINT_BIT_COUNT(123987);
slong n;
for (n = 1; n < 20; n++)
{
flint_printf("%wd\n", n);
TIMEIT_START
flint_mpn_mulhigh_n(Z, X, Y, n);
TIMEIT_STOP
TIMEIT_START
mpn_divrem_1(X, 0, X, n, 123987);
TIMEIT_STOP
TIMEIT_START
__gmpn_preinv_divrem_1(X, 0, X, n, 123987, dinv, cnt);
TIMEIT_STOP
}
#endif
/* x = 1/10^20 < 2^-64 */
X[10] = 1;
mpn_divrem_1(X, 0, X, 11, n_pow(10, 10));
mpn_divrem_1(X, 0, X, 10, n_pow(10, 10));
arb_t y, x;
arb_init(x);
arb_init(y);
arb_set_str(x, "1e-20", 640);
mag_zero(arb_radref(x));
flint_printf("arb_exp\n");
TIMEIT_START
arb_exp(y, x, 640);
TIMEIT_STOP
arb_printn(y, 0.3010299956639812 * 640, ARB_STR_NO_RADIUS); flint_printf("\n");
flint_printf("\nmpn_exp_series_10_rs3d (mulhigh):\n");
TIMEIT_START
mpn_exp_series_10_rs3d(Y, cs, X);
TIMEIT_STOP
flint_mpn_print_fixed(Y, 11, 10);
flint_printf("\n_arb_exp_taylor_rs:\n");
TIMEIT_START
_arb_exp_taylor_rs(Z, Y + 50, X, 10, 10);
TIMEIT_STOP
flint_mpn_print_fixed(Z, 11, 10);
flint_printf("\nmpn_exp_series_10_horner (divrem_1):\n");
TIMEIT_START
mpn_exp_series_10_horner(Y, cs, X);
mpn_divrem_1(Y, 0, Y, 11, cs[0]);
TIMEIT_STOP
flint_mpn_print_fixed(Y, 11, 10);
#if 0
flint_printf("\nmpn_exp_series_10_horner (preinv_divrem_1):\n");
TIMEIT_START
mpn_exp_series_10_horner(Y, cs, X);
__gmpn_preinv_divrem_1(Y, 0, Y, 11, cs[0], c0_preinv1, c0_preinv1_norm);
TIMEIT_STOP
flint_mpn_print_fixed(Y, 11, 10);
flint_printf("\nmpn_exp_series_10_horner (mulhigh):\n");
TIMEIT_START
mpn_exp_series_10_horner(Z, cs, X);
flint_mpn_mulhigh_n(Y, Z, c0inv, 11);
TIMEIT_STOP
flint_mpn_print_fixed(Y, 11, 10);
#endif
flint_printf("\nmpn_exp_series_10_rs3d:\n");
TIMEIT_START
mpn_exp_series_10_rs3c(Y, cs, X);
TIMEIT_STOP
flint_mpn_print_fixed(Y, 11, 10);
flint_printf("\nmpn_exp_series_10_rs4:\n");
TIMEIT_START
mpn_exp_series_10_rs4(Y, cs, X);
TIMEIT_STOP
flint_mpn_print_fixed(Y, 11, 10);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment