Skip to content

Instantly share code, notes, and snippets.

@fredrik-johansson
Created January 25, 2024 20:07
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save fredrik-johansson/2f88376c5dd9dc0b668e7b6d91b9d266 to your computer and use it in GitHub Desktop.
Save fredrik-johansson/2f88376c5dd9dc0b668e7b6d91b9d266 to your computer and use it in GitHub Desktop.
mulhigh.c
#include <stdint.h>
#include "profiler.h"
#include "mpn_extras.h"
#include "crt_helpers.h"
#include "ulong_extras.h"
FLINT_FORCE_INLINE mp_limb_t
flint_mpn_inline_addmul_1(mp_ptr res, mp_srcptr a, mp_size_t n, mp_limb_t c)
{
mp_limb_t hi, lo, cy_limb;
slong i;
umul_ppmm(hi, lo, a[0], c);
add_ssaaaa(cy_limb, res[0], hi, lo, 0, res[0]);
for (i = 1; i < n; i++)
{
umul_ppmm(hi, lo, a[i], c);
add_ssaaaa(hi, lo, hi, lo, 0, res[i]);
add_ssaaaa(cy_limb, res[i], hi, lo, 0, cy_limb);
}
return cy_limb;
}
#undef NN_DOTREV_S3_1X1
/* {s0,s1,s2} = u[0]v[n-1] + u[1]v[n-2] + ... */
/* Assumes n >= 2 */
#define NN_DOTREV_S3_1X1(s2, s1, s0, u, v, n) \
do { \
mp_limb_t __dt0, __dt1, __ds0, __ds1, __ds2; \
slong __i; \
FLINT_ASSERT((n) >= 2); \
umul_ppmm(__ds1, __ds0, (u)[0], (v)[(n) - 1]); \
umul_ppmm(__dt1, __dt0, (u)[1], (v)[(n) - 2]); \
add_sssaaaaaa(__ds2, __ds1, __ds0, 0, __ds1, __ds0, 0, __dt1, __dt0); \
for (__i = 2; __i < (n); __i++) \
{ \
umul_ppmm(__dt1, __dt0, (u)[__i], (v)[(n) - 1 - __i]); \
add_sssaaaaaa(__ds2, __ds1, __ds0, __ds2, __ds1, __ds0, 0, __dt1, __dt0); \
} \
(s0) = __ds0; (s1) = __ds1; (s2) = __ds2; \
} while (0) \
/* {r0,r1,r2} = {s0,s1,s2} + u[0]v[n-1] + u[1]v[n-2] + ... */
/* Assumes n >= 1. May have s2 != 0, but the final sum is assumed to fit in 3 limbs. */
#define NN_DOTREV_S3_A3_1X1(r2, r1, r0, s2, s1, s0, u, v, n) \
do { \
mp_limb_t __dt0, __dt1, __ds0, __ds1, __ds2; \
slong __i; \
FLINT_ASSERT((n) >= 1); \
__ds0 = (s0); __ds1 = (s1); __ds2 = (s2); \
for (__i = 0; __i < (n); __i++) \
{ \
umul_ppmm(__dt1, __dt0, (u)[__i], (v)[(n) - 1 - __i]); \
add_sssaaaaaa(__ds2, __ds1, __ds0, __ds2, __ds1, __ds0, 0, __dt1, __dt0); \
} \
(r0) = __ds0; (r1) = __ds1; (r2) = __ds2; \
} while (0) \
#define NN_MUL_1X1 umul_ppmm
/* {r0,r1} = {s0,s1} + x * y, with no carry-out. */
#define NN_ADDMUL_S2_A2_1X1(r1, r0, s1, s0, x, y) \
do { \
mp_limb_t __dt0, __dt1; \
umul_ppmm(__dt1, __dt0, (x), (y)); \
add_ssaaaa(r1, r0, s1, s0, __dt1, __dt0); \
} while (0); \
void flint_mpn_mulhigh_1(mp_ptr res, mp_srcptr u, mp_srcptr v)
{
NN_MUL_1X1(res[1], res[0], u[0], v[0]);
}
void flint_mpn_mulhigh_2(mp_ptr res, mp_srcptr u, mp_srcptr v)
{
mp_limb_t a, b;
NN_DOTREV_S3_1X1(b, a, res[1], u, v, 2);
NN_ADDMUL_S2_A2_1X1(res[3], res[2], b, a, u[1], v[1]);
}
void flint_mpn_mulhigh_3(mp_ptr res, mp_srcptr u, mp_srcptr v)
{
mp_limb_t a, b;
NN_DOTREV_S3_1X1(b, a, res[2], u, v, 3);
NN_DOTREV_S3_A3_1X1(b, a, res[3], 0, b, a, u + 1, v + 1, 2);
NN_ADDMUL_S2_A2_1X1(res[5], res[4], b, a, u[2], v[2]);
}
void flint_mpn_mulhigh_4(mp_ptr res, mp_srcptr u, mp_srcptr v)
{
mp_limb_t a, b;
NN_DOTREV_S3_1X1(b, a, res[3], u, v, 4);
NN_DOTREV_S3_A3_1X1(b, a, res[4], 0, b, a, u + 1, v + 1, 3);
NN_DOTREV_S3_A3_1X1(b, a, res[5], 0, b, a, u + 2, v + 2, 2);
NN_ADDMUL_S2_A2_1X1(res[7], res[6], b, a, u[3], v[3]);
}
void flint_mpn_mulhigh_5(mp_ptr res, mp_srcptr u, mp_srcptr v)
{
mp_limb_t a, b;
NN_DOTREV_S3_1X1(b, a, res[4], u, v, 5);
NN_DOTREV_S3_A3_1X1(b, a, res[5], 0, b, a, u + 1, v + 1, 4);
NN_DOTREV_S3_A3_1X1(b, a, res[6], 0, b, a, u + 2, v + 2, 3);
NN_DOTREV_S3_A3_1X1(b, a, res[7], 0, b, a, u + 3, v + 3, 2);
NN_ADDMUL_S2_A2_1X1(res[9], res[8], b, a, u[4], v[4]);
}
void flint_mpn_mulhigh_6(mp_ptr res, mp_srcptr u, mp_srcptr v)
{
mp_limb_t a, b;
NN_DOTREV_S3_1X1(b, a, res[5], u, v, 6);
NN_DOTREV_S3_A3_1X1(b, a, res[6], 0, b, a, u + 1, v + 1, 5);
NN_DOTREV_S3_A3_1X1(b, a, res[7], 0, b, a, u + 2, v + 2, 4);
NN_DOTREV_S3_A3_1X1(b, a, res[8], 0, b, a, u + 3, v + 3, 3);
NN_DOTREV_S3_A3_1X1(b, a, res[9], 0, b, a, u + 4, v + 4, 2);
NN_ADDMUL_S2_A2_1X1(res[11], res[10], b, a, u[5], v[5]);
}
void flint_mpn_mulhigh_7(mp_ptr res, mp_srcptr u, mp_srcptr v)
{
mp_limb_t a, b;
NN_DOTREV_S3_1X1(b, a, res[6], u, v, 7);
NN_DOTREV_S3_A3_1X1(b, a, res[7], 0, b, a, u + 1, v + 1, 6);
NN_DOTREV_S3_A3_1X1(b, a, res[8], 0, b, a, u + 2, v + 2, 5);
NN_DOTREV_S3_A3_1X1(b, a, res[9], 0, b, a, u + 3, v + 3, 4);
NN_DOTREV_S3_A3_1X1(b, a, res[10], 0, b, a, u + 4, v + 4, 3);
NN_DOTREV_S3_A3_1X1(b, a, res[11], 0, b, a, u + 5, v + 5, 2);
NN_ADDMUL_S2_A2_1X1(res[13], res[12], b, a, u[6], v[6]);
}
void flint_mpn_mulhigh_8(mp_ptr res, mp_srcptr u, mp_srcptr v)
{
mp_limb_t a, b;
NN_DOTREV_S3_1X1(b, a, res[7], u, v, 8);
NN_DOTREV_S3_A3_1X1(b, a, res[8], 0, b, a, u + 1, v + 1, 7);
NN_DOTREV_S3_A3_1X1(b, a, res[9], 0, b, a, u + 2, v + 2, 6);
NN_DOTREV_S3_A3_1X1(b, a, res[10], 0, b, a, u + 3, v + 3, 5);
NN_DOTREV_S3_A3_1X1(b, a, res[11], 0, b, a, u + 4, v + 4, 4);
NN_DOTREV_S3_A3_1X1(b, a, res[12], 0, b, a, u + 5, v + 5, 3);
NN_DOTREV_S3_A3_1X1(b, a, res[13], 0, b, a, u + 6, v + 6, 2);
NN_ADDMUL_S2_A2_1X1(res[15], res[14], b, a, u[7], v[7]);
}
void flint_mpn_mulhigh_9(mp_ptr res, mp_srcptr u, mp_srcptr v)
{
mp_limb_t a, b;
NN_DOTREV_S3_1X1(b, a, res[8], u, v, 9);
NN_DOTREV_S3_A3_1X1(b, a, res[9], 0, b, a, u + 1, v + 1, 8);
NN_DOTREV_S3_A3_1X1(b, a, res[10], 0, b, a, u + 2, v + 2, 7);
NN_DOTREV_S3_A3_1X1(b, a, res[11], 0, b, a, u + 3, v + 3, 6);
NN_DOTREV_S3_A3_1X1(b, a, res[12], 0, b, a, u + 4, v + 4, 5);
NN_DOTREV_S3_A3_1X1(b, a, res[13], 0, b, a, u + 5, v + 5, 4);
NN_DOTREV_S3_A3_1X1(b, a, res[14], 0, b, a, u + 6, v + 6, 3);
NN_DOTREV_S3_A3_1X1(b, a, res[15], 0, b, a, u + 7, v + 7, 2);
NN_ADDMUL_S2_A2_1X1(res[17], res[16], b, a, u[8], v[8]);
}
void flint_mpn_mulhigh_10(mp_ptr res, mp_srcptr u, mp_srcptr v)
{
mp_limb_t a, b;
NN_DOTREV_S3_1X1(b, a, res[9], u, v, 10);
NN_DOTREV_S3_A3_1X1(b, a, res[10], 0, b, a, u + 1, v + 1, 9);
NN_DOTREV_S3_A3_1X1(b, a, res[11], 0, b, a, u + 2, v + 2, 8);
NN_DOTREV_S3_A3_1X1(b, a, res[12], 0, b, a, u + 3, v + 3, 7);
NN_DOTREV_S3_A3_1X1(b, a, res[13], 0, b, a, u + 4, v + 4, 6);
NN_DOTREV_S3_A3_1X1(b, a, res[14], 0, b, a, u + 5, v + 5, 5);
NN_DOTREV_S3_A3_1X1(b, a, res[15], 0, b, a, u + 6, v + 6, 4);
NN_DOTREV_S3_A3_1X1(b, a, res[16], 0, b, a, u + 7, v + 7, 3);
NN_DOTREV_S3_A3_1X1(b, a, res[17], 0, b, a, u + 8, v + 8, 2);
NN_ADDMUL_S2_A2_1X1(res[19], res[18], b, a, u[9], v[9]);
}
void flint_mpn_mulhigh_11(mp_ptr res, mp_srcptr u, mp_srcptr v)
{
mp_limb_t a, b;
NN_DOTREV_S3_1X1(b, a, res[10], u, v, 11);
NN_DOTREV_S3_A3_1X1(b, a, res[11], 0, b, a, u + 1, v + 1, 10);
NN_DOTREV_S3_A3_1X1(b, a, res[12], 0, b, a, u + 2, v + 2, 9);
NN_DOTREV_S3_A3_1X1(b, a, res[13], 0, b, a, u + 3, v + 3, 8);
NN_DOTREV_S3_A3_1X1(b, a, res[14], 0, b, a, u + 4, v + 4, 7);
NN_DOTREV_S3_A3_1X1(b, a, res[15], 0, b, a, u + 5, v + 5, 6);
NN_DOTREV_S3_A3_1X1(b, a, res[16], 0, b, a, u + 6, v + 6, 5);
NN_DOTREV_S3_A3_1X1(b, a, res[17], 0, b, a, u + 7, v + 7, 4);
NN_DOTREV_S3_A3_1X1(b, a, res[18], 0, b, a, u + 8, v + 8, 3);
NN_DOTREV_S3_A3_1X1(b, a, res[19], 0, b, a, u + 9, v + 9, 2);
NN_ADDMUL_S2_A2_1X1(res[21], res[20], b, a, u[10], v[10]);
}
void flint_mpn_mulhigh_12(mp_ptr res, mp_srcptr u, mp_srcptr v)
{
mp_limb_t a, b;
NN_DOTREV_S3_1X1(b, a, res[11], u, v, 12);
NN_DOTREV_S3_A3_1X1(b, a, res[12], 0, b, a, u + 1, v + 1, 11);
NN_DOTREV_S3_A3_1X1(b, a, res[13], 0, b, a, u + 2, v + 2, 10);
NN_DOTREV_S3_A3_1X1(b, a, res[14], 0, b, a, u + 3, v + 3, 9);
NN_DOTREV_S3_A3_1X1(b, a, res[15], 0, b, a, u + 4, v + 4, 8);
NN_DOTREV_S3_A3_1X1(b, a, res[16], 0, b, a, u + 5, v + 5, 7);
NN_DOTREV_S3_A3_1X1(b, a, res[17], 0, b, a, u + 6, v + 6, 6);
NN_DOTREV_S3_A3_1X1(b, a, res[18], 0, b, a, u + 7, v + 7, 5);
NN_DOTREV_S3_A3_1X1(b, a, res[19], 0, b, a, u + 8, v + 8, 4);
NN_DOTREV_S3_A3_1X1(b, a, res[20], 0, b, a, u + 9, v + 9, 3);
NN_DOTREV_S3_A3_1X1(b, a, res[21], 0, b, a, u + 10, v + 10, 2);
NN_ADDMUL_S2_A2_1X1(res[23], res[22], b, a, u[11], v[11]);
}
void flint_mpn_mulhigh_13(mp_ptr res, mp_srcptr u, mp_srcptr v)
{
mp_limb_t a, b;
NN_DOTREV_S3_1X1(b, a, res[12], u, v, 13);
NN_DOTREV_S3_A3_1X1(b, a, res[13], 0, b, a, u + 1, v + 1, 12);
NN_DOTREV_S3_A3_1X1(b, a, res[14], 0, b, a, u + 2, v + 2, 11);
NN_DOTREV_S3_A3_1X1(b, a, res[15], 0, b, a, u + 3, v + 3, 10);
NN_DOTREV_S3_A3_1X1(b, a, res[16], 0, b, a, u + 4, v + 4, 9);
NN_DOTREV_S3_A3_1X1(b, a, res[17], 0, b, a, u + 5, v + 5, 8);
NN_DOTREV_S3_A3_1X1(b, a, res[18], 0, b, a, u + 6, v + 6, 7);
NN_DOTREV_S3_A3_1X1(b, a, res[19], 0, b, a, u + 7, v + 7, 6);
NN_DOTREV_S3_A3_1X1(b, a, res[20], 0, b, a, u + 8, v + 8, 5);
NN_DOTREV_S3_A3_1X1(b, a, res[21], 0, b, a, u + 9, v + 9, 4);
NN_DOTREV_S3_A3_1X1(b, a, res[22], 0, b, a, u + 10, v + 10, 3);
NN_DOTREV_S3_A3_1X1(b, a, res[23], 0, b, a, u + 11, v + 11, 2);
NN_ADDMUL_S2_A2_1X1(res[25], res[24], b, a, u[12], v[12]);
}
void flint_mpn_mulhigh_14(mp_ptr res, mp_srcptr u, mp_srcptr v)
{
mp_limb_t a, b;
NN_DOTREV_S3_1X1(b, a, res[13], u, v, 14);
NN_DOTREV_S3_A3_1X1(b, a, res[14], 0, b, a, u + 1, v + 1, 13);
NN_DOTREV_S3_A3_1X1(b, a, res[15], 0, b, a, u + 2, v + 2, 12);
NN_DOTREV_S3_A3_1X1(b, a, res[16], 0, b, a, u + 3, v + 3, 11);
NN_DOTREV_S3_A3_1X1(b, a, res[17], 0, b, a, u + 4, v + 4, 10);
NN_DOTREV_S3_A3_1X1(b, a, res[18], 0, b, a, u + 5, v + 5, 9);
NN_DOTREV_S3_A3_1X1(b, a, res[19], 0, b, a, u + 6, v + 6, 8);
NN_DOTREV_S3_A3_1X1(b, a, res[20], 0, b, a, u + 7, v + 7, 7);
NN_DOTREV_S3_A3_1X1(b, a, res[21], 0, b, a, u + 8, v + 8, 6);
NN_DOTREV_S3_A3_1X1(b, a, res[22], 0, b, a, u + 9, v + 9, 5);
NN_DOTREV_S3_A3_1X1(b, a, res[23], 0, b, a, u + 10, v + 10, 4);
NN_DOTREV_S3_A3_1X1(b, a, res[24], 0, b, a, u + 11, v + 11, 3);
NN_DOTREV_S3_A3_1X1(b, a, res[25], 0, b, a, u + 12, v + 12, 2);
NN_ADDMUL_S2_A2_1X1(res[27], res[26], b, a, u[13], v[13]);
}
void flint_mpn_mulhigh_15(mp_ptr res, mp_srcptr u, mp_srcptr v)
{
mp_limb_t a, b;
NN_DOTREV_S3_1X1(b, a, res[14], u, v, 15);
NN_DOTREV_S3_A3_1X1(b, a, res[15], 0, b, a, u + 1, v + 1, 14);
NN_DOTREV_S3_A3_1X1(b, a, res[16], 0, b, a, u + 2, v + 2, 13);
NN_DOTREV_S3_A3_1X1(b, a, res[17], 0, b, a, u + 3, v + 3, 12);
NN_DOTREV_S3_A3_1X1(b, a, res[18], 0, b, a, u + 4, v + 4, 11);
NN_DOTREV_S3_A3_1X1(b, a, res[19], 0, b, a, u + 5, v + 5, 10);
NN_DOTREV_S3_A3_1X1(b, a, res[20], 0, b, a, u + 6, v + 6, 9);
NN_DOTREV_S3_A3_1X1(b, a, res[21], 0, b, a, u + 7, v + 7, 8);
NN_DOTREV_S3_A3_1X1(b, a, res[22], 0, b, a, u + 8, v + 8, 7);
NN_DOTREV_S3_A3_1X1(b, a, res[23], 0, b, a, u + 9, v + 9, 6);
NN_DOTREV_S3_A3_1X1(b, a, res[24], 0, b, a, u + 10, v + 10, 5);
NN_DOTREV_S3_A3_1X1(b, a, res[25], 0, b, a, u + 11, v + 11, 4);
NN_DOTREV_S3_A3_1X1(b, a, res[26], 0, b, a, u + 12, v + 12, 3);
NN_DOTREV_S3_A3_1X1(b, a, res[27], 0, b, a, u + 13, v + 13, 2);
NN_ADDMUL_S2_A2_1X1(res[29], res[28], b, a, u[14], v[14]);
}
void flint_mpn_mulhigh_16(mp_ptr res, mp_srcptr u, mp_srcptr v)
{
mp_limb_t a, b;
NN_DOTREV_S3_1X1(b, a, res[15], u, v, 16);
NN_DOTREV_S3_A3_1X1(b, a, res[16], 0, b, a, u + 1, v + 1, 15);
NN_DOTREV_S3_A3_1X1(b, a, res[17], 0, b, a, u + 2, v + 2, 14);
NN_DOTREV_S3_A3_1X1(b, a, res[18], 0, b, a, u + 3, v + 3, 13);
NN_DOTREV_S3_A3_1X1(b, a, res[19], 0, b, a, u + 4, v + 4, 12);
NN_DOTREV_S3_A3_1X1(b, a, res[20], 0, b, a, u + 5, v + 5, 11);
NN_DOTREV_S3_A3_1X1(b, a, res[21], 0, b, a, u + 6, v + 6, 10);
NN_DOTREV_S3_A3_1X1(b, a, res[22], 0, b, a, u + 7, v + 7, 9);
NN_DOTREV_S3_A3_1X1(b, a, res[23], 0, b, a, u + 8, v + 8, 8);
NN_DOTREV_S3_A3_1X1(b, a, res[24], 0, b, a, u + 9, v + 9, 7);
NN_DOTREV_S3_A3_1X1(b, a, res[25], 0, b, a, u + 10, v + 10, 6);
NN_DOTREV_S3_A3_1X1(b, a, res[26], 0, b, a, u + 11, v + 11, 5);
NN_DOTREV_S3_A3_1X1(b, a, res[27], 0, b, a, u + 12, v + 12, 4);
NN_DOTREV_S3_A3_1X1(b, a, res[28], 0, b, a, u + 13, v + 13, 3);
NN_DOTREV_S3_A3_1X1(b, a, res[29], 0, b, a, u + 14, v + 14, 2);
NN_ADDMUL_S2_A2_1X1(res[31], res[30], b, a, u[15], v[15]);
}
void flint_mpn_mulhigh_17(mp_ptr res, mp_srcptr u, mp_srcptr v)
{
mp_limb_t a, b;
NN_DOTREV_S3_1X1(b, a, res[16], u, v, 17);
NN_DOTREV_S3_A3_1X1(b, a, res[17], 0, b, a, u + 1, v + 1, 16);
NN_DOTREV_S3_A3_1X1(b, a, res[18], 0, b, a, u + 2, v + 2, 15);
NN_DOTREV_S3_A3_1X1(b, a, res[19], 0, b, a, u + 3, v + 3, 14);
NN_DOTREV_S3_A3_1X1(b, a, res[20], 0, b, a, u + 4, v + 4, 13);
NN_DOTREV_S3_A3_1X1(b, a, res[21], 0, b, a, u + 5, v + 5, 12);
NN_DOTREV_S3_A3_1X1(b, a, res[22], 0, b, a, u + 6, v + 6, 11);
NN_DOTREV_S3_A3_1X1(b, a, res[23], 0, b, a, u + 7, v + 7, 10);
NN_DOTREV_S3_A3_1X1(b, a, res[24], 0, b, a, u + 8, v + 8, 9);
NN_DOTREV_S3_A3_1X1(b, a, res[25], 0, b, a, u + 9, v + 9, 8);
NN_DOTREV_S3_A3_1X1(b, a, res[26], 0, b, a, u + 10, v + 10, 7);
NN_DOTREV_S3_A3_1X1(b, a, res[27], 0, b, a, u + 11, v + 11, 6);
NN_DOTREV_S3_A3_1X1(b, a, res[28], 0, b, a, u + 12, v + 12, 5);
NN_DOTREV_S3_A3_1X1(b, a, res[29], 0, b, a, u + 13, v + 13, 4);
NN_DOTREV_S3_A3_1X1(b, a, res[30], 0, b, a, u + 14, v + 14, 3);
NN_DOTREV_S3_A3_1X1(b, a, res[31], 0, b, a, u + 15, v + 15, 2);
NN_ADDMUL_S2_A2_1X1(res[33], res[32], b, a, u[16], v[16]);
}
void flint_mpn_mulhigh_18(mp_ptr res, mp_srcptr u, mp_srcptr v)
{
mp_limb_t a, b;
NN_DOTREV_S3_1X1(b, a, res[17], u, v, 18);
NN_DOTREV_S3_A3_1X1(b, a, res[18], 0, b, a, u + 1, v + 1, 17);
NN_DOTREV_S3_A3_1X1(b, a, res[19], 0, b, a, u + 2, v + 2, 16);
NN_DOTREV_S3_A3_1X1(b, a, res[20], 0, b, a, u + 3, v + 3, 15);
NN_DOTREV_S3_A3_1X1(b, a, res[21], 0, b, a, u + 4, v + 4, 14);
NN_DOTREV_S3_A3_1X1(b, a, res[22], 0, b, a, u + 5, v + 5, 13);
NN_DOTREV_S3_A3_1X1(b, a, res[23], 0, b, a, u + 6, v + 6, 12);
NN_DOTREV_S3_A3_1X1(b, a, res[24], 0, b, a, u + 7, v + 7, 11);
NN_DOTREV_S3_A3_1X1(b, a, res[25], 0, b, a, u + 8, v + 8, 10);
NN_DOTREV_S3_A3_1X1(b, a, res[26], 0, b, a, u + 9, v + 9, 9);
NN_DOTREV_S3_A3_1X1(b, a, res[27], 0, b, a, u + 10, v + 10, 8);
NN_DOTREV_S3_A3_1X1(b, a, res[28], 0, b, a, u + 11, v + 11, 7);
NN_DOTREV_S3_A3_1X1(b, a, res[29], 0, b, a, u + 12, v + 12, 6);
NN_DOTREV_S3_A3_1X1(b, a, res[30], 0, b, a, u + 13, v + 13, 5);
NN_DOTREV_S3_A3_1X1(b, a, res[31], 0, b, a, u + 14, v + 14, 4);
NN_DOTREV_S3_A3_1X1(b, a, res[32], 0, b, a, u + 15, v + 15, 3);
NN_DOTREV_S3_A3_1X1(b, a, res[33], 0, b, a, u + 16, v + 16, 2);
NN_ADDMUL_S2_A2_1X1(res[35], res[34], b, a, u[17], v[17]);
}
void flint_mpn_mulhigh_19(mp_ptr res, mp_srcptr u, mp_srcptr v)
{
mp_limb_t a, b;
NN_DOTREV_S3_1X1(b, a, res[18], u, v, 19);
NN_DOTREV_S3_A3_1X1(b, a, res[19], 0, b, a, u + 1, v + 1, 18);
NN_DOTREV_S3_A3_1X1(b, a, res[20], 0, b, a, u + 2, v + 2, 17);
NN_DOTREV_S3_A3_1X1(b, a, res[21], 0, b, a, u + 3, v + 3, 16);
NN_DOTREV_S3_A3_1X1(b, a, res[22], 0, b, a, u + 4, v + 4, 15);
NN_DOTREV_S3_A3_1X1(b, a, res[23], 0, b, a, u + 5, v + 5, 14);
NN_DOTREV_S3_A3_1X1(b, a, res[24], 0, b, a, u + 6, v + 6, 13);
NN_DOTREV_S3_A3_1X1(b, a, res[25], 0, b, a, u + 7, v + 7, 12);
NN_DOTREV_S3_A3_1X1(b, a, res[26], 0, b, a, u + 8, v + 8, 11);
NN_DOTREV_S3_A3_1X1(b, a, res[27], 0, b, a, u + 9, v + 9, 10);
NN_DOTREV_S3_A3_1X1(b, a, res[28], 0, b, a, u + 10, v + 10, 9);
NN_DOTREV_S3_A3_1X1(b, a, res[29], 0, b, a, u + 11, v + 11, 8);
NN_DOTREV_S3_A3_1X1(b, a, res[30], 0, b, a, u + 12, v + 12, 7);
NN_DOTREV_S3_A3_1X1(b, a, res[31], 0, b, a, u + 13, v + 13, 6);
NN_DOTREV_S3_A3_1X1(b, a, res[32], 0, b, a, u + 14, v + 14, 5);
NN_DOTREV_S3_A3_1X1(b, a, res[33], 0, b, a, u + 15, v + 15, 4);
NN_DOTREV_S3_A3_1X1(b, a, res[34], 0, b, a, u + 16, v + 16, 3);
NN_DOTREV_S3_A3_1X1(b, a, res[35], 0, b, a, u + 17, v + 17, 2);
NN_ADDMUL_S2_A2_1X1(res[37], res[36], b, a, u[18], v[18]);
}
void
flint_mpn_mulhigh_n_classical(mp_ptr res, mp_srcptr u, mp_srcptr v, mp_size_t n)
{
slong i;
res += n - 1;
umul_ppmm(res[1], res[0], u[n - 1], v[0]);
for (i = 2 ; i <= n ; i++)
res[i] = mpn_addmul_1(res, u + n - i, i, v[i - 1]);
}
void flint_mpn_mulhigh_n_basecase(mp_ptr rp, mp_srcptr np, mp_srcptr mp, mp_size_t n)
{
switch (n)
{
case 0: FLINT_UNREACHABLE;
case 1: flint_mpn_mulhigh_1(rp, np, mp); break;
case 2: flint_mpn_mulhigh_2(rp, np, mp); break;
case 3: flint_mpn_mulhigh_3(rp, np, mp); break;
case 4: flint_mpn_mulhigh_4(rp, np, mp); break;
case 5: flint_mpn_mulhigh_5(rp, np, mp); break;
case 6: flint_mpn_mulhigh_6(rp, np, mp); break;
case 7: flint_mpn_mulhigh_7(rp, np, mp); break;
case 8: flint_mpn_mulhigh_8(rp, np, mp); break;
case 9: flint_mpn_mulhigh_9(rp, np, mp); break;
case 10: flint_mpn_mulhigh_10(rp, np, mp); break;
case 11: flint_mpn_mulhigh_11(rp, np, mp); break;
case 12: flint_mpn_mulhigh_12(rp, np, mp); break;
case 13: flint_mpn_mulhigh_13(rp, np, mp); break;
case 14: flint_mpn_mulhigh_14(rp, np, mp); break;
case 15: flint_mpn_mulhigh_15(rp, np, mp); break;
case 16: flint_mpn_mulhigh_16(rp, np, mp); break;
case 17: flint_mpn_mulhigh_17(rp, np, mp); break;
case 18: flint_mpn_mulhigh_18(rp, np, mp); break;
case 19: flint_mpn_mulhigh_19(rp, np, mp); break;
default: flint_mpn_mulhigh_n_classical(rp, np, mp, n); break;
}
}
#if 0
FLINT_FORCE_INLINE
void flint_mpn_mulhigh_1(mp_ptr res, mp_srcptr u, mp_srcptr v)
{
umul_ppmm(res[1], res[0], u[0], v[0]);
}
FLINT_FORCE_INLINE
void flint_mpn_mulhigh_2(mp_ptr res, mp_srcptr u, mp_srcptr v)
{
umul_ppmm(res[1], res[0], u[1], v[0]);
res[2] = flint_mpn_inline_addmul_1(res, u + 0, 2, v[1]);
}
FLINT_FORCE_INLINE
void flint_mpn_mulhigh_3(mp_ptr res, mp_srcptr u, mp_srcptr v)
{
umul_ppmm(res[1], res[0], u[2], v[0]);
res[2] = flint_mpn_inline_addmul_1(res, u + 1, 2, v[1]);
res[3] = flint_mpn_inline_addmul_1(res, u + 0, 3, v[2]);
}
FLINT_FORCE_INLINE
void flint_mpn_mulhigh_4(mp_ptr res, mp_srcptr u, mp_srcptr v)
{
umul_ppmm(res[1], res[0], u[3], v[0]);
res[2] = flint_mpn_inline_addmul_1(res, u + 2, 2, v[1]);
res[3] = flint_mpn_inline_addmul_1(res, u + 1, 3, v[2]);
res[4] = flint_mpn_inline_addmul_1(res, u + 0, 4, v[3]);
}
void flint_mpn_mulhigh_5(mp_ptr res, mp_srcptr u, mp_srcptr v)
{
umul_ppmm(res[1], res[0], u[4], v[0]);
res[2] = flint_mpn_inline_addmul_1(res, u + 3, 2, v[1]);
res[3] = flint_mpn_inline_addmul_1(res, u + 2, 3, v[2]);
res[4] = flint_mpn_inline_addmul_1(res, u + 1, 4, v[3]);
res[5] = flint_mpn_inline_addmul_1(res, u + 0, 5, v[4]);
}
void flint_mpn_mulhigh_6a(mp_ptr res, mp_srcptr u, mp_srcptr v)
{
umul_ppmm(res[1], res[0], u[5], v[0]);
res[2] = flint_mpn_inline_addmul_1(res, u + 4, 2, v[1]);
res[3] = flint_mpn_inline_addmul_1(res, u + 3, 3, v[2]);
res[4] = flint_mpn_inline_addmul_1(res, u + 2, 4, v[3]);
res[5] = flint_mpn_inline_addmul_1(res, u + 1, 5, v[4]);
res[6] = flint_mpn_inline_addmul_1(res, u + 0, 6, v[5]);
}
#endif
/* Tuning values against GMP 6.3.0 and fft_small on Zen3. */
/* Table stores k / 4 so that entries can fit in a byte. */
#define KTAB_LEN 800
static unsigned char flint_mpn_mulhigh_ktab[KTAB_LEN] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 12, 0, 0, 0, 0, 0, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 12, 12, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
14, 14, 14, 14, 16, 14, 14, 16, 16, 16, 16, 16, 16, 17, 19, 16, 19, 19, 18, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 20, 19,
20, 19, 22, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 27, 24, 24, 24, 24, 25, 24, 27, 24, 24, 27, 24, 27, 27, 24, 27, 27, 27, 27,
27, 27, 26, 27, 24, 27, 27, 27, 27, 26, 27, 29, 28, 29, 28, 27, 27, 27, 29, 29, 30, 28, 27, 27, 27, 28, 28, 27, 33, 27, 27, 33,
28, 34, 36, 30, 29, 35, 36, 37, 35, 36, 35, 35, 36, 37, 36, 37, 36, 36, 37, 37, 36, 37, 38, 37, 37, 34, 39, 38, 36, 37, 37, 38,
37, 36, 37, 37, 36, 37, 37, 38, 37, 38, 38, 37, 36, 38, 36, 35, 36, 37, 43, 47, 47, 47, 39, 47, 47, 38, 47, 47, 47, 47, 43, 47,
47, 47, 46, 47, 47, 47, 43, 47, 47, 47, 43, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 51, 47, 47, 47, 47, 47, 47, 47, 55, 47, 55,
51, 55, 46, 47, 51, 51, 51, 51, 55, 51, 51, 51, 55, 47, 51, 51, 47, 47, 55, 55, 55, 47, 55, 54, 51, 55, 55, 54, 63, 59, 55, 59,
55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 64, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 67, 55, 64, 67, 64, 67, 67, 63,
67, 66, 59, 66, 67, 67, 66, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 63, 67, 67, 67, 67, 67, 67, 71, 71, 71, 67, 67, 72, 67,
67, 67, 67, 71, 67, 75, 72, 75, 75, 75, 71, 76, 75, 76, 75, 71, 71, 75, 82, 75, 82, 75, 75, 75, 76, 82, 75, 75, 82, 75, 82, 82,
76, 82, 75, 82, 82, 82, 82, 82, 76, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82,
82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 94, 110, 108, 112, 109, 110, 112, 108, 111, 109,
112, 110, 111, 112, 113, 110, 112, 112, 110, 113, 114, 115, 115, 113, 112, 117, 115, 118, 115, 118, 116, 119, 115, 119, 117, 120, 120, 118, 119, 119, 120, 119,
118, 119, 120, 119, 119, 118, 120, 120, 119, 120, 120, 120, 120, 120, 119, 119, 120, 119, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120,
120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120,
120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 141, 120, 140, 136, 140, 140, 139, 138, 139, 140, 141, 140, 144, 141, 142, 144, 144, 144,
141, 144, 144, 144, 144, 142, 144, 144, 144, 144, 144, 144, 144, 143, 143, 144, 143, 144, 144, 144, 144, 144, 144, 144, 144, 143, 144, 144, 144, 144, 144, 143,
144, 144, 143, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 141, 144, 144, 144, 144, 144, 144, 144, 144, 144,
144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 142, 143, 144, 144, 144, 144, 142, 144, 143, 143, 144, 144, 144, 140, 141, 141, 141,
144, 143, 139, 143, 142, 144, 139, 142, 139, 141, 140, 141, 144, 142, 142, 143, 143, 144, 143, 144, 143, 144, 141, 143, 143, 143, 141, 142, 144, 143, 142, 144,
140, 144, 143, 140, 142, 141, 141, 144, 142, 142, 144, 143, 144, 143, 142, 142, 142, 142, 143, 143, 143, 144, 144, 144, 140, 141, 142, 140, 143, 142, 142, 144,
144, 144, 144, 255, 255, 255, 255, 255, 255, 143, 255, 255, 255, 255, 255, 194, 255, 194, 193, 193, 193, 194, 195, 194, 193, 195, 255, 194, 194, 193, 194, 195,
};
void flint_mpn_mulhigh_n(mp_ptr rp, mp_srcptr np, mp_srcptr mp, mp_size_t n);
void
flint_mpn_mulhigh_n_recursive(mp_ptr res, mp_srcptr u, mp_srcptr v, mp_size_t n, int k)
{
mp_limb_t cy;
mp_size_t l;
if (k == 0)
{
flint_mpn_mulhigh_n_basecase(res, u, v, n);
return;
}
else if (k == 255)
{
flint_mpn_mul_n(res, u, v, n);
return;
}
/* Required for correctness (see MPFR). */
k = FLINT_MAX(k, (n + 4) / 2);
k = FLINT_MIN(k, n - 1);
l = n - k;
flint_mpn_mul_n(res + 2 * l, u + l, v + l, k);
flint_mpn_mulhigh_n(res, u + k, v, l);
cy = mpn_add_n(res + n - 1, res + n - 1, res + l - 1, l + 1);
flint_mpn_mulhigh_n(res, u, v + k, l);
cy += mpn_add_n(res + n - 1, res + n - 1, res + l - 1, l + 1);
mpn_add_1(res + n + l, res + n + l, k, cy);
}
void
flint_mpn_mulhigh_n(mp_ptr res, mp_srcptr u, mp_srcptr v, mp_size_t n)
{
int k;
switch (n)
{
case 1: flint_mpn_mulhigh_1(res, u, v); break;
case 2: flint_mpn_mulhigh_2(res, u, v); break;
case 3: flint_mpn_mulhigh_3(res, u, v); break;
case 4: flint_mpn_mulhigh_4(res, u, v); break;
case 5: flint_mpn_mulhigh_5(res, u, v); break;
case 6: flint_mpn_mulhigh_6(res, u, v); break;
case 7: flint_mpn_mulhigh_7(res, u, v); break;
case 8: flint_mpn_mulhigh_8(res, u, v); break;
case 9: flint_mpn_mulhigh_9(res, u, v); break;
case 10: flint_mpn_mulhigh_10(res, u, v); break;
case 11: flint_mpn_mulhigh_11(res, u, v); break;
case 12: flint_mpn_mulhigh_12(res, u, v); break;
case 13: flint_mpn_mulhigh_13(res, u, v); break;
case 14: flint_mpn_mulhigh_14(res, u, v); break;
case 15: flint_mpn_mulhigh_15(res, u, v); break;
case 16: flint_mpn_mulhigh_16(res, u, v); break;
case 17: flint_mpn_mulhigh_17(res, u, v); break;
case 18: flint_mpn_mulhigh_18(res, u, v); break;
case 19: flint_mpn_mulhigh_19(res, u, v); break;
default:
{
if (n < KTAB_LEN)
{
k = (int) flint_mpn_mulhigh_ktab[n];
if (k != 255)
k *= 4;
if (k == 0)
flint_mpn_mulhigh_n_basecase(res, u, v, n);
else
flint_mpn_mulhigh_n_recursive(res, u, v, n, k);
}
else
{
flint_mpn_mul_n(res, u, v, n);
}
}
}
}
void mpfr_mulhigh_n (mp_ptr rp, mp_srcptr np, mp_srcptr mp, mp_size_t n);
void mpfr_sqrhigh_n(mp_ptr rp, mp_srcptr np, mp_size_t n);
#undef TIMEIT_END_REPEAT
#undef TIMEIT_STOP_VALUES
#define TIMEIT_END_REPEAT(__timer, __reps) \
} \
timeit_stop(__timer); \
if (__timer->cpu >= 10) \
break; \
__reps *= 10; \
} \
} while (0);
#define TIMEIT_STOP_VALUES(tcpu, twall) \
TIMEIT_END_REPEAT(__timer, __reps) \
(tcpu) = __timer->cpu*0.001 / __reps; \
(twall) = __timer->wall*0.001 / __reps; \
} while (0);
slong next_r(slong r)
{
if (r <= 15)
return r + 1;
if ((r & (r - 1)) == 0)
r = (3 * r) / 2;
else
r = (4 * r) / 3;
return r;
}
int main()
{
int iter;
flint_rand_t state;
flint_randinit(state);
for (iter = 0; iter < 100000; iter++)
{
slong i, N;
mp_ptr X, Y, R1, R2, ERR, TOL;
if (n_randint(state, 100) == 0)
N = 1 + n_randtest(state) % 1000;
else if (n_randint(state, 10) == 0)
N = 1 + n_randtest(state) % 100;
else
N = 1 + n_randtest(state) % 20;
X = flint_malloc(sizeof(mp_limb_t) * N);
Y = flint_malloc(sizeof(mp_limb_t) * N);
R1 = flint_malloc(sizeof(mp_limb_t) * 2 * N);
R2 = flint_malloc(sizeof(mp_limb_t) * 2 * N);
ERR = flint_malloc(sizeof(mp_limb_t) * (N + 1));
TOL = flint_malloc(sizeof(mp_limb_t) * (N + 1));
for (i = 0; i < N; i++)
{
X[i] = n_randtest(state);
Y[i] = n_randtest(state);
}
for (i = 0; i < 2 * N; i++)
R1[i] = n_randtest(state);
flint_mpn_mulhigh_n(R1, X, Y, N);
flint_mpn_mul_n(R2, X, Y, N);
ERR[N] = mpn_sub_n(ERR, R2 + N, R1 + N, N);
mpn_zero(TOL + 1, N);
TOL[0] = N - 1;
if (mpn_cmp(ERR, TOL, N + 1) > 0)
{
flint_printf("FAIL: N = %wd\n", N);
flint_printf("X = "); flint_mpn_debug(X, N);
flint_printf("Y = "); flint_mpn_debug(Y, N);
flint_printf("R1 = "); flint_mpn_debug(R1, 2 * N);
flint_printf("R2 = "); flint_mpn_debug(R2, 2 * N);
flint_printf("ERR = "); flint_mpn_debug(ERR, N + 1);
flint_printf("TOL = "); flint_mpn_debug(TOL, N + 1);
flint_abort();
}
flint_free(X);
flint_free(Y);
flint_free(R1);
flint_free(R2);
flint_free(ERR);
flint_free(TOL);
}
flint_printf("OK\n");
{
slong i, n;
mp_limb_t x[4000], y[2000], z[2000];
for (i = 0; i < 2000; i++)
{
x[i] = n_randtest(state);
y[i] = n_randtest(state);
z[i] = n_randtest(state);
}
flint_printf("\n");
flint_printf("n, flint_mpn_mul_n, flint_mpn_mul_n / mpfr, flint_mpn_mul_n / flint_mpn_mulhigh_n\n");
for (n = 1; n <= 2000; n = next_r(n))
{
double t1, t2, t3, tt, __attribute__((unused)) __;
TIMEIT_START
flint_mpn_mul_n(z, x, y, n);
TIMEIT_STOP_VALUES(__, t1)
TIMEIT_START
flint_mpn_mul_n(z, x, y, n);
TIMEIT_STOP_VALUES(__, tt)
t1 = FLINT_MIN(t1, tt);
TIMEIT_START
mpfr_mulhigh_n(z, x, y, n);
TIMEIT_STOP_VALUES(__, t2)
TIMEIT_START
mpfr_mulhigh_n(z, x, y, n);
TIMEIT_STOP_VALUES(__, tt)
t2 = FLINT_MIN(t2, tt);
TIMEIT_START
flint_mpn_mulhigh_n(z, x, y, n);
TIMEIT_STOP_VALUES(__, t3)
TIMEIT_START
flint_mpn_mulhigh_n(z, x, y, n);
TIMEIT_STOP_VALUES(__, tt)
t3 = FLINT_MIN(t3, tt);
flint_printf("%wd %g %.3f %.3f %s\n", n, t1, t1 / t2, t1 / t3, (t3 < 1.01 * t2 ? " " : "!!!!!"));
}
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment