Skip to content

Instantly share code, notes, and snippets.

@fredrik-johansson
Last active December 8, 2023 22:24
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save fredrik-johansson/952587923f3e90a9f76e9302bb72a549 to your computer and use it in GitHub Desktop.
Save fredrik-johansson/952587923f3e90a9f76e9302bb72a549 to your computer and use it in GitHub Desktop.
mpn instrinsics
#include <x86intrin.h>
#include "flint.h"
#include "longlong.h"
#include "ulong_extras.h"
#include "mpn_extras.h"
#include "profiler.h"
#define umul_ppmm_mulx(w1, w0, u, v) \
__asm__ ("mulx\t%3, %q0, %q1" \
: "=r" (w0), "=r" (w1) \
: "%d" ((ulong)(u)), "rm" ((ulong)(v)))
#define umul_ppmm_int128(r1, r0, u, v) do { __uint128_t __rh = (__uint128_t) u * (__uint128_t) v; (r0) = (ulong) __rh; (r1) = (ulong) (__rh >> 64); } while (0)
FLINT_FORCE_INLINE unsigned char n_addc_intrinsic(unsigned char cf, ulong x, ulong y, ulong* s)
{
long long unsigned int _s;
cf = _addcarry_u64(cf, (long long unsigned int)(x),
(long long unsigned int)(y),
&_s);
*s = (ulong)(_s);
return cf;
}
#if 1
/* code suggested on https://gcc.gnu.org/onlinedocs/gcc/Integer-Overflow-Builtins.html */
#define __builtin_addcl(a, b, carry_in, carry_out) \
({ __typeof__ (a) s; \
__typeof__ (a) c1 = __builtin_add_overflow (a, b, &s); \
__typeof__ (a) c2 = __builtin_add_overflow (s, carry_in, &s); \
*(carry_out) = c1 | c2; \
s; })
#endif
FLINT_FORCE_INLINE unsigned char n_addc_builtin(unsigned char cf, ulong x, ulong y, ulong* s)
{
ulong cy;
*s = __builtin_addcl(x, y, cf, &cy);
return cy;
}
/* generic version from GMP */
mp_limb_t
mpn_addmul_1_a(mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t v0)
{
mp_limb_t u0, crec, c, p1, p0, r0;
crec = 0;
do
{
u0 = *up++;
umul_ppmm(p1, p0, u0, v0);
r0 = *rp;
p0 = r0 + p0;
c = r0 > p0;
p1 = p1 + c;
r0 = p0 + crec;
c = p0 > r0;
crec = p1 + c;
*rp++ = r0;
}
while (--n != 0);
return crec;
}
mp_limb_t
mpn_addmul_1_b(mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t v0)
{
mp_limb_t u0, crec, c, p1, p0, r0;
crec = 0;
do
{
u0 = *up++;
umul_ppmm_mulx(p1, p0, u0, v0);
r0 = *rp;
p0 = r0 + p0;
c = r0 > p0;
p1 = p1 + c;
r0 = p0 + crec;
c = p0 > r0;
crec = p1 + c;
*rp++ = r0;
}
while (--n != 0);
return crec;
}
mp_limb_t
mpn_addmul_1_c(mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t v0)
{
mp_limb_t u0, crec, c, p1, p0, r0;
crec = 0;
do
{
u0 = *up++;
umul_ppmm_int128(p1, p0, u0, v0);
r0 = *rp;
p0 = r0 + p0;
c = r0 > p0;
p1 = p1 + c;
r0 = p0 + crec;
c = p0 > r0;
crec = p1 + c;
*rp++ = r0;
}
while (--n != 0);
return crec;
}
mp_limb_t
mpn_addmul_1_d(mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t v0)
{
mp_limb_t u0, crec, c, p1, p0, r0;
crec = 0;
slong i;
for (i = 0; i < n; i++)
{
u0 = up[i];
umul_ppmm(p1, p0, u0, v0);
r0 = rp[i];
c = n_addc_intrinsic(0, r0, p0, &p0);
p1 += c;
c = n_addc_intrinsic(0, p0, crec, &r0);
crec = p1 + c;
rp[i] = r0;
}
return crec;
}
mp_limb_t
mpn_addmul_1_e(mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t v0)
{
mp_limb_t u0, crec, c, p1, p0, r0;
crec = 0;
slong i;
for (i = 0; i < n; i++)
{
u0 = up[i];
umul_ppmm_mulx(p1, p0, u0, v0);
r0 = rp[i];
c = n_addc_intrinsic(0, r0, p0, &p0);
p1 += c;
c = n_addc_intrinsic(0, p0, crec, &r0);
crec = p1 + c;
rp[i] = r0;
}
return crec;
}
mp_limb_t
mpn_addmul_1_f(mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t v0)
{
mp_limb_t u0, crec, c, p1, p0, r0;
crec = 0;
slong i;
for (i = 0; i < n; i++)
{
u0 = up[i];
umul_ppmm_int128(p1, p0, u0, v0);
r0 = rp[i];
c = n_addc_intrinsic(0, r0, p0, &p0);
p1 += c;
c = n_addc_intrinsic(0, p0, crec, &r0);
crec = p1 + c;
rp[i] = r0;
}
return crec;
}
mp_limb_t
mpn_addmul_1_g(mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t v0)
{
mp_limb_t u0, crec, c, p1, p0, r0;
crec = 0;
slong i;
for (i = 0; i < n; i++)
{
u0 = up[i];
umul_ppmm(p1, p0, u0, v0);
r0 = rp[i];
c = n_addc_builtin(0, r0, p0, &p0);
p1 += c;
c = n_addc_builtin(0, p0, crec, &r0);
crec = p1 + c;
rp[i] = r0;
}
return crec;
}
mp_limb_t
mpn_addmul_1_h(mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t v0)
{
mp_limb_t u0, crec, c, p1, p0, r0;
crec = 0;
slong i;
for (i = 0; i < n; i++)
{
u0 = up[i];
umul_ppmm_mulx(p1, p0, u0, v0);
r0 = rp[i];
c = n_addc_builtin(0, r0, p0, &p0);
p1 += c;
c = n_addc_builtin(0, p0, crec, &r0);
crec = p1 + c;
rp[i] = r0;
}
return crec;
}
mp_limb_t
mpn_addmul_1_i(mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t v0)
{
mp_limb_t u0, crec, c, p1, p0, r0;
crec = 0;
slong i;
for (i = 0; i < n; i++)
{
u0 = up[i];
umul_ppmm_int128(p1, p0, u0, v0);
r0 = rp[i];
c = n_addc_builtin(0, r0, p0, &p0);
p1 += c;
c = n_addc_builtin(0, p0, crec, &r0);
crec = p1 + c;
rp[i] = r0;
}
return crec;
}
#define NMAX 256
int main()
{
mp_limb_t X[NMAX], Y[NMAX], Z[NMAX], d, c = 0;
slong i, N;
double tcpu, twall, twall_gmp;
flint_rand_t state;
flint_randinit(state);
for (i = 0; i < 256; i++)
{
X[i] = n_randlimb(state);
Y[i] = n_randlimb(state);
Z[i] = n_randlimb(state);
}
d = n_randlimb(state);
for (N = 1; N <= NMAX; N = FLINT_MAX(N + 1, N + N / 2))
{
// N = 1;
for (i = 0; i < 5; i++)
{
flint_printf("%5wd", N);
TIMEIT_START
c += mpn_addmul_1(Y, X, N, d);
d ^= c;
TIMEIT_STOP_VALUES(tcpu, twall_gmp)
TIMEIT_START
c += mpn_addmul_1_a(Y, X, N, d);
d ^= c;
TIMEIT_STOP_VALUES(tcpu, twall)
flint_printf(" %6.3g", twall / twall_gmp);
TIMEIT_START
c += mpn_addmul_1_b(Y, X, N, d);
d ^= c;
TIMEIT_STOP_VALUES(tcpu, twall)
flint_printf(" %6.3g", twall / twall_gmp);
TIMEIT_START
c += mpn_addmul_1_c(Y, X, N, d);
d ^= c;
TIMEIT_STOP_VALUES(tcpu, twall)
flint_printf(" %6.3g", twall / twall_gmp);
TIMEIT_START
c += mpn_addmul_1_d(Y, X, N, d);
d ^= c;
TIMEIT_STOP_VALUES(tcpu, twall)
flint_printf(" %6.3g", twall / twall_gmp);
TIMEIT_START
c += mpn_addmul_1_e(Y, X, N, d);
d ^= c;
TIMEIT_STOP_VALUES(tcpu, twall)
flint_printf(" %6.3g", twall / twall_gmp);
TIMEIT_START
c += mpn_addmul_1_f(Y, X, N, d);
d ^= c;
TIMEIT_STOP_VALUES(tcpu, twall)
flint_printf(" %6.3g", twall / twall_gmp);
TIMEIT_START
c += mpn_addmul_1_g(Y, X, N, d);
d ^= c;
TIMEIT_STOP_VALUES(tcpu, twall)
flint_printf(" %6.3g", twall / twall_gmp);
TIMEIT_START
c += mpn_addmul_1_h(Y, X, N, d);
d ^= c;
TIMEIT_STOP_VALUES(tcpu, twall)
flint_printf(" %6.3g", twall / twall_gmp);
TIMEIT_START
c += mpn_addmul_1_i(Y, X, N, d);
d ^= c;
TIMEIT_STOP_VALUES(tcpu, twall)
flint_printf(" %6.3g", twall / twall_gmp);
flint_printf("\n");
}
}
flint_printf("c = %wu\n", c);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment