tthsqe12/fq_nmod_mpoly_mul.txt

## fq_nmod_mpoly_mul.txt
#include "n_poly.h"
#include "nmod_vec.h"


#define MAC3(h, m, l, a, b)                         \
{                                                   \
    mp_limb_t p1, p0;                               \
    umul_ppmm(p1, p0, a, b);                        \
    add_sssaaaaaa(h, m, l, h, m, l, 0, p1, p0);     \
}


#define MAC2(h, l, a, b)            \
{                                   \
    mp_limb_t p1, p0;               \
    umul_ppmm(p1, p0, a, b);        \
    add_ssaaaa(h, l, h, l, p1, p0); \
}


void n_fq_print_pretty(
    const mp_limb_t * a,
    const fq_nmod_ctx_t ctx)
{
    slong d = fq_nmod_ctx_degree(ctx);
    slong i;
    int first;

    first = 1;
    for (i = d - 1; i >= 0; i--)
    {
        if (a[i] == 0)
            continue;

        if (!first)
            flint_printf("+");

        first = 0;
        flint_printf("%wu", a[i]);

        if (i > 0)
        {
            flint_printf("*%s", ctx->var);
            if (i > 1)
                flint_printf("^%wd", i);
        }
    }

    if (first)
        flint_printf("0");
}


void n_fq_get_fq_nmod(
    fq_nmod_t a,
    const mp_limb_t * b,
    const fq_nmod_ctx_t ctx)
{
    slong i;
    slong d = fq_nmod_ctx_degree(ctx);

    nmod_poly_fit_length(a, d);

    for (i = 0; i < d; i++)
        a->coeffs[i] = b[i];

    a->length = d;
    _nmod_poly_normalise(a);
}

void n_fq_set_fq_nmod(
    mp_limb_t * a,
    const fq_nmod_t b,
    const fq_nmod_ctx_t ctx)
{
    slong i;
    slong d = fq_nmod_ctx_degree(ctx);

    FLINT_ASSERT(b->length <= d);

    for (i = 0; i < d; i++)
        a[i] = i < b->length ? b->coeffs[i] : 0;
}

void n_fq_add_si(
    mp_limb_t * a,
    const mp_limb_t * b,
    slong c,
    const fq_nmod_ctx_t ctx)
{
    slong d = fq_nmod_ctx_degree(ctx);

    if (a != b)
        _nmod_vec_set(a, b, d);

    if (c < 0)
    {
        ulong cc = -c;
        if (cc >= ctx->mod.n)
            NMOD_RED(cc, cc, ctx->mod);
        a[0] = nmod_sub(a[0], cc, ctx->mod);
    }
    else
    {
        ulong cc = c;
        if (cc >= ctx->mod.n)
            NMOD_RED(cc, cc, ctx->mod);
        a[0] = nmod_add(a[0], cc, ctx->mod);
    }
}

void _n_fq_reduce(
    mp_limb_t * a,
    mp_limb_t * b,
    slong blen,
    const fq_nmod_ctx_t ctx,
    mp_limb_t * t)  /* length 2d */
{
    slong i, j, k, deg = ctx->modulus->length - 1;
    slong d = ctx->j[ctx->len - 1];

    FLINT_ASSERT(a != b);

    FLINT_ASSERT(0 <= blen && blen <= 2*d - 1);
    FLINT_ASSERT(blen == 0 || b[blen - 1] != 0);

    if (blen <= deg)
    {
        for (i = 0; i < blen; i++)
            a[i] = b[i];
        for (i = blen; i < deg; i++)
            a[i] = 0;
    }
    else if (ctx->sparse_modulus)
    {
        nmod_t mod = ctx->mod;

        for (k = ctx->len - 2; k >= 0; k--)
            t[k] = mod.n - ctx->a[k];

        for (i = blen - 1; i >= d; i--)
        {
            for (k = ctx->len - 2; k >= 0; k--)
                NMOD_ADDMUL(b[ctx->j[k] + i - d], b[i], t[k], mod);

            b[i] = 0;
        }

        for (i = 0; i < deg; i++)
            a[i] = b[i];
    }
    else
    {
/*
        _nmod_poly_divrem_newton_n_preinv(t, a, b, blen,
                                          ctx->modulus->coeffs, ctx->modulus->length,
                                          ctx->inv->coeffs, ctx->inv->length,
                                          ctx->mod);
*/
        mp_limb_t * Q = t;
        mp_limb_t * R = a;
        const mp_limb_t * A = b;
        slong lenA = blen;
        const mp_limb_t * B = ctx->modulus->coeffs;
        slong lenB = deg + 1;
        const mp_limb_t * Binv = ctx->inv->coeffs;
        slong lenBinv = ctx->inv->length;

        const slong lenQ = lenA - lenB + 1;

        FLINT_ASSERT(lenBinv > 0);
        FLINT_ASSERT(lenQ > 0);

        if (lenQ <= 2)
        {
            if (lenQ == 2)
                _nmod_poly_divrem_q1(Q, R, A, lenA, B, lenB, ctx->mod);
            else
                _nmod_poly_divrem_q0(Q, R, A, B, lenB, ctx->mod);
            return;
        }

        if (deg < 20)
        {
            for (i = 0; i < lenQ; i++)
            {
                mp_limb_t t2 = 0, t1 = 0, t0 = 0;
                j = FLINT_MAX(0, i - lenBinv + 1);
                umul_ppmm(t1, t0, A[lenA - 1 - j], Binv[i - j]);
                for (j++; j <= i; j++)
                    MAC3(t2, t1, t0, A[lenA - 1 - j], Binv[i - j]);
                NMOD_RED3(Q[lenQ - 1 - i], t2, t1, t0, ctx->mod);
            }

            for (i = 0; i < deg; i++)
            {
                mp_limb_t t2 = 0, t1 = 0, t0 = 0;
                for (j = FLINT_MAX(0, i - lenQ + 1); j <= i; j++)
                    MAC3(t2, t1, t0, B[j], Q[i - j]);
                NMOD_RED3(t0, t2, t1, t0, ctx->mod);
                R[i] = nmod_sub(A[i], t0, ctx->mod);
            }
        }
        else
        {
            mp_ptr Arev = t + d;

            _nmod_poly_reverse(Arev, A + (lenA - lenQ), lenQ, lenQ);

            _nmod_poly_mullow(Q, Arev, lenQ, Binv, FLINT_MIN(lenQ, lenBinv), lenQ, ctx->mod);

            _nmod_poly_reverse(Q, Q, lenQ, lenQ);

            FLINT_ASSERT(lenB > 1);
            FLINT_ASSERT(lenQ < lenB - 1);

            _nmod_poly_mullow(R, B, lenB - 1, Q, lenQ, lenB - 1, ctx->mod);

            _nmod_vec_sub(R, A, R, lenB - 1, ctx->mod);
        }
    }
}
void _n_fq_madd2(
    mp_limb_t * a,          /* length 2d */
    const mp_limb_t * b,    /* length d */
    const mp_limb_t * c,    /* length d */
    const fq_nmod_ctx_t ctx,
    mp_limb_t * t)          /* length 2d */
{
    slong d = ctx->modulus->length - 1;
    FLINT_ASSERT(d > 0);
    if (d < 30)
    {
        slong i, j;
        for (i = 0; i + 1 < d; i++)
        {
            mp_limb_t t2 = 0, t1 = 0, t0 = 0;
            mp_limb_t s2 = 0, s1 = 0, s0 = 0;
            umul_ppmm(t1, t0, b[i], c[0]);
            umul_ppmm(s1, s0, b[d - 1], c[d - 1 - i]);

            add_ssaaaa(t1, t0, t1, t0, 0, a[i]);
            add_ssaaaa(s1, s0, s1, s0, 0, a[2*d - 2 - i]);

            for (j = 1; j <= i; j++)
            {
                MAC3(t2, t1, t0, b[i - j], c[0 + j]);
                MAC3(s2, s1, s0, b[d - 1 - j], c[d - 1 - i + j]);
            }
            NMOD_RED3(a[i], t2, t1, t0, ctx->mod);
            NMOD_RED3(a[2*d - 2 - i], s2, s1, s0, ctx->mod);
        }

        {
            mp_limb_t t2 = 0, t1 = 0, t0 = 0;
            umul_ppmm(t1, t0, b[d - 1], c[0]);
            add_ssaaaa(t1, t0, t1, t0, 0, a[d - 1]);
            for (j = 1; j < d; j++)
            {
                MAC3(t2, t1, t0, b[d - 1 - j], c[0 + j]);
            }
            NMOD_RED3(a[d - 1], t2, t1, t0, ctx->mod);
        }
    }
    else
    {
        _nmod_poly_mul(t, b, d, c, d, ctx->mod);
        _nmod_vec_add(a, a, t, 2*d - 1, ctx->mod);
    }
}

void _n_fq_mul2(
    mp_limb_t * a,          /* length 2d */
    const mp_limb_t * b,    /* length d */
    const mp_limb_t * c,    /* length d */
    const fq_nmod_ctx_t ctx)
{
    slong d = fq_nmod_ctx_degree(ctx);
    FLINT_ASSERT(d > 0);
    if (d < 30)
    {
        slong i, j;
        for (i = 0; i + 1 < d; i++)
        {
            mp_limb_t t2 = 0, t1 = 0, t0 = 0;
            mp_limb_t s2 = 0, s1 = 0, s0 = 0;
            umul_ppmm(t1, t0, b[i], c[0]);
            umul_ppmm(s1, s0, b[d - 1], c[d - 1 - i]);
            for (j = 1; j <= i; j++)
            {
                MAC3(t2, t1, t0, b[i - j], c[0 + j]);
                MAC3(s2, s1, s0, b[d - 1 - j], c[d - 1 - i + j]);
            }
            NMOD_RED3(a[i], t2, t1, t0, ctx->mod);
            NMOD_RED3(a[2*d - 2 - i], s2, s1, s0, ctx->mod);
        }

        {
            mp_limb_t t2 = 0, t1 = 0, t0 = 0;
            umul_ppmm(t1, t0, b[d - 1], c[0]);
            for (j = 1; j < d; j++)
            {
                MAC3(t2, t1, t0, b[d - 1 - j], c[0 + j]);
            }
            NMOD_RED3(a[d - 1], t2, t1, t0, ctx->mod);
        }
    }
    else
    {
        _nmod_poly_mul(a, b, d, c, d, ctx->mod);
    }
}

/**************************** lazy *******************************************/

int _n_fq_dot_lazy_size(
    slong len,
    const fq_nmod_ctx_t ctx)
{
    ulong t[4];
    slong d = fq_nmod_ctx_degree(ctx);
    mp_limb_t p = ctx->mod.n;

    if (d > 30 || p < 2 || len < 0)
        return 0;

    umul_ppmm(t[1], t[0], p - 1, p - 1);
    t[2] = mpn_mul_1(t, t, 2, d);
    t[3] = mpn_mul_1(t, t, 3, len);

    if (t[3] != 0)
        return 0;
    if (t[2] != 0)
        return 3;
    if (t[1] != 0)
        return 2;
    return 1;
}


void _n_fq_reduce2_lazy1(
    mp_limb_t * a, /* length 6d, 2d used */
    slong d,
    nmod_t ctx)
{
    slong i;
    for (i = 0; i < 2*d - 1; i++)
        NMOD_RED(a[i], a[i], ctx);
}

void _n_fq_madd2_lazy1(
    mp_limb_t * a,       /* length 6d, 2d used */
    const mp_limb_t * b, /* length d */
    const mp_limb_t * c, /* length d */
    slong d)
{
    slong i, j;

    for (i = 0; i + 1 < d; i++)
    {
        mp_limb_t t0 = 0;
        mp_limb_t s0 = 0;
        t0 = a[i + 0];
        s0 = a[(2*d - 2 - i) + 0];
        t0 += b[i]*c[0];
        s0 += b[d - 1]*c[d - 1 - i];
        for (j = 1; j <= i; j++)
        {
            t0 += b[i - j]*c[0 + j];
            s0 += b[d - 1 - j]*c[d - 1 - i + j];
        }
        a[i + 0] = t0;
        a[(2*d - 2 - i) + 0] = s0;
    }

    {
        mp_limb_t t0 = 0;
        t0 = a[(d - 1) + 0];
        t0 += b[d - 1]*c[0];
        for (j = 1; j < d; j++)
        {
            t0 += b[d - 1 - j]*c[0 + j];
        }
        a[(d - 1) + 0] = t0;
    }
}


void _n_fq_mul2_lazy1(
    mp_limb_t * a,       /* length 6d, 2d used */
    const mp_limb_t * b, /* length d */
    const mp_limb_t * c, /* length d */
    slong d)
{
    slong i,j;

    for (i = 0; i + 1 < d; i++)
    {
        mp_limb_t t0 = 0;
        mp_limb_t s0 = 0;
        t0 = b[i]*c[0];
        s0 = b[d - 1]*c[d - 1 - i];
        for (j = 1; j <= i; j++)
        {
            t0 += b[i - j]*c[0 + j];
            s0 += b[d - 1 - j]*c[d - 1 - i + j];
        }
        a[i + 0] = t0;
        a[(2*d - 2 - i) + 0] = s0;
    }

    {
        mp_limb_t t0 = 0;
        t0 = b[d - 1]*c[0];
        for (j = 1; j < d; j++)
        {
            t0 += b[d - 1 - j]*c[0 + j];
        }
        a[(d - 1) + 0] = t0;
    }
}


void _n_fq_reduce2_lazy2(
    mp_limb_t * a, /* length 6d, 4d used */
    slong d,
    nmod_t ctx)
{
    slong i;
    for (i = 0; i < 2*d - 1; i++)
        NMOD2_RED2(a[i], a[2*i + 1], a[2*i + 0], ctx);
}

void _n_fq_madd2_lazy2(
    mp_limb_t * a,       /* length 6d, 4d used */
    const mp_limb_t * b, /* length d */
    const mp_limb_t * c, /* length d */
    slong d)
{
    slong i,j;

    for (i = 0; i + 1 < d; i++)
    {
        mp_limb_t t1 = 0, t0 = 0;
        mp_limb_t s1 = 0, s0 = 0;
        t0 = a[2*i + 0];
        t1 = a[2*i + 1];
        s0 = a[2*(2*d - 2 - i) + 0];
        s1 = a[2*(2*d - 2 - i) + 1];
        MAC2(t1, t0, b[i], c[0]);
        MAC2(s1, s0, b[d - 1], c[d - 1 - i]);
        for (j = 1; j <= i; j++)
        {
            MAC2(t1, t0, b[i - j], c[0 + j]);
            MAC2(s1, s0, b[d - 1 - j], c[d - 1 - i + j]);
        }
        a[2*i + 0] = t0;
        a[2*i + 1] = t1;
        a[2*(2*d - 2 - i) + 0] = s0;
        a[2*(2*d - 2 - i) + 1] = s1;
    }

    {
        mp_limb_t t1 = 0, t0 = 0;
        t0 = a[2*(d - 1) + 0];
        t1 = a[2*(d - 1) + 1];
        MAC2(t1, t0, b[d - 1], c[0]);
        for (j = 1; j < d; j++)
        {
            MAC2(t1, t0, b[d - 1 - j], c[0 + j]);
        }
        a[2*(d - 1) + 0] = t0;
        a[2*(d - 1) + 1] = t1;
    }
}


void _n_fq_mul2_lazy2(
    mp_limb_t * a,       /* length 6d */
    const mp_limb_t * b, /* length d */
    const mp_limb_t * c, /* length d */
    slong d)
{
    slong i,j;

    for (i = 0; i + 1 < d; i++)
    {
        mp_limb_t t1 = 0, t0 = 0;
        mp_limb_t s1 = 0, s0 = 0;
        umul_ppmm(t1, t0, b[i], c[0]);
        umul_ppmm(s1, s0, b[d - 1], c[d - 1 - i]);
        for (j = 1; j <= i; j++)
        {
            MAC2(t1, t0, b[i - j], c[0 + j]);
            MAC2(s1, s0, b[d - 1 - j], c[d - 1 - i + j]);
        }
        a[2*i + 0] = t0;
        a[2*i + 1] = t1;
        a[2*(2*d - 2 - i) + 0] = s0;
        a[2*(2*d - 2 - i) + 1] = s1;
    }

    {
        mp_limb_t t1 = 0, t0 = 0;
        umul_ppmm(t1, t0, b[d - 1], c[0]);
        for (j = 1; j < d; j++)
        {
            MAC2(t1, t0, b[d - 1 - j], c[0 + j]);
        }
        a[2*(d - 1) + 0] = t0;
        a[2*(d - 1) + 1] = t1;
    }
}


void _n_fq_reduce2_lazy3(
    mp_limb_t * a, /* length 6d */
    slong d,
    nmod_t ctx)
{
    slong i;
    for (i = 0; i < 2*d - 1; i++)
        NMOD_RED3(a[i], a[3*i + 2], a[3*i + 1], a[3*i + 0], ctx);
}

void _n_fq_madd2_lazy3(
    mp_limb_t * a,       /* length 6d */
    const mp_limb_t * b, /* length d */
    const mp_limb_t * c, /* length d */
    slong d)
{
    slong i,j;

    for (i = 0; i + 1 < d; i++)
    {
        mp_limb_t t2 = 0, t1 = 0, t0 = 0;
        mp_limb_t s2 = 0, s1 = 0, s0 = 0;
        t0 = a[3*i + 0];
        t1 = a[3*i + 1];
        t2 = a[3*i + 2];
        s0 = a[3*(2*d - 2 - i) + 0];
        s1 = a[3*(2*d - 2 - i) + 1];
        s2 = a[3*(2*d - 2 - i) + 2];
        MAC3(t2, t1, t0, b[i], c[0]);
        MAC3(s2, s1, s0, b[d - 1], c[d - 1 - i]);
        for (j = 1; j <= i; j++)
        {
            MAC3(t2, t1, t0, b[i - j], c[0 + j]);
            MAC3(s2, s1, s0, b[d - 1 - j], c[d - 1 - i + j]);
        }
        a[3*i + 0] = t0;
        a[3*i + 1] = t1;
        a[3*i + 2] = t2;
        a[3*(2*d - 2 - i) + 0] = s0;
        a[3*(2*d - 2 - i) + 1] = s1;
        a[3*(2*d - 2 - i) + 2] = s2;
    }

    {
        mp_limb_t t2 = 0, t1 = 0, t0 = 0;
        t0 = a[3*(d - 1) + 0];
        t1 = a[3*(d - 1) + 1];
        t2 = a[3*(d - 1) + 2];
        MAC3(t2, t1, t0, b[d - 1], c[0]);
        for (j = 1; j < d; j++)
        {
            MAC3(t2, t1, t0, b[d - 1 - j], c[0 + j]);
        }
        a[3*(d - 1) + 0] = t0;
        a[3*(d - 1) + 1] = t1;
        a[3*(d - 1) + 2] = t2;
    }
}


void _n_fq_mul2_lazy3(
    mp_limb_t * a,       /* length 6d */
    const mp_limb_t * b, /* length d */
    const mp_limb_t * c, /* length d */
    slong d)
{
    slong i,j;

    for (i = 0; i + 1 < d; i++)
    {
        mp_limb_t t2 = 0, t1 = 0, t0 = 0;
        mp_limb_t s2 = 0, s1 = 0, s0 = 0;
        umul_ppmm(t1, t0, b[i], c[0]);
        umul_ppmm(s1, s0, b[d - 1], c[d - 1 - i]);
        for (j = 1; j <= i; j++)
        {
            MAC3(t2, t1, t0, b[i - j], c[0 + j]);
            MAC3(s2, s1, s0, b[d - 1 - j], c[d - 1 - i + j]);
        }
        a[3*i + 0] = t0;
        a[3*i + 1] = t1;
        a[3*i + 2] = t2;
        a[3*(2*d - 2 - i) + 0] = s0;
        a[3*(2*d - 2 - i) + 1] = s1;
        a[3*(2*d - 2 - i) + 2] = s2;
    }

    {
        mp_limb_t t2 = 0, t1 = 0, t0 = 0;
        umul_ppmm(t1, t0, b[d - 1], c[0]);
        for (j = 1; j < d; j++)
        {
            MAC3(t2, t1, t0, b[d - 1 - j], c[0 + j]);
        }
        a[3*(d - 1) + 0] = t0;
        a[3*(d - 1) + 1] = t1;
        a[3*(d - 1) + 2] = t2;
    }
}

void _n_fq_inv(
    mp_limb_t * a,
    const mp_limb_t * b,
    const fq_nmod_ctx_t ctx,
    mp_limb_t * t) /* length d */
{
    slong d = ctx->modulus->length - 1;
    slong blen = d;
    FLINT_ASSERT(d > 0);

    while (blen > 0 && b[blen - 1] == 0)
        blen--;

    if (blen < 1)
    {
        flint_throw(FLINT_ERROR, "impossible inverse in _fq_nmod_inv");
    }
    else if (blen == 1)
    {
        a[0] = n_invmod(b[0], ctx->mod.n);
        _nmod_vec_zero(a + 1, d - 1);
    }
    else
    {
        if (1 != _nmod_poly_gcdinv(t, a, b, blen, ctx->modulus->coeffs, d + 1, ctx->mod))
        {
            flint_throw(FLINT_ERROR, "impossible inverse in _fq_nmod_inv");
        }

        if (t[0] != 1)
        {
            _nmod_vec_scalar_mul_nmod(a, a, d, n_invmod(t[0], ctx->mod.n), ctx->mod);
        }
    }
}

void n_fq_mul(
    mp_limb_t * a,
    const mp_limb_t * b,
    const mp_limb_t * c,
    const fq_nmod_ctx_t ctx)
{
    fq_nmod_t A, B, C;
    fq_nmod_init(A, ctx);
    fq_nmod_init(B, ctx);
    fq_nmod_init(C, ctx);
    n_fq_get_fq_nmod(B, b, ctx);
    n_fq_get_fq_nmod(C, c, ctx);
    fq_nmod_mul(A, B, C, ctx);
    n_fq_set_fq_nmod(a, A, ctx);
    fq_nmod_clear(A, ctx);
    fq_nmod_clear(B, ctx);
    fq_nmod_clear(C, ctx);
}

void n_fq_inv(
    mp_limb_t * a,
    const mp_limb_t * b,
    const fq_nmod_ctx_t ctx)
{
    fq_nmod_t A, B;
    fq_nmod_init(A, ctx);
    fq_nmod_init(B, ctx);
    n_fq_get_fq_nmod(B, b, ctx);
    fq_nmod_inv(A, B, ctx);
    n_fq_set_fq_nmod(a, A, ctx);
    fq_nmod_clear(A, ctx);
    fq_nmod_clear(B, ctx);
}

void _n_fq_pow_ui(
    mp_limb_t * a,
    const mp_limb_t * b,
    ulong e,
    const fq_nmod_ctx_t ctx)
{
    fq_nmod_t A, B;
    fq_nmod_init(A, ctx);
    fq_nmod_init(B, ctx);
    n_fq_get_fq_nmod(B, b, ctx);
    fq_nmod_pow_ui(A, B, e, ctx);
    n_fq_set_fq_nmod(a, A, ctx);
    fq_nmod_clear(A, ctx);
    fq_nmod_clear(B, ctx);
}

void n_fq_pow_ui(
    mp_limb_t * a,
    const mp_limb_t * b,
    ulong e,
    const fq_nmod_ctx_t ctx)
{
    fq_nmod_t A, B;
    fq_nmod_init(A, ctx);
    fq_nmod_init(B, ctx);
    n_fq_get_fq_nmod(B, b, ctx);
    fq_nmod_pow_ui(A, B, e, ctx);
    n_fq_set_fq_nmod(a, A, ctx);
    fq_nmod_clear(A, ctx);
    fq_nmod_clear(B, ctx);
}

int n_fq_is_canonical(
    const mp_limb_t * a,
    const fq_nmod_ctx_t ctx)
{
    slong i, d = fq_nmod_ctx_degree(ctx);
    for (i = 0; i < d; i++)
    {
        if (a[i] >= ctx->mod.n)
            return 0;
    }
    return 1;
}


#include "fq_nmod_mpoly.h"
#include "n_poly.h"

#undef _WIN32_WINNT
#define _WIN32_WINNT 0x0600
#include <conio.h>
#include <windows.h>

typedef struct {
    mp_limb_t * coeffs;
    ulong * exps;
    slong length;
    flint_bitcnt_t bits;    /* number of bits per exponent */
    slong coeffs_alloc;     /* abs size in mp_limb_t units */
    slong exps_alloc;       /* abs size in ulong units */
} new_fq_nmod_mpoly_struct;

typedef new_fq_nmod_mpoly_struct new_fq_nmod_mpoly_t[1];


void new_fq_nmod_mpoly_init(
    new_fq_nmod_mpoly_t A,
    const fq_nmod_mpoly_ctx_t ctx)
{
    A->coeffs = NULL;
    A->exps = NULL;
    A->length = 0;
    A->bits = MPOLY_MIN_BITS;
    A->coeffs_alloc = 0;
    A->exps_alloc = 0;
}

void new_fq_nmod_mpoly_clear(
    new_fq_nmod_mpoly_t A,
    const fq_nmod_mpoly_ctx_t ctx)
{
    if (A->coeffs_alloc > 0)
        flint_free(A->coeffs);

    if (A->exps_alloc > 0)
        flint_free(A->exps);
}

void new_fq_nmod_mpoly_swap(
    new_fq_nmod_mpoly_t A,
    new_fq_nmod_mpoly_t B,
    const fq_nmod_mpoly_ctx_t ctx)
{
    new_fq_nmod_mpoly_struct t = *A;
    *A = *B;
    *B = t;
}

int n_fq_fprint_pretty(
    FILE * file,
    const mp_limb_t * a,
    const fq_nmod_ctx_t ctx)
{
    slong d = fq_nmod_ctx_degree(ctx);
    slong i;
    int first;

    first = 1;
    for (i = d - 1; i >= 0; i--)
    {
        if (a[i] == 0)
            continue;

        if (!first)
            flint_fprintf(file, "+");

        first = 0;
        flint_fprintf(file, "%wu", a[i]);

        if (i > 0)
        {
            flint_fprintf(file, "*%s", ctx->var);
            if (i > 1)
                flint_fprintf(file, "^%wd", i);
        }
    }

    if (first)
        flint_fprintf(file, "0");

    return 1;
}

int new_fq_nmod_mpoly_fprint_pretty(
    FILE * file,
    const new_fq_nmod_mpoly_t A,
    const char ** x_in,
    const fq_nmod_mpoly_ctx_t ctx)
{
    slong d = fq_nmod_ctx_degree(ctx->fqctx);
    slong len = A->length;
    ulong * exp = A->exps;
    slong bits = A->bits;
    slong i, j, N;
    fmpz * exponents;
    int r = 0;
    char ** x = (char **) x_in;
    TMP_INIT;

    if (len == 0)
    {
        r = (EOF != fputc('0', file));
        return r;
    }

#define CHECK_r if (r <= 0) goto done;

    N = mpoly_words_per_exp(bits, ctx->minfo);

    TMP_START;

    if (x == NULL)
    {
        x = (char **) TMP_ALLOC(ctx->minfo->nvars*sizeof(char *));
        for (i = 0; i < ctx->minfo->nvars; i++)
        {
            x[i] = (char *) TMP_ALLOC(((FLINT_BITS+4)/3)*sizeof(char));
            flint_sprintf(x[i], "x%wd", i+1);
        }
    }

    exponents = (fmpz *) TMP_ALLOC(ctx->minfo->nvars*sizeof(ulong));
    for (i = 0; i < ctx->minfo->nvars; i++)
        fmpz_init(exponents + i);

    for (i = 0; i < len; i++)
    {
        if (i > 0)
        {
            r = flint_fprintf(file, " + ");
            CHECK_r
        }

        r = flint_fprintf(file, "(");
        CHECK_r
        r = n_fq_fprint_pretty(file, A->coeffs + d*i, ctx->fqctx);
        CHECK_r
        r = flint_fprintf(file, ")");
        CHECK_r

        mpoly_get_monomial_ffmpz(exponents, exp + N*i, bits, ctx->minfo);

        for (j = 0; j < ctx->minfo->nvars; j++)
        {
            int cmp = fmpz_cmp_ui(exponents + j, 1);

            if (cmp > 0)
            {
                r = flint_fprintf(file, "*%s^", x[j]);
                CHECK_r
                r = fmpz_fprint(file, exponents + j);
                CHECK_r
            }
            else if (cmp == 0)
            {
                r = flint_fprintf(file, "*%s", x[j]);
                CHECK_r
            }
        }
    }

done:

    for (i = 0; i < ctx->minfo->nvars; i++)
        fmpz_clear(exponents + i);

    TMP_END;

    return r;
}

void new_fq_nmod_mpoly_print_pretty(
    const new_fq_nmod_mpoly_t A,
    const char ** x,
    const fq_nmod_mpoly_ctx_t ctx)
{
    new_fq_nmod_mpoly_fprint_pretty(stdout, A, x, ctx);
    fflush(stdout);
}


void _new_fq_nmod_mpoly_fit_length(
    mp_limb_t ** coeffs,
    slong * coeffs_alloc,
    slong d,
    ulong ** exps,
    slong * exps_alloc,
    slong N,
    slong length)
{
    if (d*length > *coeffs_alloc)
    {
        *coeffs_alloc = FLINT_MAX(d*length, *coeffs_alloc*2);
        *coeffs = flint_realloc(*coeffs, *coeffs_alloc*sizeof(mp_limb_t));
    }

    if (N*length > *exps_alloc)
    {
        *exps_alloc = FLINT_MAX(N*length, *exps_alloc*2);
        *exps = flint_realloc(*exps, *exps_alloc*sizeof(ulong));
    }
}


void new_fq_nmod_mpoly_fit_length_set_bits(
    new_fq_nmod_mpoly_t A,
    slong length,
    slong bits,
    const fq_nmod_mpoly_ctx_t ctx)
{
    slong d = fq_nmod_ctx_degree(ctx->fqctx);
    slong N = mpoly_words_per_exp(bits, ctx->minfo);

    _new_fq_nmod_mpoly_fit_length(&A->coeffs, &A->coeffs_alloc, d,
                                  &A->exps, &A->exps_alloc, N, length);
    A->bits = bits;
}

void new_fq_nmod_mpoly_zero(
    new_fq_nmod_mpoly_t A,
    const fq_nmod_mpoly_ctx_t ctx)
{
    A->length = 0;
}

void new_fq_nmod_mpoly_set_fq_nmod_mpoly(
    new_fq_nmod_mpoly_t A,
    const fq_nmod_mpoly_t B,
    const fq_nmod_mpoly_ctx_t ctx)
{
    slong d = fq_nmod_ctx_degree(ctx->fqctx);
    slong N = mpoly_words_per_exp(B->bits, ctx->minfo);
    slong i;
    new_fq_nmod_mpoly_fit_length_set_bits(A, B->length, B->bits, ctx);
    mpoly_copy_monomials(A->exps, B->exps, B->length, N);
    for (i = 0; i < B->length; i++)
        n_fq_set_fq_nmod(A->coeffs + d*i, B->coeffs + i, ctx->fqctx);
    A->length = B->length;
}

void new_fq_nmod_mpoly_get_fq_nmod_mpoly(
    fq_nmod_mpoly_t A,
    const new_fq_nmod_mpoly_t B,
    const fq_nmod_mpoly_ctx_t ctx)
{
    slong d = fq_nmod_ctx_degree(ctx->fqctx);
    slong N = mpoly_words_per_exp(B->bits, ctx->minfo);
    slong i;
    fq_nmod_mpoly_fit_bits(A, B->bits, ctx);
    A->bits = B->bits;
    fq_nmod_mpoly_fit_length(A, B->length, ctx);
    mpoly_copy_monomials(A->exps, B->exps, B->length, N);
    for (i = 0; i < B->length; i++)
        n_fq_get_fq_nmod(A->coeffs + i, B->coeffs + d*i, ctx->fqctx);
    A->length = B->length;
}


int new_fq_nmod_mpoly_equal(
    const new_fq_nmod_mpoly_t A,
    const new_fq_nmod_mpoly_t B,
    const fq_nmod_mpoly_ctx_t ctx)
{
    slong d = fq_nmod_ctx_degree(ctx->fqctx);

    if (A == B)
        return 1;

    if (A->length != B->length)
        return 0;

    if (!_nmod_vec_equal(A->coeffs, B->coeffs, d*B->length))
        return 0;

    return !mpoly_monomials_cmp(A->exps, A->bits, B->exps, B->bits,
                                                        B->length, ctx->minfo);
}


void _new_fq_nmod_mpoly_mul_johnson1(
    new_fq_nmod_mpoly_t A,
    const mp_limb_t * Bcoeffs,
    const ulong * Bexps,
    slong Blen,
    const mp_limb_t * Ccoeffs,
    const ulong * Cexps,
    slong Clen,
    ulong maskhi,
    const fq_nmod_ctx_t ctx)
{
    slong d = fq_nmod_ctx_degree(ctx);
    slong i, j;
    slong next_loc;
    slong heap_len = 2; /* heap zero index unused */
    mpoly_heap1_s * heap;
    mpoly_heap_t * chain;
    slong * store, * store_base;
    mpoly_heap_t * x;
    slong * hind;
    ulong exp;
    mp_limb_t * t;
    int lazy_size = _n_fq_dot_lazy_size(Blen, ctx);
    mp_limb_t * Acoeffs = A->coeffs;
    ulong * Aexps = A->exps;
    slong Acoeffs_alloc = A->coeffs_alloc;
    slong Aexps_alloc = A->exps_alloc;
    slong Alen;
    TMP_INIT;

    TMP_START;

    next_loc = Blen + 4;   /* something bigger than heap can ever be */
    heap = (mpoly_heap1_s *) TMP_ALLOC((Blen + 1)*sizeof(mpoly_heap1_s));
    chain = (mpoly_heap_t *) TMP_ALLOC(Blen*sizeof(mpoly_heap_t));
    store = store_base = (slong *) TMP_ALLOC(2*Blen*sizeof(slong));
    hind = (slong *) TMP_ALLOC(Blen*sizeof(slong));
    t = (mp_limb_t *) TMP_ALLOC(6*d*sizeof(mp_limb_t));

    for (i = 0; i < Blen; i++)
        hind[i] = 1;

    /* put (0, 0, exp2[0] + exp3[0]) on heap */
    x = chain + 0;
    x->i = 0;
    x->j = 0;
    x->next = NULL;

    HEAP_ASSIGN(heap[1], Bexps[0] + Cexps[0], x);
    hind[0] = 2*1 + 0;

    Alen = 0;
    while (heap_len > 1)
    {
        exp = heap[1].exp;

        _new_fq_nmod_mpoly_fit_length(&Acoeffs, &Acoeffs_alloc, d,
                                      &Aexps, &Aexps_alloc, 1, Alen + 1);

        Aexps[Alen] = exp;

        _nmod_vec_zero(t, 6*d);

        switch (lazy_size)
        {
        case 1:
            do {
                x = _mpoly_heap_pop1(heap, &heap_len, maskhi);
                do {
                    hind[x->i] |= WORD(1);
                    *store++ = x->i;
                    *store++ = x->j;
                    _n_fq_madd2_lazy1(t, Bcoeffs + d*x->i, Ccoeffs + d*x->j, d);
                } while ((x = x->next) != NULL);
            } while (heap_len > 1 && heap[1].exp == exp);
            _n_fq_reduce2_lazy1(t, d, ctx->mod);
            break;

        case 2:
            do {
                x = _mpoly_heap_pop1(heap, &heap_len, maskhi);
                do {
                    hind[x->i] |= WORD(1);
                    *store++ = x->i;
                    *store++ = x->j;
                    _n_fq_madd2_lazy2(t, Bcoeffs + d*x->i, Ccoeffs + d*x->j, d);
                } while ((x = x->next) != NULL);
            } while (heap_len > 1 && heap[1].exp == exp);
            _n_fq_reduce2_lazy2(t, d, ctx->mod);
            break;

        case 3:
            do {
                x = _mpoly_heap_pop1(heap, &heap_len, maskhi);
                do {
                    hind[x->i] |= WORD(1);
                    *store++ = x->i;
                    *store++ = x->j;
                    _n_fq_madd2_lazy3(t, Bcoeffs + d*x->i, Ccoeffs + d*x->j, d);
                } while ((x = x->next) != NULL);
            } while (heap_len > 1 && heap[1].exp == exp);
            _n_fq_reduce2_lazy3(t, d, ctx->mod);
            break;

        default:
            do {
                x = _mpoly_heap_pop1(heap, &heap_len, maskhi);
                do {
                    hind[x->i] |= WORD(1);
                    *store++ = x->i;
                    *store++ = x->j;
                    _n_fq_madd2(t, Bcoeffs + d*x->i,
                                   Ccoeffs + d*x->j, ctx, t + 2*d);
                } while ((x = x->next) != NULL);
            } while (heap_len > 1 && heap[1].exp == exp);
            break;
        }

        _n_fq_reduce2(Acoeffs + d*Alen, t, ctx, t + 2*d);
        Alen += !_n_fq_is_zero(Acoeffs + d*Alen, d);

        while (store > store_base)
        {
            j = *--store;
            i = *--store;

            /* should we go right? */
            if ((i + 1 < Blen) &&
                (hind[i + 1] == 2*j + 1)
               )
            {
                x = chain + i + 1;
                x->i = i + 1;
                x->j = j;
                x->next = NULL;

                hind[x->i] = 2*(x->j + 1) + 0;
                _mpoly_heap_insert1(heap, Bexps[x->i] + Cexps[x->j], x,
                                                 &next_loc, &heap_len, maskhi);
            }

            /* should we go up? */
            if ((j + 1 < Clen) &&
                ((hind[i] & 1) == 1) &&
                ((i == 0) || (hind[i - 1] >= 2*(j + 2) + 1))
               )
            {
                x = chain + i;
                x->i = i;
                x->j = j + 1;
                x->next = NULL;

                hind[x->i] = 2*(x->j + 1) + 0;
                _mpoly_heap_insert1(heap, Bexps[x->i] + Cexps[x->j], x,
                                                 &next_loc, &heap_len, maskhi);
            }
        }
    }

    A->coeffs = Acoeffs;
    A->exps = Aexps;
    A->coeffs_alloc = Acoeffs_alloc;
    A->exps_alloc = Aexps_alloc;
    A->length = Alen;

    TMP_END;
}


void _new_fq_nmod_mpoly_mul_johnson(
    new_fq_nmod_mpoly_t A,
    const mp_limb_t * Bcoeffs,
    const ulong * Bexps,
    slong Blen,
    const mp_limb_t * Ccoeffs,
    const ulong * Cexps,
    slong Clen,
    flint_bitcnt_t bits,
    slong N,
    const ulong * cmpmask,
    const fq_nmod_ctx_t ctx)
{
    slong d = fq_nmod_ctx_degree(ctx);
    slong i, j;
    slong next_loc;
    slong heap_len = 2; /* heap zero index unused */
    mpoly_heap_s * heap;
    mpoly_heap_t * chain;
    slong * store, * store_base;
    mpoly_heap_t * x;
    ulong * exp, * exps;
    ulong ** exp_list;
    slong exp_next;
    slong * hind;
    mp_limb_t * t;
    int lazy_size = _n_fq_dot_lazy_size(Blen, ctx);
    mp_limb_t * Acoeffs = A->coeffs;
    ulong * Aexps = A->exps;
    slong Acoeffs_alloc = A->coeffs_alloc;
    slong Aexps_alloc = A->exps_alloc;
    slong Alen;
    TMP_INIT;

    FLINT_ASSERT(Blen > 0);
    FLINT_ASSERT(Clen > 0);
    FLINT_ASSERT(A->bits == bits);

    if (N == 1)
    {
        _new_fq_nmod_mpoly_mul_johnson1(A, Bcoeffs, Bexps, Blen,
                                        Ccoeffs, Cexps, Clen, cmpmask[0], ctx);
        return;
    }

    TMP_START;

    next_loc = Blen + 4;   /* something bigger than heap can ever be */
    heap = (mpoly_heap_s *) TMP_ALLOC((Blen + 1)*sizeof(mpoly_heap_s));
    chain = (mpoly_heap_t *) TMP_ALLOC(Blen*sizeof(mpoly_heap_t));
    store = store_base = (slong *) TMP_ALLOC(2*Blen*sizeof(slong));
    exps = (ulong *) TMP_ALLOC(Blen*N*sizeof(ulong));
    exp_list = (ulong **) TMP_ALLOC(Blen*sizeof(ulong *));
    hind = (slong *) TMP_ALLOC(Blen*sizeof(slong));
    t = (mp_limb_t *) TMP_ALLOC(6*d*sizeof(mp_limb_t));

    for (i = 0; i < Blen; i++)
    {
        exp_list[i] = exps + i*N;
        hind[i] = 1;
    }

    /* start with no heap nodes and no exponent vectors in use */
    exp_next = 0;

    /* put (0, 0, exp2[0] + exp3[0]) on heap */
    x = chain + 0;
    x->i = 0;
    x->j = 0;
    x->next = NULL;

    heap[1].next = x;
    heap[1].exp = exp_list[exp_next++];

    if (bits <= FLINT_BITS)
        mpoly_monomial_add(heap[1].exp, Bexps + N*0, Cexps + N*0, N);
    else
        mpoly_monomial_add_mp(heap[1].exp, Bexps + N*0, Cexps + N*0, N);

    hind[0] = 2*1 + 0;

    Alen = 0;
    while (heap_len > 1)
    {
        exp = heap[1].exp;

        _new_fq_nmod_mpoly_fit_length(&Acoeffs, &Acoeffs_alloc, d,
                                      &Aexps, &Aexps_alloc, N, Alen + 1);

        mpoly_monomial_set(Aexps + N*Alen, exp, N);

        _nmod_vec_zero(t, 6*d);

        switch (lazy_size)
        {
        case 1:
            do {
                exp_list[--exp_next] = heap[1].exp;
                x = _mpoly_heap_pop(heap, &heap_len, N, cmpmask);
                do {
                    hind[x->i] |= WORD(1);
                    *store++ = x->i;
                    *store++ = x->j;
                    _n_fq_madd2_lazy1(t, Bcoeffs + d*x->i, Ccoeffs + d*x->j, d);
                } while ((x = x->next) != NULL);
            } while (heap_len > 1 && mpoly_monomial_equal(heap[1].exp, exp, N));
            _n_fq_reduce2_lazy1(t, d, ctx->mod);
            break;

        case 2:
            do {
                exp_list[--exp_next] = heap[1].exp;
                x = _mpoly_heap_pop(heap, &heap_len, N, cmpmask);
                do {
                    hind[x->i] |= WORD(1);
                    *store++ = x->i;
                    *store++ = x->j;
                    _n_fq_madd2_lazy2(t, Bcoeffs + d*x->i, Ccoeffs + d*x->j, d);
                } while ((x = x->next) != NULL);
            } while (heap_len > 1 && mpoly_monomial_equal(heap[1].exp, exp, N));
            _n_fq_reduce2_lazy2(t, d, ctx->mod);
            break;

        case 3:
            do {
                exp_list[--exp_next] = heap[1].exp;
                x = _mpoly_heap_pop(heap, &heap_len, N, cmpmask);
                do {
                    hind[x->i] |= WORD(1);
                    *store++ = x->i;
                    *store++ = x->j;
                    _n_fq_madd2_lazy3(t, Bcoeffs + d*x->i, Ccoeffs + d*x->j, d);
                } while ((x = x->next) != NULL);
            } while (heap_len > 1 && mpoly_monomial_equal(heap[1].exp, exp, N));
            _n_fq_reduce2_lazy3(t, d, ctx->mod);
            break;

        default:
            do {
                exp_list[--exp_next] = heap[1].exp;
                x = _mpoly_heap_pop(heap, &heap_len, N, cmpmask);
                do {
                    hind[x->i] |= WORD(1);
                    *store++ = x->i;
                    *store++ = x->j;
                    _n_fq_madd2(t, Bcoeffs + d*x->i,
                                   Ccoeffs + d*x->j, ctx, t + 2*d);
                } while ((x = x->next) != NULL);
            } while (heap_len > 1 && mpoly_monomial_equal(heap[1].exp, exp, N));
            break;
        }

        _n_fq_reduce2(Acoeffs + d*Alen, t, ctx, t + 2*d);
        Alen += !_n_fq_is_zero(Acoeffs + d*Alen, d);

        while (store > store_base)
        {
            j = *--store;
            i = *--store;

            /* should we go right? */
            if ((i + 1 < Blen) &&
                (hind[i + 1] == 2*j + 1)
               )
            {
                x = chain + i + 1;
                x->i = i + 1;
                x->j = j;
                x->next = NULL;

                hind[x->i] = 2*(x->j + 1) + 0;

                if (bits <= FLINT_BITS)
                    mpoly_monomial_add(exp_list[exp_next], Bexps + x->i*N,
                                                           Cexps + x->j*N, N);
                else
                    mpoly_monomial_add_mp(exp_list[exp_next], Bexps + x->i*N,
                                                              Cexps + x->j*N, N);

                exp_next += _mpoly_heap_insert(heap, exp_list[exp_next], x,
                                             &next_loc, &heap_len, N, cmpmask);
            }

            /* should we go up? */
            if ((j + 1 < Clen) &&
                ((hind[i] & 1) == 1) &&
                ((i == 0) || (hind[i - 1] >= 2*(j + 2) + 1))
               )
            {
                x = chain + i;
                x->i = i;
                x->j = j + 1;
                x->next = NULL;

                hind[x->i] = 2*(x->j + 1) + 0;

                if (bits <= FLINT_BITS)
                    mpoly_monomial_add(exp_list[exp_next], Bexps + x->i*N,
                                                           Cexps + x->j*N, N);
                else
                    mpoly_monomial_add_mp(exp_list[exp_next], Bexps + x->i*N,
                                                              Cexps + x->j*N, N);

                exp_next += _mpoly_heap_insert(heap, exp_list[exp_next], x,
                                             &next_loc, &heap_len, N, cmpmask);
            }
        }
    }

    A->coeffs = Acoeffs;
    A->exps = Aexps;
    A->coeffs_alloc = Acoeffs_alloc;
    A->exps_alloc = Aexps_alloc;
    A->length = Alen;

    TMP_END;
}

void new_fq_nmod_mpoly_mul_johnson(
    new_fq_nmod_mpoly_t A,
    const new_fq_nmod_mpoly_t B,
    const new_fq_nmod_mpoly_t C,
    const fq_nmod_mpoly_ctx_t ctx)
{
    slong i, N, Alen;
    flint_bitcnt_t Abits;
    fmpz * Bmaxfields, * Cmaxfields;
    ulong * cmpmask;
    ulong * Bexps = B->exps, * Cexps = C->exps;
    int freeBexps = 0, freeCexps = 0;
    new_fq_nmod_mpoly_struct * P, T[1];
    TMP_INIT;

    if (B->length < 1 || C->length < 1)
    {
        new_fq_nmod_mpoly_zero(A, ctx);
        return;
    }

    TMP_START;

    Bmaxfields = (fmpz *) TMP_ALLOC(ctx->minfo->nfields*sizeof(fmpz));
    Cmaxfields = (fmpz *) TMP_ALLOC(ctx->minfo->nfields*sizeof(fmpz));
    for (i = 0; i < ctx->minfo->nfields; i++)
    {
        fmpz_init(Bmaxfields + i);
        fmpz_init(Cmaxfields + i);
    }
    mpoly_max_fields_fmpz(Bmaxfields, Bexps, B->length, B->bits, ctx->minfo);
    mpoly_max_fields_fmpz(Cmaxfields, Cexps, C->length, C->bits, ctx->minfo);
    _fmpz_vec_add(Bmaxfields, Bmaxfields, Cmaxfields, ctx->minfo->nfields);

    Abits = _fmpz_vec_max_bits(Bmaxfields, ctx->minfo->nfields);
    Abits = FLINT_MAX(MPOLY_MIN_BITS, Abits + 1);
    Abits = FLINT_MAX(Abits, B->bits);
    Abits = FLINT_MAX(Abits, C->bits);
    Abits = mpoly_fix_bits(Abits, ctx->minfo);

    for (i = 0; i < ctx->minfo->nfields; i++)
    {
        fmpz_clear(Bmaxfields + i);
        fmpz_clear(Cmaxfields + i);
    }

    N = mpoly_words_per_exp(Abits, ctx->minfo);
    cmpmask = (ulong *) TMP_ALLOC(N*sizeof(ulong));
    mpoly_get_cmpmask(cmpmask, N, Abits, ctx->minfo);

    /* ensure input exponents are packed into same sized fields as output */
    if (Abits != B->bits)
    {
        freeBexps = 1;
        Bexps = (ulong *) flint_malloc(N*B->length*sizeof(ulong));
        mpoly_repack_monomials(Bexps, Abits, B->exps, B->bits, B->length, ctx->minfo);
    }

    if (Abits != C->bits)
    {
        freeCexps = 1;
        Cexps = (ulong *) flint_malloc(N*C->length*sizeof(ulong));
        mpoly_repack_monomials(Cexps, Abits, C->exps, C->bits, C->length, ctx->minfo);
    }

    if (A == B || A == C)
    {
        new_fq_nmod_mpoly_init(T, ctx);
        P = T;
    }
    else
    {
        P = A;
    }

    new_fq_nmod_mpoly_fit_length_set_bits(P, B->length + C->length, Abits, ctx);

    if (B->length > C->length)
    {
        _new_fq_nmod_mpoly_mul_johnson(P, C->coeffs, Cexps, C->length,
                   B->coeffs, Bexps, B->length, Abits, N, cmpmask, ctx->fqctx);
    }
    else
    {
        _new_fq_nmod_mpoly_mul_johnson(P, B->coeffs, Bexps, B->length,
                   C->coeffs, Cexps, C->length, Abits, N, cmpmask, ctx->fqctx);
    }

    if (A == B || A == C)
    {
        new_fq_nmod_mpoly_swap(A, T, ctx);
        new_fq_nmod_mpoly_clear(T, ctx);
    }

    if (freeBexps)
        flint_free(Bexps);

    if (freeCexps)
        flint_free(Cexps);

    TMP_END;
}


void fq_nmod_mpoly_mul(fq_nmod_mpoly_t A, const fq_nmod_mpoly_t B,
                        const fq_nmod_mpoly_t C, const fq_nmod_mpoly_ctx_t ctx)
{
    new_fq_nmod_mpoly_t a, b, c, d;
ulong t1, t2;

    new_fq_nmod_mpoly_init(a, ctx);
    new_fq_nmod_mpoly_init(b, ctx);
    new_fq_nmod_mpoly_init(c, ctx);
    new_fq_nmod_mpoly_init(d, ctx);

    new_fq_nmod_mpoly_set_fq_nmod_mpoly(a, A, ctx);
    new_fq_nmod_mpoly_set_fq_nmod_mpoly(b, B, ctx);
    new_fq_nmod_mpoly_set_fq_nmod_mpoly(c, C, ctx);

t1 = GetTickCount64();
    new_fq_nmod_mpoly_mul_johnson(a, b, c, ctx);
t1 = GetTickCount64() - t1;

t2 = GetTickCount64();
    fq_nmod_mpoly_mul_johnson(A, B, C, ctx);
t2 = GetTickCount64() - t2;

flint_printf("new: %wu  old: %wu  ratio: %f\n", t1, t2, (double) t2 / (double) t1);

    new_fq_nmod_mpoly_set_fq_nmod_mpoly(d, A, ctx);

    if (!new_fq_nmod_mpoly_equal(a, d, ctx))
    {
        flint_printf("oops\n");
        flint_abort();
    }

    new_fq_nmod_mpoly_clear(a, ctx);
    new_fq_nmod_mpoly_clear(b, ctx);
    new_fq_nmod_mpoly_clear(c, ctx);
    new_fq_nmod_mpoly_clear(d, ctx);
}