polkovnikov/stackoverflow_04301434_ecm.cpp Secret

## stackoverflow_04301434_ecm.cpp
#include <cstdint>
#include <random>
#include <stdexcept>
#include <type_traits>
#include <iomanip>
#include <iostream>
#include <string>
#include <chrono>
#include <cmath>
#include <sstream>
#include <chrono>
#include <atomic>
#include <set>
#include <algorithm>
#include <vector>
#include <mutex>
#include <future>
#include <thread>

#include <boost/multiprecision/cpp_int.hpp>
#define SUPPORT_GMP 1
#if SUPPORT_GMP
    #include <gmpxx.h>
#endif

#define ASSERT_MSG(cond, msg) { if (!(cond)) throw std::runtime_error("Assertion (" #cond ") failed at line " + std::to_string(__LINE__) + "! Msg '" + std::string(msg) + "'."); }
#define ASSERT(cond) ASSERT_MSG(cond, "")
#define COUT(code) { std::unique_lock<std::mutex> lock(cout_mux); std::cout code; std::cout << std::flush; }
#define LN { COUT(<< "LN " << __LINE__ << std::endl); }
#define DUMP(x) { COUT(<< " " << (#x) << " = " << (x) << std::endl); }
#define bisizeof(x) (sizeof(x) * 8)
#define BiSize(T) (BiSizeS<T>::value)

using u8  = uint8_t;
using u32 = uint32_t;
using i32 = int32_t;
using u64 = uint64_t;
using i64 = int64_t;

using u128 = boost::multiprecision::uint128_t;
using i128 = boost::multiprecision::number<boost::multiprecision::cpp_int_backend<128, 128, boost::multiprecision::signed_magnitude, boost::multiprecision::unchecked, void>>;
using u192 = boost::multiprecision::number<boost::multiprecision::cpp_int_backend<192, 192, boost::multiprecision::unsigned_magnitude, boost::multiprecision::unchecked, void>>;
using u256 = boost::multiprecision::uint256_t;
using i256 = boost::multiprecision::number<boost::multiprecision::cpp_int_backend<256, 256, boost::multiprecision::signed_magnitude, boost::multiprecision::unchecked, void>>;
using u384 = boost::multiprecision::number<boost::multiprecision::cpp_int_backend<384, 384, boost::multiprecision::unsigned_magnitude, boost::multiprecision::unchecked, void>>;
using u512 = boost::multiprecision::uint512_t;
using i512 = boost::multiprecision::number<boost::multiprecision::cpp_int_backend<512, 512, boost::multiprecision::signed_magnitude, boost::multiprecision::unchecked, void>>;
using u768 = boost::multiprecision::number<boost::multiprecision::cpp_int_backend<768, 768, boost::multiprecision::unsigned_magnitude, boost::multiprecision::unchecked, void>>;
using u1024 = boost::multiprecision::uint1024_t;
using i1024 = boost::multiprecision::number<boost::multiprecision::cpp_int_backend<1024, 1024, boost::multiprecision::signed_magnitude, boost::multiprecision::unchecked, void>>;
using u1536 = boost::multiprecision::number<boost::multiprecision::cpp_int_backend<1536, 1536, boost::multiprecision::unsigned_magnitude, boost::multiprecision::unchecked, void>>;
using u2048 = boost::multiprecision::number<boost::multiprecision::cpp_int_backend<2048, 2048, boost::multiprecision::unsigned_magnitude, boost::multiprecision::unchecked, void>>;
using i2048 = boost::multiprecision::number<boost::multiprecision::cpp_int_backend<2048, 2048, boost::multiprecision::signed_magnitude, boost::multiprecision::unchecked, void>>;
using u3072 = boost::multiprecision::number<boost::multiprecision::cpp_int_backend<3072, 3072, boost::multiprecision::unsigned_magnitude, boost::multiprecision::unchecked, void>>;
using u4096 = boost::multiprecision::number<boost::multiprecision::cpp_int_backend<4096, 4096, boost::multiprecision::unsigned_magnitude, boost::multiprecision::unchecked, void>>;
using i4096 = boost::multiprecision::number<boost::multiprecision::cpp_int_backend<4096, 4096, boost::multiprecision::signed_magnitude, boost::multiprecision::unchecked, void>>;
using u6144 = boost::multiprecision::number<boost::multiprecision::cpp_int_backend<6144, 6144, boost::multiprecision::unsigned_magnitude, boost::multiprecision::unchecked, void>>;
using u8192 = boost::multiprecision::number<boost::multiprecision::cpp_int_backend<8192, 8192, boost::multiprecision::unsigned_magnitude, boost::multiprecision::unchecked, void>>;
using i8192 = boost::multiprecision::number<boost::multiprecision::cpp_int_backend<8192, 8192, boost::multiprecision::signed_magnitude, boost::multiprecision::unchecked, void>>;
using u12288 = boost::multiprecision::number<boost::multiprecision::cpp_int_backend<12288, 12288, boost::multiprecision::unsigned_magnitude, boost::multiprecision::unchecked, void>>;
using u16384 = boost::multiprecision::number<boost::multiprecision::cpp_int_backend<16384, 16384, boost::multiprecision::unsigned_magnitude, boost::multiprecision::unchecked, void>>;
using i16384 = boost::multiprecision::number<boost::multiprecision::cpp_int_backend<16384, 16384, boost::multiprecision::signed_magnitude, boost::multiprecision::unchecked, void>>;
using u24576 = boost::multiprecision::number<boost::multiprecision::cpp_int_backend<24576, 24576, boost::multiprecision::unsigned_magnitude, boost::multiprecision::unchecked, void>>;

template <typename T> struct SWordOf;
template <> struct SWordOf<u32> { using type = i32; };
template <> struct SWordOf<u64> { using type = i64; };
template <> struct SWordOf<u128> { using type = i128; };
template <> struct SWordOf<u256> { using type = i256; };
template <> struct SWordOf<u512> { using type = i512; };
template <> struct SWordOf<u1024> { using type = i1024; };
template <> struct SWordOf<u2048> { using type = i2048; };
template <> struct SWordOf<u4096> { using type = i4096; };
template <> struct SWordOf<u8192> { using type = i8192; };
template <> struct SWordOf<u16384> { using type = i16384; };
#if SUPPORT_GMP
    template <> struct SWordOf<mpz_class> { using type = mpz_class; };
#endif

template <typename T> struct DWordOf;
template <> struct DWordOf<u32> { using type = u64; };
template <> struct DWordOf<u64> { using type = u128; };
template <> struct DWordOf<u128> { using type = u256; };
template <> struct DWordOf<u256> { using type = u512; };
template <> struct DWordOf<u512> { using type = u1024; };
template <> struct DWordOf<u1024> { using type = u2048; };
template <> struct DWordOf<u2048> { using type = u4096; };
template <> struct DWordOf<u4096> { using type = u8192; };
template <> struct DWordOf<u8192> { using type = u16384; };
#if SUPPORT_GMP
    template <> struct DWordOf<mpz_class> { using type = mpz_class; };
#endif

template <typename T> struct TWordOf;
template <> struct TWordOf<u64> { using type = u192; };
template <> struct TWordOf<u128> { using type = u384; };
template <> struct TWordOf<u256> { using type = u768; };
template <> struct TWordOf<u512> { using type = u1536; };
template <> struct TWordOf<u1024> { using type = u3072; };
template <> struct TWordOf<u2048> { using type = u6144; };
template <> struct TWordOf<u4096> { using type = u12288; };
template <> struct TWordOf<u8192> { using type = u24576; };
#if SUPPORT_GMP
    template <> struct TWordOf<mpz_class> { using type = mpz_class; };
#endif

template <typename T> struct QWordOf;
template <> struct QWordOf<u32> { using type = u128; };
template <> struct QWordOf<u64> { using type = u256; };
template <> struct QWordOf<u128> { using type = u512; };
template <> struct QWordOf<u256> { using type = u1024; };
template <> struct QWordOf<u512> { using type = u2048; };
template <> struct QWordOf<u1024> { using type = u4096; };
template <> struct QWordOf<u2048> { using type = u8192; };
template <> struct QWordOf<u4096> { using type = u16384; };
#if SUPPORT_GMP
    template <> struct QWordOf<mpz_class> { using type = mpz_class; };
#endif

template <typename T> struct BiSizeS; //: std::integral_constant<size_t, bisizeof(T)> {};
template <> struct BiSizeS<u32> : std::integral_constant<size_t, 32> {};
template <> struct BiSizeS<u64> : std::integral_constant<size_t, 64> {};
template <> struct BiSizeS<u128> : std::integral_constant<size_t, 128> {};
template <> struct BiSizeS<u256> : std::integral_constant<size_t, 256> {};
template <> struct BiSizeS<u512> : std::integral_constant<size_t, 512> {};
template <> struct BiSizeS<u1024> : std::integral_constant<size_t, 1024> {};
template <> struct BiSizeS<u2048> : std::integral_constant<size_t, 2048> {};
template <> struct BiSizeS<u4096> : std::integral_constant<size_t, 4096> {};
template <> struct BiSizeS<u8192> : std::integral_constant<size_t, 8192> {};
template <> struct BiSizeS<u16384> : std::integral_constant<size_t, 16384> {};

std::mutex cout_mux;

class AtomicMutex {
public:
    auto Locker() { return std::lock_guard<AtomicMutex>(*this); }
    auto & Flag() { return f_; }

    void lock() { while (f_.test_and_set(std::memory_order_acquire)) {} }
    void unlock() { f_.clear(std::memory_order_release); }
private:
    std::atomic_flag f_ = ATOMIC_FLAG_INIT;
};

template <typename T>
u32 ToU32(T const & x) {
    #if SUPPORT_GMP
        if constexpr(std::is_same_v<std::decay_t<T>, mpz_class>)
            return x.get_ui();
        else
    #endif
        return u32(x);
}

template <typename T>
double ToDouble(T const & x) {
    #if SUPPORT_GMP
        if constexpr(std::is_same_v<std::decay_t<T>, mpz_class>)
            return x.get_d();
        else
    #endif
        return double(x);
}

template <typename T>
std::string IntToStr(T n) {
    if (n == 0)
        return "0";
    std::string r;
    while (n != 0) {
        u32 constexpr mod = 1'000'000'000U;
        std::ostringstream ss;
        ss << ToU32<T>(n % mod);
        r = ss.str() + r;
        n /= mod;
    }
    return r;
}

static auto & RNG() {
    thread_local std::mt19937_64 rng{123}; //{(u64(std::random_device{}()) << 32) ^ u64(std::random_device{}())};
    return rng;
}

template <typename T>
static T RandBits(size_t bisize) {
    T r = 0;
    for (size_t i = 0; i < bisize; i += 32) {
        size_t const cbits = std::min<size_t>(32, bisize - i);
        r <<= cbits;
        r ^= u32(RNG()()) >> (32 - cbits);
    }
    return r;
}

template <typename T>
static size_t NumBits(T n) {
    size_t cnt = 0;
    while (n > 0xFF) {
        n >>= 8;
        cnt += 8;
    }
    while (n != 0) {
        n >>= 1;
        ++cnt;
    }
    return cnt;
}

template <typename T>
static T RandRange(T end) {
    if (end <= 1)
        return 0;
    size_t const bits = NumBits<T>(end - 1);
    while (true) {
        auto const n = RandBits<T>(bits);
        if (n < end)
            return n;
    }
}

template <typename T>
struct UniformIntDistr {
    UniformIntDistr(T a, T b) : a_(a), diff_(b + 1 - a) { ASSERT(a <= b); }
    template <typename R>
    T operator()(R & r) const { return a_ + RandRange<T>(diff_); }
    T a_ = 0, diff_ = 0;
};

template <typename T>
static T PowMod(T a, T b, T const & c) {
    // https://en.wikipedia.org/wiki/Modular_exponentiation
    using DT = typename DWordOf<T>::type;
    T r = 1;
    while (b != 0) {
        if (ToU32<T>(b & 1))
            r = T((DT(r) * a) % c);
        a = T((DT(a) * a) % c);
        b >>= 1;
    }
    return r;
}

static double Time() {
    static auto const gtb = std::chrono::high_resolution_clock::now();
    return std::chrono::duration_cast<std::chrono::duration<double>>(
        std::chrono::high_resolution_clock::now() - gtb).count();
}

template <typename Word>
class ECPoint {
public:
    using SWord = typename SWordOf<Word>::type;
    using DWord = typename DWordOf<Word>::type;
    using TWord = typename TWordOf<Word>::type;

    static bool constexpr use_barrett = false, is_mpz = std::is_same_v<Word, mpz_class>;

    class InvError : public std::runtime_error {
    public:
        InvError(Word const & _gcd, Word const & _x, Word const & _mod)
            : std::runtime_error("(gcd " + IntToStr(_gcd) + ", x " + IntToStr(_x) +
                ", mod " + IntToStr(_mod) + ")"), gcd(_gcd), x(_x), mod(_mod) {}
        Word gcd = 0, x = 0, mod = 0;
    };

    template <typename T>
    static T Rand() {
        T r = 0;
        for (size_t i = 0; i < BiSize(T); i += 64)
            r ^= T(RNG()()) << i;
        return r;
    }

    template <typename T>
    static T RandUni(T a, T b) {
        return UniformIntDistr<T>(a, b)(RNG());
    }

    static Word rand_range(Word const & begin, Word const & end) {
        ASSERT(begin < end);
        return UniformIntDistr<Word>(begin, end - 1)(RNG());
    }

    static bool fermat_prp(Word const & n, size_t trials = 32) {
        // https://en.wikipedia.org/wiki/Fermat_primality_test
        if (n <= 16)
            return n == 2 || n == 3 || n == 5 || n == 7 || n == 11 || n == 13;
        for (size_t i = 0; i < trials; ++i)
            if (PowMod<Word>(rand_range(2, n - 2), n - 1, n) != 1)
                return false;
        return true;
    }

    static Word rand_prime_range(Word begin, Word end) {
        while (true) {
            Word const p = rand_range(begin, end) | 1;
            if (fermat_prp(p))
                return p;
        }
    }

    static Word rand_prime(size_t bits) {
        return rand_prime_range(Word(1) << (bits - 1), Word((DWord(1) << bits) - 1));
    }

    std::tuple<Word, size_t> BarrettRS(Word n) {
        size_t constexpr extra = 3;
        for (size_t k = 0; k < BiSize(DWord); ++k) {
            if (2 * (k + extra) < BiSize(Word))
                continue;
            if ((DWord(1) << k) <= DWord(n))
                continue;
            k += extra;
            ASSERT_MSG(2 * k < BiSize(DWord), "k " + std::to_string(k) + " BiSize(DWord) " + std::to_string(BiSize(DWord)));
            DWord r = (DWord(1) << (2 * k)) / n;
            ASSERT_MSG(DWord(r) < (DWord(1) << BiSize(Word)),
                "k " + std::to_string(k) + " n " + IntToStr(n));
            ASSERT(2 * k >= BiSize(Word));
            return std::make_tuple(Word(r), size_t(2 * k - BiSize(Word)));
        }
        ASSERT(false);
    }

    template <bool Adjust>
    static Word BarrettMod(DWord const & x, Word const & n, Word const & r, size_t s) {
        if constexpr(!use_barrett)
            return Word(x % n);
        else {
            DWord const q = DWord(((TWord(x) * r) >> BiSize(Word)) >> s);
            Word t = Word(DWord(x) - q * n);
            if constexpr(Adjust) {
                Word const mask = ~Word(SWord(t - n) >> (BiSize(Word) - 1));
                t -= mask & n;
            }
            return t;
        }
    }

    static Word Adjust(Word const & a, Word const & n) {
        return a >= n ? a - n : a;
    }

    Word modNn(DWord const & a) const { return BarrettMod<false>(a, N_, N_br_, N_bs_); }
    Word modNa(DWord const & a) const { return BarrettMod<true>(a, N_, N_br_, N_bs_); }
    Word modQn(DWord const & a) const { return BarrettMod<false>(a, q_, Q_br_, Q_bs_); }
    Word modQa(DWord const & a) const { return BarrettMod<true>(a, q_, Q_br_, Q_bs_); }

    static Word mod(DWord const & a, Word const & n) { return Word(a % n); }

    static ECPoint base_gen(size_t bits = size_t(-1), Word N0 = 0, bool calc_q = false, Word min_order_pfactor = 0) {
        ASSERT(bits != size_t(-1) || N0 != 0);
        while (true) {
            Word N = 0;
            if (N0 != 0)
                N = N0;
            else
                for (; mod(N, 4) != 3; N = rand_prime(bits));
            Word const A = rand_range(1, N);
            Word x0 = 0, y0 = 0, B = 0;
            while (true) {
                x0 = rand_range(1, N);
                y0 = rand_range(1, N);
                B = mod(mod(DWord(y0) * y0, N) + N * 2 - mod(DWord(mod(DWord(x0) * x0, N)) * x0, N) - mod(DWord(A) * x0, N), N);
                if (N0 == 0) {
                    Word const
                        right = mod(mod(DWord(mod(DWord(x0) * x0, N)) * x0, N) + mod(DWord(A) * x0, N) + B, N),
                        y0_calc = PowMod<Word>(right, (N + 1) >> 2, N);
                    if (mod(DWord(y0_calc) * y0_calc, N) != right)
                        continue;
                    ASSERT_MSG(y0 == y0_calc || y0 == N - y0_calc, "x0 " + IntToStr(x0) + " y0 " + IntToStr(y0) + " y0_calc " +
                        IntToStr(y0_calc) + " A " + IntToStr(A) + " B " + IntToStr(B) + " N " + IntToStr(N));
                }
                break;
            }
            if (mod(mod(DWord(y0) * y0, N) + 3 * N - mod(DWord(mod(DWord(x0) * x0, N)) * x0, N) - mod(DWord(A) * x0, N) - B, N) != 0)
                continue;
            auto const bp = ECPoint(A, B, N, x0, y0, 0, true, calc_q);

            if (calc_q) {
                auto BpCheckOrder = [&]{
                    for (auto e: bp.q_ps())
                        if (e < min_order_pfactor)
                            return false;
                    return true;
                };

                if (!(bp.q() != 0 && !bp.q_ps().empty() && BpCheckOrder()))
                    continue;
                ASSERT(bp.q() > 1 && bp * (bp.q() + 1) == bp);
            }
            return bp;
        }
        ASSERT(false);
    }

    ECPoint(Word A, Word B, Word N, Word x, Word y, Word q = 0, bool prepare = true, bool calc_q = false) {
        if (prepare) {
            A = mod(A, N); B = mod(B, N); x = mod(x, N); y = mod(y, N); q = mod(q, N);
            ASSERT(mod(4 * mod(DWord(mod(DWord(A) * A, N)) * A, N) + 27 * mod(DWord(B) * B, N), N) != 0);
            if constexpr(0) {
                ASSERT(mod(N, 4) == 3);
                if (!(x == 0 && y == 0)) {
                    ASSERT(mod(mod(DWord(y) * y, N) + 3 * N - mod(DWord(mod(DWord(x) * x, N)) * x, N) - mod(DWord(A) * x, N) - B, N) == 0);
                    auto root = PowMod<Word>(mod(DWord(mod(DWord(x) * x, N)) * x, N) + mod(DWord(A) * x, N) + B, (N + 1) >> 2, N);
                    ASSERT(y == root || y == N - root);
                }
            }
            if constexpr(use_barrett) {
                std::tie(N_br_, N_bs_) = BarrettRS(N);
                if (q != 0)
                    std::tie(Q_br_, Q_bs_) = BarrettRS(q);
            }
        }
        std::tie(A_, B_, N_, x_, y_, q_) = std::tie(A, B, N, x, y, q);
        if (calc_q) {
            std::tie(q_, q_ps_) = find_order();
            if (q_ != 0) {
                if constexpr(use_barrett)
                    std::tie(Q_br_, Q_bs_) = BarrettRS(q_);
            }
        }
    }

    auto copy() const {
        return ECPoint(A_, B_, N_, x_, y_, q_, false);
    }

    auto inf() const {
        return ECPoint(A_, B_, N_, 0, 0, q_, false);
    }

    static auto const & gen_primes(u64 const B) {
        using T = u64;
        thread_local std::vector<T> ps = {2, 3};

        for (T p = ps.back() + 2; p <= B; p += 2) {
            bool is_prime = true;
            for (auto const e: ps) {
                if (e * e > p)
                    break;
                if (p % e == 0) {
                    is_prime = false;
                    break;
                }
            }
            if (is_prime)
                ps.push_back(p);
        }

        return ps;
    }

    static std::vector<Word> FactorTrialDiv(Word n) {
        std::vector<Word> fs;
        Word i = 0, lim = 1000;
        while (n != 1) {
            auto const & ps = gen_primes(lim);
            for (size_t j = i;; ++j) {
                if (j >= ps.size()) {
                    i = j;
                    lim = ps.back() + 1000;
                    break;
                }
                auto const p = ps[j];
                while (n % p == 0) {
                    fs.push_back(p);
                    n /= p;
                }
                if (n == 1)
                    break;
            }
        }
        return fs;
    }

    std::tuple<Word, std::vector<Word>> find_order(Word _m = 1, std::vector<Word> _ps = {}) const {
        ASSERT(_m <= 2 * N_);
        bool const first = _ps.empty();
        double tim = 0;
        if (first)
            tim = Time();
        u64 last_prime = 0;

        if constexpr(1) {
            auto r = *this;
            try {
                r *= _m;
            } catch (InvError const &) {
                return std::make_tuple(_m, _ps);
            }
            Word const B = 2 * N_;
            u64 const limit = std::llround(std::pow(ToDouble<Word>(B), 1 / 2.5));
            auto const primes = gen_primes(limit + 1);
            last_prime = primes.back();
            for (Word const p: primes) {
                if (p > limit)
                    break;
                ASSERT(p <= B);
                size_t cnt = 0;
                Word hi = 1;
                try {
                    for (cnt = 1;; ++cnt) {
                        if (hi * p > B) {
                            cnt -= 1;
                            break;
                        }
                        hi *= p;
                        r *= p;
                    }
                } catch (InvError const & ex) {
                    _ps.insert(_ps.begin(), cnt, p);
                    auto const res = find_order(hi * _m, _ps);
                    if (first)
                        std::cout << "FindOrderTime " << std::fixed << std::setprecision(3) << (Time() - tim) << ", last_prime " << last_prime << std::endl;
                    return res;
                }
            }
        } else {
            // Alternative slower way
            auto r = *this;
            for (Word i = 0;; ++i)
                try {
                    r += *this;
                } catch (InvError const &) {
                    _ps.clear();
                    return std::make_tuple(i + 2, _ps);
                }
        }

        if (first)
            std::cout << "FindOrderTime " << std::fixed << std::setprecision(3) << (Time() - tim) << ", last_prime " << last_prime << std::endl;
        _ps.clear();
        return std::make_tuple(Word(0), _ps);
    }

    static std::tuple<Word, SWord, SWord> EGCD(Word const & a, Word const & b) {
        Word ro = 0, r = 0, qu = 0, re = 0;
        SWord so = 0, s = 0;
        std::tie(ro, r, so, s) = std::make_tuple(a, b, 1, 0);
        while (r != 0) {
            std::tie(qu, re) = std::make_tuple(ro / r, ro % r);
            std::tie(ro, r) = std::make_tuple(r, re);
            std::tie(so, s) = std::make_tuple(s, so - s * SWord(qu));
        }
        SWord const to = (SWord(ro) - SWord(a) * so) / SWord(b);
        return std::make_tuple(ro, so, to);
    }

    Word inv(Word a, Word const & n, size_t any_n_q = 0) const {
        #if SUPPORT_GMP
            if constexpr(is_mpz) {
                Word r;
                if (mpz_invert(r.get_mpz_t(), a.get_mpz_t(), n.get_mpz_t()) == 0) {
                    mpz_gcd(r.get_mpz_t(), a.get_mpz_t(), n.get_mpz_t());
                    throw InvError(r, a, n);
                }
                return r;
            }
        #endif
        ASSERT(n > 0);
        a = any_n_q == 0 ? mod(a, n) : any_n_q == 1 ? modNa(a) : any_n_q == 2 ? modQa(a) : 0;
        auto [gcd, s, t] = EGCD(a, n);
        if (gcd != 1)
            throw InvError(gcd, a, n);
        a = Word(SWord(n) + s);
        a = any_n_q == 0 ? mod(a, n) : any_n_q == 1 ? modNa(a) : any_n_q == 2 ? modQa(a) : 0;
        return a;
    }

    Word invN(Word a) const { return inv(a, N_, 1); }
    Word invQ(Word a) const { return inv(a, q_, 2); }

    ECPoint & operator += (ECPoint const & o) {
        if (x_ == 0 && y_ == 0) {
            *this = o;
            return *this;
        }
        if (o.x_ == 0 && o.y_ == 0)
            return *this;
        Word const Px = x_, Py = y_, Qx = o.x_, Qy = o.y_;
        Word s = 0;
        if ((Adjust(Px, N_) == Adjust(Qx, o.N_)) && (Adjust(Py, N_) == Adjust(Qy, o.N_)))
            s = modNn(DWord(modNn(DWord(Px) * Px * 3) + A_) * invN(Py * 2));
        else
            s = modNn(DWord(Py + 2 * N_ - Qy) * invN(Px + 2 * N_ - Qx));
        x_ = modNn(DWord(s) * s + 4 * N_ - Px - Qx);
        y_ = modNn(DWord(s) * (Px + 2 * N_ - x_) + 2 * N_ - Py);
        return *this;
    }

    ECPoint operator + (ECPoint const & o) const {
        ECPoint c = *this;
        c += o;
        return c;
    }

    template <typename TK>
    ECPoint & operator *= (TK k) {
        auto const ok = k;
        ASSERT(k > 0);
        if (k == 1)
            return *this;
        k -= 1;
        auto r = *this, s = *this;
        for (size_t icycle = 0;; ++icycle) {
            if (ToU32<TK>(k & 1)) {
                r += s;
                if (k == 1)
                    break;
            }
            k >>= 1;
            s += s;
        }
        if constexpr(0) {
            auto r2 = *this;
            for (u64 i = 1; i < ok; ++i)
                r2 += *this;
            ASSERT(r == r2);
        }
        *this = r;
        return *this;
    }

    ECPoint operator * (Word k) const {
        ECPoint r = *this;
        r *= k;
        return r;
    }

    bool operator == (ECPoint const & o) const {
        return A_ == o.A_ && B_ == o.B_ && N_ == o.N_ && q_ == o.q_ &&
            Adjust(x_, N_) == Adjust(o.x_, o.N_) && Adjust(y_, N_) == Adjust(o.y_, o.N_);
    }

    Word const & q() const { return q_; }
    std::vector<Word> const & q_ps() const { return q_ps_; }
    Word const & x() const { return x_; }

    std::string ToStr() const {
        return "X " + IntToStr(x_) + " Y " + IntToStr(y_) + " A " + IntToStr(A_) + " B " + IntToStr(B_) + " N " + IntToStr(N_) + " q " + IntToStr(q_);
    }

private:
    Word A_ = 0, B_ = 0, N_ = 0, q_ = 0, x_ = 0, y_ = 0, N_br_ = 0, Q_br_ = 0;
    size_t N_bs_ = 0, Q_bs_ = 0;
    std::vector<Word> q_ps_;
};

template <typename T = u32>
T GetPrime(size_t pos) {
    static std::vector<T> primes = {2, 3};
    static AtomicMutex mux;
    if (!mux.Flag().test() && pos < primes.size())
        return primes[pos];
    auto lock = mux.Locker();
    while (pos >= primes.size())
        for (T p = primes.back() + 2;; p += 2) {
            bool is_prime = true;
            for (auto d: primes) {
                if (d * d > p)
                    break;
                if (p % d == 0) {
                    is_prime = false;
                    break;
                }
            }
            if (is_prime) {
                primes.push_back(p);
                break;
            }
        }
    return primes[pos];
}

std::pair<double, double> GetPrimeLog2(size_t pos) {
    static std::vector<std::pair<double, double>> plogs;
    static AtomicMutex mux;
    if (!mux.Flag().test() && pos < plogs.size())
        return plogs[pos];
    auto lock = mux.Locker();
    while (pos >= plogs.size()) {
        auto const log2 = std::log2(GetPrime(pos));
        plogs.push_back({log2, plogs.empty() ? log2 : plogs.back().second + log2});
    }
    return plogs[pos];
}

template <typename T>
std::pair<bool, bool> IsProbablyPrime_TrialDiv(T const n, u64 limit = u64(-1)) {
    // https://en.wikipedia.org/wiki/Trial_division
    if (n <= 10)
        return {n == 2 || n == 3 || n == 5 || n == 7, true};
    if ((n & 1) == 0)
        return {false, true};
    u64 d = 0;
    for (d = 3; d < limit && d * d <= n; d += 2)
        if (n % d == 0)
            return {false, true};
    return {true, d * d > n};
}

template <typename T>
bool IsProbablyPrime_Fermat(T const n, size_t ntrials = 32) {
    // https://en.wikipedia.org/wiki/Fermat_primality_test
    for (size_t trial = 0; trial < ntrials; ++trial)
        if (PowMod<T>(RandRange<T>(n - 3) + 2, n - 1, n) != 1)
            return false;
    return true;
}

template <typename T>
bool IsProbablyPrime(T const n) {
    if (n < (1 << 12))
        return IsProbablyPrime_TrialDiv(n).first;
    return IsProbablyPrime_Fermat(n);
}

template <typename T>
std::pair<std::vector<T>, std::vector<T>> Factor_TrialDiv_Odd(T n, u64 limit = u64(-1)) {
    // https://en.wikipedia.org/wiki/Trial_division
    thread_local std::vector<T> fs0;
    auto & fs = fs0;
    fs.clear();
    while ((n & 1) == 0) {
        n >>= 1;
        fs.push_back(2);
    }
    T d = 0;
    for (d = 3; d < limit && d * d <= n; d += 2)
        while (n % d == 0) {
            n /= d;
            fs.push_back(d);
        }
    if (n == 1)
        return {fs, {}};
    if (d * d > n) {
        fs.push_back(n);
        return {fs, {}};
    } else
        return {fs, {n}};
}

template <typename To, typename From>
To ToT(From x) {
    To y = 0;
    for (size_t i = 0; x != 0; ++i) {
        u32 w = ToU32<From>(x);
        x >>= 32;
        if ((i + 1) * 32 <= BiSize(To))
            y |= To(w) << (i * 32);
    }
    return y;
}

template <typename T>
std::pair<std::vector<T>, std::vector<T>> Factor_TrialDiv_Primes(T n, u64 limit = u64(-1)) {
    // https://en.wikipedia.org/wiki/Trial_division
    thread_local std::vector<T> fs0;
    auto & fs = fs0;
    fs.clear();
    while ((n & 1) == 0) {
        n >>= 1;
        fs.push_back(2);
    }
    u64 const n_limit = (n >> 64) == 0 ? ToT<u64, T>(n) : (u64(-1) >> 1);
    u32 p = 0;
    for (size_t i = 1;; ++i) {
        p = GetPrime(i);
        if (p >= limit || u64(p) * p > n_limit)
            break;
        while (n % p == 0) {
            n /= p;
            fs.push_back(p);
        }
    }
    if (n == 1)
        return {fs, {}};
    if (u64(p) * p > n_limit) {
        fs.push_back(n);
        return {fs, {}};
    } else
        return {fs, {n}};
}

template <typename T>
T GCD(T a, T b) {
    while (b != 0) {
        std::swap(a, b);
        b %= a;
    }
    return a;
}

std::string FloatToStr(double x, int prec = 7) {
    std::ostringstream ss;
    ss << std::fixed << std::setprecision(prec) << x;
    return ss.str();
}

template <typename T>
std::pair<std::vector<T>, std::vector<T>> Factor_PollardRho(T n, u64 limit = u64(-1), size_t ntrials = 6) {
    // https://en.wikipedia.org/wiki/Pollard%27s_rho_algorithm
    using DT = typename DWordOf<T>::type;
    if (n <= 1)
        return {{}, {}};
    if (IsProbablyPrime<T>(n))
        return {{n}, {}};
    auto f = [&n](T x){ return T((DT(x) * x + 1) % n); };
    ASSERT(3 <= n);
    size_t cnt = 0;
    UniformIntDistr<T> distr(1, n - 2);
    for (size_t itry = 0; itry < ntrials; ++itry) {
        bool failed = false;
        T x = distr(RNG());
        u64 sum_cycles = 0;
        for (u64 cycle = 1;; cycle <<= 1) {
            T y = x, m = 1, xstart = x;
            for (u64 i = 0, istart = 0; i < cycle; ++i) {
                x = f(x);
                m = T((DT(m) * (n + x - y)) % n);
                ++cnt;
                if (cnt >= limit)
                    return {{}, {n}};
                if (!(i + 1 - istart >= (1 << 7) || i + 1 >= cycle))
                    continue;
                if (GCD<T>(n, m) == 1) {
                    istart = i + 1;
                    xstart = x;
                    continue;
                }
                T x2 = xstart;
                for (u64 i2 = istart; i2 <= i; ++i2) {
                    x2 = f(x2);
                    auto const g = GCD<T>(n, n + x2 - y);
                    if (g == 1)
                        continue;
                    sum_cycles += i + 1;
                    {
                        auto const threshold = sum_cycles >= 100 ? 0.5 : 0.6;
                        ASSERT_MSG(std::log2(sum_cycles) < std::log2(ToDouble<T>(n)) * threshold, "n " + IntToStr(n) +
                            " (log2 " + std::to_string(std::log2(ToDouble<T>(n))) +
                            ") cycles " + IntToStr(sum_cycles) + " (log2 " + std::to_string(std::log2(sum_cycles)) + ")");
                    }
                    if (g == n) {
                        failed = true;
                        break;
                    }
                    auto res0 = Factor_PollardRho<T>(g, limit, ntrials);
                    auto res1 = Factor_PollardRho<T>(n / g, limit, ntrials);
                    res0.first.insert(res0.first.end(), res1.first.begin(), res1.first.end());
                    res0.second.insert(res0.second.end(), res1.second.begin(), res1.second.end());
                    return std::move(res0);
                }
                if (failed)
                    break;
                ASSERT(false);
            }
            sum_cycles += cycle;
            if (failed)
                break;
        }
    }
    return {{}, {n}};
}

template <typename T>
std::pair<std::vector<T>, std::vector<T>> Factor_ECM(T const n, u64 limit = u64(-1), bool verbose = false) {
    if (n <= 1)
        return {{}, {}};
    if (IsProbablyPrime(n))
        return {{n}, {}};
    using ECT = ECPoint<T>;
    struct Entry {
        ECT curve;
        size_t done_primes = 0;
        bool active = true;
    };
    size_t const bits = NumBits(n), nthr = std::thread::hardware_concurrency();
    std::vector<Entry> curves;

    size_t T_bisize = 0;
    #if SUPPORT_GMP
        if constexpr(std::is_same_v<T, mpz_class>)
            T_bisize = (bits + 63) / 64 * 64;
        else
    #endif
            T_bisize = BiSize(T);

    T n_factored = n;
    bool n_factored_changed = true;
    std::set<T> factors;
    std::mutex mux;
    std::atomic<double> done_ops = 0;
    std::vector<std::future<void>> asyncs;
    size_t num_primes = 0;
    double tim = Time();

    for (size_t iiter = 0;; ++iiter) {
        if (n_factored_changed && IsProbablyPrime(n_factored)) {
            factors.insert(n_factored);
            n_factored = 1;
        }
        if (n_factored == 1)
            break;

        tim = Time();

        size_t const extra_primes = iiter == 0 ? 1 << 9 : [&]{
            auto Prob = [&](size_t nprimes){
                double x = double(bits) / GetPrimeLog2(nprimes).first;
                return std::pow(x, -x);
            };
            auto Prob2 = [&](size_t ncurves, size_t nprimes){
                return 1.0 - std::pow(1.0 - Prob(nprimes), ncurves);
            };
            auto Prob2G = [&](size_t ncurves, size_t nprimes){
                #if SUPPORT_GMP
                    double x = double(bits) / GetPrimeLog2(nprimes).first;
                    mpf_class f;
                    f.set_prec(1 << 15);
                    f = x;
                    for (size_t i = 0; i < 4; ++i)
                        mpf_sqrt(f.get_mpf_t(), f.get_mpf_t());
                    mpf_pow_ui(f.get_mpf_t(), f.get_mpf_t(), u32(std::llround(x * 16.0)));
                    mpf_ui_div(f.get_mpf_t(), 1, f.get_mpf_t());
                    mpf_ui_sub(f.get_mpf_t(), 1, f.get_mpf_t());
                    mpf_pow_ui(f.get_mpf_t(), f.get_mpf_t(), u32(ncurves));
                    mpf_ui_sub(f.get_mpf_t(), 1, f.get_mpf_t());
                    return f;
                #else
                    return Prob2(ncurves, nprimes);
                #endif
            };
            auto Work = [&](size_t ncurves, size_t nprimes){
                return GetPrimeLog2(nprimes).second * ncurves;
            };
            size_t maxi = 0, i = 0;
            double w0 = Work(curves.size(), num_primes);
            auto p0 = Prob2G(curves.size(), num_primes);
            using P2GT = std::decay_t<decltype(p0)>;
            P2GT maxv = 0;

            for (i = 8; i <= (1 << 9); i += 8) {
                size_t constexpr dc = 1;
                double w1 = Work(curves.size() + dc, num_primes + i), wd = w1 - w0;
                P2GT p1 = Prob2G(curves.size() + dc, num_primes + i), pd = p1 - p0;
                /*
                ASSERT_MSG(w0 < w1 && p0 <= p1,
                    "w0 " + FloatToStr(w0, 1) + " w1 " + FloatToStr(w1, 1) +
                    " p0 " + FloatToStr(p0, 6) + " p1 " + FloatToStr(p1, 6) + " (" +
                    std::to_string(p1) + ") i " + std::to_string(i));
                */
                P2GT k = pd / wd;
                ASSERT(k >= 0);
                if (k >= maxv) {
                    maxv = k;
                    maxi = i;
                }
                if (maxi != 0 && maxi + 20 < i/* && maxv / k > 1.05*/)
                    break;
            }
            //ASSERT(i <= (1 << 12));
            return maxi == 0 ? 1 << 5 : maxi;
        }();

        size_t const B_bits = std::min<size_t>(T_bisize - 1, 32), tot_bisize = T_bisize * 4,
            task_bits = std::min<size_t>(tot_bisize - 8, 1 << 9);
        ASSERT(B_bits < T_bisize);
        for (size_t i = 0; i < (iiter == 0 ? 1 : 1); ++i)
            curves.push_back(Entry{.curve = ECT::base_gen(size_t(-1), n)});
        num_primes += extra_primes;

        for (size_t ithr = 0; ithr < nthr; ++ithr)
            asyncs.push_back(std::async(std::launch::async, [&, ithr]{
                size_t const block = (curves.size() + nthr - 1) / nthr, begin = ithr * block,
                    end = std::min<size_t>((ithr + 1) * block, curves.size());
                for (size_t icurve = begin; icurve < end; ++icurve) {
                    auto & curve = curves.at(icurve);
                    if (!curve.active)
                        continue;
                    while (curve.done_primes < num_primes) {
                        typename QWordOf<T>::type tot = 1;
                        double tot_lsum = 0;
                        size_t i = 0;
                        for (i = curve.done_primes; i < num_primes; ++i) {
                            auto const p = GetPrime<u64>(i);
                            auto const plog = GetPrimeLog2(i).first;
                            T mp = p;
                            double lsum = plog;
                            while (true) {
                                if (lsum + plog > B_bits)
                                    break;
                                mp *= p;
                                lsum += plog;
                            }
                            tot_lsum += lsum;
                            tot *= mp;
                            if (tot_lsum + B_bits > task_bits) {
                                ++i;
                                break;
                            }
                        }
                        curve.done_primes = i;
                        try {
                            done_ops += tot_lsum;
                            curve.curve *= tot;
                        } catch (typename ECT::InvError const & ex) {
                            ASSERT(ex.gcd > 1 && n % ex.gcd == 0);
                            if (ex.gcd == n)
                                continue;
                            else {
                                std::lock_guard<std::mutex> lock(mux);
                                factors.insert(ex.gcd);
                                while (n_factored % ex.gcd == 0)
                                    n_factored /= ex.gcd;
                                n_factored_changed = true;
                            }
                            curve.active = false;
                            break;
                        }
                    }
                }
            }));

        for (auto & e: asyncs)
            e.get();
        asyncs.clear();

        if (verbose)
            std::cout << "Curves " << std::setw(3) << curves.size() /*<< ", Curve " << curves.back().curve.ToStr()*/
                << ", Ops " << std::fixed << std::setprecision(3) << std::setw(8) << double(done_ops) / 1024
                << " Ki, ExtraPrimes " << std::setw(3) << extra_primes << ", Primes " << std::setw(6) << double(num_primes) / 1024
                << " Ki, IterTime " << std::setw(7) << (Time() - tim) << " sec" << std::endl;

        if (done_ops >= limit)
            break;
    }
    std::pair<std::vector<T>, std::vector<T>> res;
    T nc = n;
    for (auto f: factors) {
        ASSERT(nc % f == 0);
        while (nc % f == 0) {
            res.first.push_back(f);
            nc /= f;
        }
    }
    if (nc != 1)
        res.second.push_back(nc);
    return res;
}

template <typename T>
void OutputFactors(T n, std::pair<std::vector<T>, std::vector<T>> const & fs, bool check = true) {
    auto Sorted = [](auto v){
        std::sort(v.begin(), v.end());
        return v;
    };
    auto Out = [&](auto const & x){
        for (auto f: Sorted(x))
            std::cout << IntToStr(f) << " (2^" << std::fixed << std::setprecision(2)
                << std::log2(ToDouble<T>(f)) << (IsProbablyPrime<T>(f) ? ", prime)" : ", composite)") << ", ";
    };
    if (n != 0)
        std::cout << "Num: " << IntToStr(n) << " (2^" << std::fixed << std::setprecision(2) << std::log2(ToDouble<T>(n)) << ")";
    std::cout << (fs.first.empty() ? "" : "\nFactored: "); Out(fs.first);
    std::cout << (fs.second.empty() ? "" : "\nUnFactored: "); Out(fs.second); std::cout << std::endl;
    if (check) {
        for (auto const & e0: {fs.first, fs.second})
            for (auto const & e1: e0) {
                ASSERT(n % e1 == 0);
                n /= e1;
            }
        ASSERT(n == 1);
    }
}

template <typename T>
std::pair<std::vector<T>, std::vector<T>> Factor(T const n, size_t sec512 = 3, bool verbose = false) {
    using ResT = std::pair<std::vector<T>, std::vector<T>>;
    double tt = Time(), t0 = 0, t1 = 0, t2 = 0;
    ResT r0{}, r1{}, r2{};
    t0 = Time();
    r0 = Factor_TrialDiv_Primes<T>(n, std::llround((1 << 22) * sec512 * 0.1));
    t0 = Time() - t0;
    if (verbose) {
        std::cout << "TrialDiv time " << std::fixed << std::setprecision(3) << t0 << " sec" << std::endl;
        OutputFactors<T>(n, r0, false);
    }
    if (!r0.second.empty()) {
        t1 = Time();
        r1 = Factor_PollardRho<T>(r0.second.front(), std::llround((1 << 17) * sec512 * 0.2));
        t1 = Time() - t1;
        if (verbose) {
            std::cout << "PollardRho time " << t1 << " sec" << std::endl;
            OutputFactors<T>(r0.second.front(), r1, false);
        }
        r0.first.insert(r0.first.end(), r1.first.begin(), r1.first.end());
    }
    r0.second.clear();
    for (auto e: r1.second) {
        t2 = Time();
        r2 = Factor_ECM<T>(e, std::llround((1 << 12) * sec512 * 0.7), true);
        t2 = Time() - t2;
        if (verbose) {
            std::cout << "ECM time " << t2 << " sec" << std::endl;
            OutputFactors<T>(e, r2, false);
        }
        r0.first.insert(r0.first.end(), r2.first.begin(), r2.first.end());
        r0.second.insert(r0.second.end(), r2.second.begin(), r2.second.end());
    }
    tt = Time() - tt;
    if (verbose) {
        std::cout << "Final time " << std::fixed << std::setprecision(3) << tt << " sec" << std::endl;
        OutputFactors<T>(n, r0, true);
    }
    return r0;
}

template <typename T>
static T RandPrimeRange(T begin, T end) {
    ASSERT(begin < end);
    while (true) {
        T const p = (RandRange<T>(end - begin) + begin) | 1;
        if (IsProbablyPrime<T>(p))
            return p;
    }
}

template <typename T>
static T RandPrime(size_t bits) {
    using DT = typename DWordOf<T>::type;
    return RandPrimeRange<T>(T(1) << (bits - 1), T((DT(1) << bits) - 1));
}

int main() {
    try {
        using T = mpz_class;

        for (size_t i = 0; i < 8; ++i)
            Factor<T>(RandBits<T>(2048), 60, true);
        return 0;

        T n = 1;
        for (size_t bits: {4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 48, 56, 64, 72})
            n *= RandPrime<T>(bits);
        Factor<T>(n, 20, true);
        return 0;
    } catch (std::exception const & ex) {
        std::cout << "Exception: " << ex.what() << std::endl;
        return -1;
    }
}