Skip to content

Instantly share code, notes, and snippets.

@ykm11
Last active April 8, 2020 09:18
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ykm11/c71a49ba92eb4e43cf4047d7d8bad6e7 to your computer and use it in GitHub Desktop.
Save ykm11/c71a49ba92eb4e43cf4047d7d8bad6e7 to your computer and use it in GitHub Desktop.
メルセンヌ素数2^{521}-1の逆元計算の速度比較
#include <iostream>
#include <time.h>
#include <gmpxx.h>
#define SIZE 9
static inline void mod(mp_limb_t *z, const mp_limb_t *XY, const mp_limb_t *p, mp_limb_t *t, mp_limb_t *s) {
// (T + (T mod R)*N) / R
mpn_and_n(t, XY, p, SIZE); // T mod R
mpn_mul_n(s, (const mp_limb_t*)t, p, SIZE);
mpn_add_n(t, (const mp_limb_t*)s, XY, SIZE*2); // (T + (T mod R)*N)
mpn_rshift(t, (const mp_limb_t*)t, SIZE*2, 9);
for (size_t i = 0; i < SIZE; i++) { // (T + (T mod R)*N) / R
t[i] = t[i+8];
}
if (mpn_cmp((const mp_limb_t*)t, p, SIZE) >= 0) {
mpn_sub_n(t, (const mp_limb_t*)t, p, SIZE);
}
mpn_copyi(z, (const mp_limb_t*)t, SIZE);
}
void invert1(mp_limb_t *r, const mp_limb_t* x, const mp_limb_t* p) {
mp_limb_t tp[mpn_sec_invert_itch(SIZE)];
mp_limb_t ap[SIZE];
mpn_copyi(ap, x, SIZE);
mpn_sec_invert(r, ap, p, SIZE, 2*SIZE*GMP_NUMB_BITS, tp);
}
void invert2(mp_limb_t *r, const mp_limb_t* x, const mp_limb_t* p) {
mp_limb_t tp[SIZE*2];
mp_limb_t sp[SIZE*2];
mp_limb_t tmp[SIZE*2];
mp_limb_t b[SIZE];
mpn_copyi(r, x, SIZE); // r <- x
mpn_sqr(tmp, x, SIZE);
mod(b, tmp, p, tp, sp);
mpn_sqr(tmp, b, SIZE);
mod(b, tmp, p, tp, sp); // b <- x^4
for (size_t i = 0; i < 518; i++) {
mpn_mul_n(tmp, r, b, SIZE);
mod(r, tmp, p, tp, sp);
mpn_sqr(tmp, b, SIZE);
mod(b, tmp, p, tp, sp); // b <- b^2
}
mpn_mul_n(tmp, r, b, SIZE);
mod(r, tmp, p, tp, sp);
}
int main() {
mp_limb_t p[SIZE] = {
0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff,
0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff,
0x1ff
};
mp_limb_t a[SIZE] = {
0x9164821638291631L, 0x1372917431694363L, 0x9278732747374837L, 0x2913789724789173L,
0x6392641693692173L, 0xFFFFFFFFF3213729L, 0xDEADBEEFDEADBEEFL, 0, 0};
mp_limb_t b[SIZE] = {0};
const int N = 1000;
time_t begin, end;
begin = clock();
for (int i = 0; i < N; i++) {
invert1(b, (const mp_limb_t*)a, (const mp_limb_t*)p);
}
end = clock();
printf("[+] invert1: %f usec\n", double(end-begin)*1e6/CLOCKS_PER_SEC/N);
begin = clock();
for (int i = 0; i < N; i++) {
invert2(b, (const mp_limb_t*)a, (const mp_limb_t*)p);
}
end = clock();
printf("[+] invert2: %f usec\n", double(end-begin)*1e6/CLOCKS_PER_SEC/N);
}
$ g++ pow521.cpp -lgmpxx -lgmp -Ofast && ./a.out
"""
[+] invert1: 84.242000 usec
[+] invert2: 115.085000 usec
"""
#include <iostream>
#include <time.h>
#include <gmpxx.h>
#define SIZE 9
static inline void mod(mp_limb_t *z, const mp_limb_t *XY, const mp_limb_t *p, mp_limb_t *t, mp_limb_t *s) {
// (T + (T mod R)*N) / R
mpn_and_n(t, XY, p, SIZE); // T mod R
mpn_mul_n(s, (const mp_limb_t*)t, p, SIZE);
mpn_add_n(t, (const mp_limb_t*)s, XY, SIZE*2); // (T + (T mod R)*N)
mpn_rshift(t, (const mp_limb_t*)t, SIZE*2, 9);
for (size_t i = 0; i < SIZE; i++) { // (T + (T mod R)*N) / R
t[i] = t[i+8];
}
if (mpn_cmp((const mp_limb_t*)t, p, SIZE) >= 0) {
mpn_sub_n(t, (const mp_limb_t*)t, p, SIZE);
}
mpn_copyi(z, (const mp_limb_t*)t, SIZE);
}
void invert1(mp_limb_t *r, const mp_limb_t* x, const mp_limb_t* p) {
mp_limb_t tp[mpn_sec_invert_itch(SIZE)];
mp_limb_t ap[SIZE];
mpn_copyi(ap, x, SIZE);
mpn_sec_invert(r, ap, p, SIZE, 2*SIZE*GMP_NUMB_BITS, tp);
}
void invert2(mp_limb_t *r, const mp_limb_t* x, const mp_limb_t* p) {
mp_limb_t tp[SIZE*2];
mp_limb_t sp[SIZE*2];
mp_limb_t tmp[SIZE*2];
mp_limb_t b[SIZE];
mpn_copyi(r, x, SIZE); // r <- x
mpn_sqr(tmp, x, SIZE);
mod(b, tmp, p, tp, sp);
mpn_mul_n(tmp, b, x, SIZE);
mod(b, tmp, p, tp, sp); // b <- x^{3}
for (size_t i = 0; i < 518/2; i++) {
mpn_sqr(tmp, r, SIZE);
mod(r, tmp, p, tp, sp);
mpn_sqr(tmp, r, SIZE);
mod(r, tmp, p, tp, sp); // r <- r^{4}
mpn_mul_n(tmp, r, b, SIZE); // r <- r * x^{3}
mod(r, tmp, p, tp, sp);
}
mpn_sqr(tmp, r, SIZE);
mod(r, tmp, p, tp, sp);
mpn_sqr(tmp, r, SIZE);
mod(r, tmp, p, tp, sp);
mpn_mul_n(tmp, r, x, SIZE);
mod(r, tmp, p, tp, sp);
}
int main() {
mp_limb_t p[SIZE] = {
0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff,
0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff,
0x1ff
};
mp_limb_t a[SIZE] = {
0x9164821638291631L, 0x1372917431694363L, 0x9278732747374837L, 0x2913789724789173L,
0x6392641693692173L, 0xFFFFFFFFF3213729L, 0xDEADBEEFDEADBEEFL, 0, 0};
mp_limb_t b[SIZE] = {0};
const int N = 1000;
time_t begin, end;
begin = clock();
for (int i = 0; i < N; i++) {
invert1(b, (const mp_limb_t*)a, (const mp_limb_t*)p);
}
end = clock();
printf("[+] invert1: %f usec\n", double(end-begin)*1e6/CLOCKS_PER_SEC/N);
begin = clock();
for (int i = 0; i < N; i++) {
invert2(b, (const mp_limb_t*)a, (const mp_limb_t*)p);
}
end = clock();
printf("[+] invert2: %f usec\n", double(end-begin)*1e6/CLOCKS_PER_SEC/N);
}
$ g++ pow521.cpp -lgmpxx -lgmp -Ofast && ./a.out
"""
[+] invert1: 82.291000 usec
[+] invert2: 83.924000 usec
"""
#include <iostream>
#include <time.h>
#include <gmpxx.h>
#define SIZE 9
static inline void mod(mp_limb_t *z, const mp_limb_t *XY, const mp_limb_t *p, mp_limb_t *t, mp_limb_t *s) {
// (T + (T mod R)*N) / R
mpn_and_n(t, XY, p, SIZE); // T mod R
mpn_mul_n(s, (const mp_limb_t*)t, p, SIZE);
mpn_add_n(t, (const mp_limb_t*)s, XY, SIZE*2); // (T + (T mod R)*N)
mpn_rshift(t, (const mp_limb_t*)t, SIZE*2, 9);
for (size_t i = 0; i < SIZE; i++) { // (T + (T mod R)*N) / R
t[i] = t[i+8];
}
if (mpn_cmp((const mp_limb_t*)t, p, SIZE) >= 0) {
mpn_sub_n(t, (const mp_limb_t*)t, p, SIZE);
}
mpn_copyi(z, (const mp_limb_t*)t, SIZE);
}
void invert1(mp_limb_t *r, const mp_limb_t* x, const mp_limb_t* p) {
mp_limb_t tp[mpn_sec_invert_itch(SIZE)];
mp_limb_t ap[SIZE];
mpn_copyi(ap, x, SIZE);
mpn_sec_invert(r, ap, p, SIZE, 2*SIZE*GMP_NUMB_BITS, tp);
}
void invert2(mp_limb_t *r, const mp_limb_t* x, const mp_limb_t* p) {
mp_limb_t tp[SIZE*2];
mp_limb_t sp[SIZE*2];
mp_limb_t tmp[SIZE*2];
mp_limb_t b[SIZE];
mpn_sqr(tmp, x, SIZE);
mod(b, tmp, p, tp, sp); // b <- x^{2}
mpn_sqr(tmp, b, SIZE);
mod(r, tmp, p, tp, sp); // r <- x^{4}
mpn_mul_n(tmp, r, b, SIZE);
mod(r, tmp, p, tp, sp); // r <- x^{6}
mpn_mul_n(tmp, r, x, SIZE);
mod(r, tmp, p, tp, sp); // r <- x^{7}
mpn_copyi(b, (const mp_limb_t*)r, SIZE); // b <- x^{7}
for (size_t i = 0; i < 516/3; i++) {
mpn_sqr(tmp, r, SIZE);
mod(r, tmp, p, tp, sp);
mpn_sqr(tmp, r, SIZE);
mod(r, tmp, p, tp, sp);
mpn_sqr(tmp, r, SIZE);
mod(r, tmp, p, tp, sp); // r <- r^{8}
mpn_mul_n(tmp, r, b, SIZE); // r <- r * x^{7}
mod(r, tmp, p, tp, sp);
}
mpn_sqr(tmp, r, SIZE);
mod(r, tmp, p, tp, sp);
mpn_sqr(tmp, r, SIZE);
mod(r, tmp, p, tp, sp);
mpn_mul_n(tmp, r, x, SIZE);
mod(r, tmp, p, tp, sp);
}
int main() {
mp_limb_t p[SIZE] = {
0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff,
0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff,
0x1ff
};
mp_limb_t a[SIZE] = {
0x9164821638291631L, 0x1372917431694363L, 0x9278732747374837L, 0x2913789724789173L,
0x6392641693692173L, 0xFFFFFFFFF3213729L, 0xDEADBEEFDEADBEEFL, 0, 0};
mp_limb_t b[SIZE] = {0};
const int N = 1000;
time_t begin, end;
begin = clock();
for (int i = 0; i < N; i++) {
invert1(b, (const mp_limb_t*)a, (const mp_limb_t*)p);
}
end = clock();
printf("[+] invert1: %f usec\n", double(end-begin)*1e6/CLOCKS_PER_SEC/N);
begin = clock();
for (int i = 0; i < N; i++) {
invert2(b, (const mp_limb_t*)a, (const mp_limb_t*)p);
}
end = clock();
printf("[+] invert2: %f usec\n", double(end-begin)*1e6/CLOCKS_PER_SEC/N);
}
$ g++ pow521.cpp -lgmpxx -lgmp -Ofast && ./a.out
"""
[+] invert1: 67.267000 usec
[+] invert2: 71.901000 usec
"""
# for ((i=0; i<10; i++)); do ./a.out; done
# で繰り返すとmpn_invertは速くなっていってた CPUが寝てたのかな
#include <iostream>
#include <time.h>
#include <gmpxx.h>
#define SIZE 9
static inline void mod(mp_limb_t *z, const mp_limb_t *XY, const mp_limb_t *p, mp_limb_t *t, mp_limb_t *s) {
// (T + (T mod R)*N) / R
mpn_zero(t, SIZE*2);
mpn_zero(s, SIZE*2);
mpn_and_n(t, XY, p, SIZE); // T mod R
#if 0
mpn_mul_n(s, (const mp_limb_t*)t, p, SIZE);
#else
for (size_t i = 0; i < SIZE; i++) {
s[i+8] = t[i];
}
mpn_lshift(s, (const mp_limb_t*)s, SIZE*2, 9);
mpn_sub_n(s, (const mp_limb_t*)s, (const mp_limb_t*)t, SIZE*2); // (T mod R)*(N+1) - (T mod R)
#endif
mpn_add_n(t, (const mp_limb_t*)s, XY, SIZE*2); // (T + (T mod R)*N)
mpn_rshift(t, (const mp_limb_t*)t, SIZE*2, 9);
for (size_t i = 0; i < SIZE; i++) { // (T + (T mod R)*N) / R
t[i] = t[i+8];
}
if (mpn_cmp((const mp_limb_t*)t, p, SIZE) >= 0) {
mpn_sub_n(t, (const mp_limb_t*)t, p, SIZE);
}
mpn_copyi(z, (const mp_limb_t*)t, SIZE);
}
void invert1(mp_limb_t *r, const mp_limb_t* x, const mp_limb_t* p) {
mp_limb_t tp[mpn_sec_invert_itch(SIZE)];
mp_limb_t ap[SIZE];
mpn_copyi(ap, x, SIZE);
mpn_sec_invert(r, ap, p, SIZE, 2*SIZE*GMP_NUMB_BITS, tp);
}
void invert2(mp_limb_t *r, const mp_limb_t* x, const mp_limb_t* p) {
mp_limb_t tp[SIZE*2];
mp_limb_t sp[SIZE*2];
mp_limb_t tmp[SIZE*2];
mp_limb_t b[SIZE];
mpn_sqr(tmp, x, SIZE);
mod(b, tmp, p, tp, sp); // b <- x^{2}
mpn_mul_n(tmp, b, x, SIZE);
mod(r, tmp, p, tp, sp); // r <- x^{3}
mpn_sqr(tmp, r, SIZE);
mod(b, tmp, p, tp, sp);
mpn_sqr(tmp, b, SIZE);
mod(b, tmp, p, tp, sp); // b <- x^{12}
mpn_mul_n(tmp, b, r, SIZE);
mod(b, tmp, p, tp, sp); // b <- x^{15}
mpn_copyi(r, (const mp_limb_t*)b, SIZE); // r <- x^{15}
mpn_sqr(tmp, b, SIZE);
mod(b, tmp, p, tp, sp); // b <- x^{30}
mpn_mul_n(tmp, b, x, SIZE);
mod(b, tmp, p, tp, sp); // b <- x^{31}
for (size_t i = 0; i < 515/5; i++) {
mpn_sqr(tmp, r, SIZE);
mod(r, tmp, p, tp, sp); // 1
mpn_sqr(tmp, r, SIZE);
mod(r, tmp, p, tp, sp); // 2
mpn_sqr(tmp, r, SIZE);
mod(r, tmp, p, tp, sp); // 3
mpn_sqr(tmp, r, SIZE);
mod(r, tmp, p, tp, sp); // 4
mpn_sqr(tmp, r, SIZE);
mod(r, tmp, p, tp, sp); // r <- r^{32}
mpn_mul_n(tmp, r, b, SIZE); // r <- r * x^{31}
mod(r, tmp, p, tp, sp);
}
mpn_sqr(tmp, r, SIZE);
mod(r, tmp, p, tp, sp);
mpn_sqr(tmp, r, SIZE);
mod(r, tmp, p, tp, sp);
mpn_mul_n(tmp, r, x, SIZE);
mod(r, tmp, p, tp, sp);
}
int main() {
mp_limb_t p[SIZE] = {
0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff,
0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff,
0x1ff
};
mp_limb_t a[SIZE] = {
0x9164821638291631L, 0x1372917431694363L, 0x9278732747374837L, 0x2913789724789173L,
0x6392641693692173L, 0xFFFFFFFFF3213729L, 0xDEADBEEFDEADBEEFL, 0, 0};
mp_limb_t b[SIZE] = {0};
const int N = 1000;
time_t begin, end;
begin = clock();
for (int i = 0; i < N; i++) {
invert1(b, (const mp_limb_t*)a, (const mp_limb_t*)p);
}
end = clock();
printf("[+] invert1: %f usec\n", double(end-begin)*1e6/CLOCKS_PER_SEC/N);
begin = clock();
for (int i = 0; i < N; i++) {
invert2(b, (const mp_limb_t*)a, (const mp_limb_t*)p);
}
end = clock();
printf("[+] invert2: %f usec\n", double(end-begin)*1e6/CLOCKS_PER_SEC/N);
}
$ g++ pow521.cpp -lgmpxx -lgmp -Ofast && ./a.out
"""
[+] invert1: 67.029000 usec
[+] invert2: 58.144000 usec
"""
# modにmul使ってたの見落としてた. 変えたら圧倒的勝利できた.
# たぶん3bitsでもいい勝負になる
#include <iostream>
#include <time.h>
#include <gmpxx.h>
#define SIZE 9
static inline void mod(mp_limb_t *z, const mp_limb_t *XY, const mp_limb_t *p, mp_limb_t *t, mp_limb_t *s) {
// (T + (T mod R)*N) / R
mpn_zero(t, SIZE*2);
mpn_zero(s, SIZE*2);
mpn_and_n(t, XY, p, SIZE); // T mod R
#if 0
mpn_mul_n(s, (const mp_limb_t*)t, p, SIZE);
#else
for (size_t i = 0; i < SIZE; i++) {
s[i+8] = t[i];
}
mpn_lshift(s, (const mp_limb_t*)s, SIZE*2, 9);
mpn_sub_n(s, (const mp_limb_t*)s, (const mp_limb_t*)t, SIZE*2); // (T mod R)*(N+1) - (T mod R)
#endif
mpn_add_n(t, (const mp_limb_t*)s, XY, SIZE*2); // (T + (T mod R)*N)
mpn_rshift(t, (const mp_limb_t*)t, SIZE*2, 9);
for (size_t i = 0; i < SIZE; i++) { // (T + (T mod R)*N) / R
t[i] = t[i+8];
}
if (mpn_cmp((const mp_limb_t*)t, p, SIZE) >= 0) {
mpn_sub_n(t, (const mp_limb_t*)t, p, SIZE);
}
mpn_copyi(z, (const mp_limb_t*)t, SIZE);
}
void invert1(mp_limb_t *r, const mp_limb_t* x, const mp_limb_t* p) {
mp_limb_t tp[mpn_sec_invert_itch(SIZE)];
mp_limb_t ap[SIZE];
mpn_copyi(ap, x, SIZE);
mpn_sec_invert(r, ap, p, SIZE, 2*SIZE*GMP_NUMB_BITS, tp);
}
void invert2(mp_limb_t *r, const mp_limb_t* x, const mp_limb_t* p) {
mp_limb_t tp[SIZE*2];
mp_limb_t sp[SIZE*2];
mp_limb_t tmp[SIZE*2];
mp_limb_t b[SIZE];
mpn_sqr(tmp, x, SIZE);
mod(b, tmp, p, tp, sp); // b <- x^{2}
mpn_mul_n(tmp, b, x, SIZE);
mod(b, tmp, p, tp, sp); // b <- x^{3}
mpn_sqr(tmp, b, SIZE);
mod(r, tmp, p, tp, sp);
mpn_sqr(tmp, r, SIZE);
mod(r, tmp, p, tp, sp); // r <- x^{12}
mpn_mul_n(tmp, r, b, SIZE);
mod(r, tmp, p, tp, sp); // r <- x^{15}
mpn_sqr(tmp, r, SIZE);
mod(b, tmp, p, tp, sp);
mpn_sqr(tmp, b, SIZE);
mod(b, tmp, p, tp, sp);
mpn_sqr(tmp, b, SIZE);
mod(b, tmp, p, tp, sp);
mpn_sqr(tmp, b, SIZE);
mod(b, tmp, p, tp, sp); // b <- x^{240}
mpn_mul_n(tmp, r, b, SIZE);
mod(r, tmp, p, tp, sp); // r <- x^{255}
mpn_sqr(tmp, r, SIZE);
mod(b, tmp, p, tp, sp);
mpn_sqr(tmp, b, SIZE);
mod(b, tmp, p, tp, sp);
mpn_sqr(tmp, b, SIZE);
mod(b, tmp, p, tp, sp);
mpn_sqr(tmp, b, SIZE);
mod(b, tmp, p, tp, sp);
mpn_sqr(tmp, b, SIZE);
mod(b, tmp, p, tp, sp);
mpn_sqr(tmp, b, SIZE);
mod(b, tmp, p, tp, sp);
mpn_sqr(tmp, b, SIZE);
mod(b, tmp, p, tp, sp);
mpn_sqr(tmp, b, SIZE);
mod(b, tmp, p, tp, sp); // b <- x^{65280}
mpn_sqr(tmp, b, SIZE);
mod(b, tmp, p, tp, sp); // b <- x^{130560}
mpn_sqr(tmp, r, SIZE);
mod(r, tmp, p, tp, sp); // r <- x^{510}
mpn_mul_n(tmp, r, x, SIZE);
mod(r, tmp, p, tp, sp); // r <- x^{511}
mpn_mul_n(tmp, b, r, SIZE);
mod(b, tmp, p, tp, sp); // b <- x^{131071}
for (size_t i = 0; i < (519-9)/17; i++) {
for (size_t t = 0; t < 17; t++) {
mpn_sqr(tmp, r, SIZE);
mod(r, tmp, p, tp, sp); // r <-
}
mpn_mul_n(tmp, r, b, SIZE); // r <- r * x^{131071}
mod(r, tmp, p, tp, sp);
}
mpn_sqr(tmp, r, SIZE);
mod(r, tmp, p, tp, sp);
mpn_sqr(tmp, r, SIZE);
mod(r, tmp, p, tp, sp);
mpn_mul_n(tmp, r, x, SIZE);
mod(r, tmp, p, tp, sp);
}
int main() {
mp_limb_t p[SIZE] = {
0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff,
0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff,
0x1ff
};
mp_limb_t a[SIZE] = {
0x9164821638291631L, 0x1372917431694363L, 0x9278732747374837L, 0x2913789724789173L,
0x6392641693692173L, 0xFFFFFFFFF3213729L, 0xDEADBEEFDEADBEEFL, 0, 0};
mp_limb_t b[SIZE] = {0};
const int N = 10000;
time_t begin, end;
begin = clock();
for (int i = 0; i < N; i++) {
invert1(b, (const mp_limb_t*)a, (const mp_limb_t*)p);
}
end = clock();
printf("[+] invert1: %f usec\n", double(end-begin)*1e6/CLOCKS_PER_SEC/N);
begin = clock();
for (int i = 0; i < N; i++) {
invert2(b, (const mp_limb_t*)a, (const mp_limb_t*)p);
}
end = clock();
printf("[+] invert2: %f usec\n", double(end-begin)*1e6/CLOCKS_PER_SEC/N);
}
$ g++ pow521.cpp -lgmpxx -lgmp -Ofast && ./a.out
"""
[+] mpn_sec_invert: 62.686600 usec
[+] mpz_invert: 1.582700 usec
[+] invert: 50.712900 usec
"""
# mpz_invertも追加してみた. 速すぎる.
# 自分で実装するmpn_invertは多分これが一番速いんじゃないかなぁ..
# もっと速くするならmulMod, sqrModを速くするとよい.
#include <iostream>
#include <time.h>
#include <gmpxx.h>
#define SIZE 9
static void set_mpz_t(mpz_t& z, const uint64_t* p, int n) {
int s = n;
while (s > 0) {
if (p[s - 1]) break;
s--;
}
z->_mp_alloc = n;
z->_mp_size = s;
z->_mp_d = (mp_limb_t*)const_cast<uint64_t*>(p);
}
void dump(const uint64_t *p, int n) {
mpz_t t;
set_mpz_t(t, p, n);
std::cout << t << std::endl;
}
void dump(const uint64_t *p) {
mpz_t t;
set_mpz_t(t, p, SIZE);
std::cout << t << std::endl;
}
static inline void mod(mp_limb_t *z, const mp_limb_t *XY, const mp_limb_t *p, mp_limb_t *t, mp_limb_t *s) {
// (T + (T mod R)*N) / R
mpn_zero(t, SIZE*2);
mpn_zero(s, SIZE*2);
mpn_and_n(t, XY, p, SIZE); // T mod R
#if 0
mpn_mul_n(s, (const mp_limb_t*)t, p, SIZE);
#else
for (size_t i = 0; i < SIZE; i++) {
s[i+8] = t[i];
}
mpn_lshift(s, (const mp_limb_t*)s, SIZE*2, 9);
mpn_sub_n(s, (const mp_limb_t*)s, (const mp_limb_t*)t, SIZE*2); // (T mod R)*(N+1) - (T mod R)
#endif
mpn_add_n(t, (const mp_limb_t*)s, XY, SIZE*2); // (T + (T mod R)*N)
mpn_rshift(t, (const mp_limb_t*)t, SIZE*2, 9);
for (size_t i = 0; i < SIZE; i++) { // (T + (T mod R)*N) / R
t[i] = t[i+8];
}
if (mpn_cmp((const mp_limb_t*)t, p, SIZE) >= 0) {
mpn_sub_n(t, (const mp_limb_t*)t, p, SIZE);
}
mpn_copyi(z, (const mp_limb_t*)t, SIZE);
}
void invert1(mp_limb_t *r, const mp_limb_t* x, const mp_limb_t* p) {
mp_limb_t tp[mpn_sec_invert_itch(SIZE)];
mp_limb_t ap[SIZE];
mpn_copyi(ap, x, SIZE);
//mpn_sec_invert(r, ap, p, SIZE, 2*SIZE*GMP_NUMB_BITS, tp);
mpn_sec_invert(r, ap, p, SIZE, 521*2, tp);
}
void invert2(mp_limb_t *r, const mp_limb_t* x, const mp_limb_t* p) {
#if 0
mp_limb_t tp[SIZE*2];
mp_limb_t sp[SIZE*2];
mp_limb_t tmp[SIZE*2];
mp_limb_t b[SIZE];
#else
mp_limb_t *tp = new mp_limb_t[SIZE*2];
mp_limb_t *sp = new mp_limb_t[SIZE*2];
mp_limb_t *tmp = new mp_limb_t[SIZE*2];
mp_limb_t *b = new mp_limb_t[SIZE];
#endif
mpn_sqr(tmp, x, SIZE);
mod(b, tmp, p, tp, sp); // b <- x^{2}
mpn_mul_n(tmp, b, x, SIZE);
mod(b, tmp, p, tp, sp); // b <- x^{3}
mpn_sqr(tmp, b, SIZE);
mod(r, tmp, p, tp, sp);
mpn_sqr(tmp, r, SIZE);
mod(r, tmp, p, tp, sp); // r <- x^{12}
mpn_mul_n(tmp, r, b, SIZE);
mod(r, tmp, p, tp, sp); // r <- x^{15}
mpn_sqr(tmp, r, SIZE);
mod(b, tmp, p, tp, sp);
mpn_sqr(tmp, b, SIZE);
mod(b, tmp, p, tp, sp);
mpn_sqr(tmp, b, SIZE);
mod(b, tmp, p, tp, sp);
mpn_sqr(tmp, b, SIZE);
mod(b, tmp, p, tp, sp); // b <- x^{240}
mpn_mul_n(tmp, r, b, SIZE);
mod(r, tmp, p, tp, sp); // r <- x^{255}
mpn_sqr(tmp, r, SIZE);
mod(b, tmp, p, tp, sp);
mpn_sqr(tmp, b, SIZE);
mod(b, tmp, p, tp, sp);
mpn_sqr(tmp, b, SIZE);
mod(b, tmp, p, tp, sp);
mpn_sqr(tmp, b, SIZE);
mod(b, tmp, p, tp, sp);
mpn_sqr(tmp, b, SIZE);
mod(b, tmp, p, tp, sp);
mpn_sqr(tmp, b, SIZE);
mod(b, tmp, p, tp, sp);
mpn_sqr(tmp, b, SIZE);
mod(b, tmp, p, tp, sp);
mpn_sqr(tmp, b, SIZE);
mod(b, tmp, p, tp, sp); // b <- x^{65280}
mpn_sqr(tmp, b, SIZE);
mod(b, tmp, p, tp, sp); // b <- x^{130560}
mpn_sqr(tmp, r, SIZE);
mod(r, tmp, p, tp, sp); // r <- x^{510}
mpn_mul_n(tmp, r, x, SIZE);
mod(r, tmp, p, tp, sp); // r <- x^{511}
mpn_mul_n(tmp, b, r, SIZE);
mod(b, tmp, p, tp, sp); // b <- x^{131071}
for (size_t i = 0; i < (519-9)/17; i++) {
for (size_t t = 0; t < 17; t++) {
mpn_sqr(tmp, r, SIZE);
mod(r, tmp, p, tp, sp); // r <-
}
mpn_mul_n(tmp, r, b, SIZE); // r <- r * x^{131071}
mod(r, tmp, p, tp, sp);
}
mpn_sqr(tmp, r, SIZE);
mod(r, tmp, p, tp, sp);
mpn_sqr(tmp, r, SIZE);
mod(r, tmp, p, tp, sp);
mpn_mul_n(tmp, r, x, SIZE);
mod(r, tmp, p, tp, sp);
}
void invert3(mpz_t &z, const mpz_t &x, const mpz_t &p) {
mpz_invert(z, x, p);
}
void invert4(mp_limb_t *z, const mp_limb_t *x, const mp_limb_t *p) {
mp_limb_t g[SIZE];
mp_limb_t u[SIZE];
mp_limb_t v[SIZE];
mp_limb_t s[SIZE+1];
mp_size_t sn;
mpn_copyi(u, x, SIZE);
mpn_copyi(v, p, SIZE);
mpn_gcdext(g, s, &sn, u, SIZE, v, SIZE);
mpn_copyi(z, (const mp_limb_t*)s, SIZE);
if (sn < 0) {
mpn_sub_n(z, (const mp_limb_t*)p, (const mp_limb_t*)z, SIZE);
}
}
int main() {
mp_limb_t p[SIZE] = {
0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff,
0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff,
0x1ff
};
mp_limb_t a[SIZE] = {
0x9164821638291631L, 0x1372917431694363L, 0x9278732747374837L, 0x2913789724789173L,
0x6392641693692173L, 0xFFFFFFFFF3213729L, 0xDEADBEEFDEADBEEFL, 0x2913789724789173L, 0x1F};
mp_limb_t b[SIZE] = {0};
#if 0
dump(p);
dump(a);
invert2(b, (const mp_limb_t*)a, (const mp_limb_t*)p);
dump(b);
invert1(b, (const mp_limb_t*)a, (const mp_limb_t*)p);
dump(b);
invert4(b, (const mp_limb_t*)a, (const mp_limb_t*)p);
dump(b);
return 0;
#endif
const int N = 10000;
time_t begin, end;
begin = clock();
for (int i = 0; i < N; i++) {
invert1(b, (const mp_limb_t*)a, (const mp_limb_t*)p);
}
end = clock();
printf("[+] mpn_sec_invert: \t%f usec\n", double(end-begin)*1e6/CLOCKS_PER_SEC/N);
mpz_t z, x, mod;
mpz_init(z);
set_mpz_t(x, (const uint64_t*)a, SIZE);
set_mpz_t(mod, (const uint64_t*)p, SIZE);
begin = clock();
for (int i = 0; i < N; i++) {
invert3(z, x, mod);
}
end = clock();
printf("[+] mpz_invert: \t%f usec\n", double(end-begin)*1e6/CLOCKS_PER_SEC/N);
begin = clock();
for (int i = 0; i < N; i++) {
invert2(b, (const mp_limb_t*)a, (const mp_limb_t*)p);
}
end = clock();
printf("[+] invert(pow): \t%f usec\n", double(end-begin)*1e6/CLOCKS_PER_SEC/N);
begin = clock();
for (int i = 0; i < N; i++) {
invert4(b, (const mp_limb_t*)a, (const mp_limb_t*)p);
}
end = clock();
printf("[+] invert(extgcd): \t%f usec\n", double(end-begin)*1e6/CLOCKS_PER_SEC/N);
}
$ g++ pow521.cpp -lgmpxx -lgmp -Ofast && ./a.out
"""
[+] mpn_sec_invert: 62.267000 usec
[+] mpz_invert: 1.634500 usec
[+] invert(pow): 51.293900 usec
[+] invert(extgcd): 1.523400 usec
"""
# mpn_gcdextも追加した. mpz_invertよりも速い.
# mpn_sec_がつく関数は遅い.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment