Last active
June 7, 2021 02:36
-
-
Save Hermann-SW/83c8ab9e10a0bb64d770af543ed08445 to your computer and use it in GitHub Desktop.
compares runtimes of 1 million additions of squares of 128bit numbers (my uint256_t / gmplib / ttmath)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
$ g++ -Ittmath -O3 -Wall -Wextra -pedantic -fomit-frame-pointer -m64 -mtune=skylake -march=broadwell -o sqr sqr.cpp -lgmp | |
$ for((i=0; i<5; ++i)); do ./sqr ; done | grep ns | sort -n | |
2786120ns(1) | |
2955138ns(1) | |
3052486ns(1) | |
3305607ns(1) | |
6627298ns(1) | |
14803914ns(3) | |
15377249ns(3) | |
15396184ns(3) | |
15749061ns(3) | |
16054087ns(2) | |
16415241ns(2) | |
16507217ns(2) | |
16900889ns(2) | |
17525356ns(3) | |
24720449ns(2) | |
$ | |
*/ | |
#include <ttmath/ttmath.h> | |
#include <stdio.h> | |
#include <stdlib.h> | |
#include <stdint.h> | |
#include <inttypes.h> | |
#include <assert.h> | |
#include <time.h> | |
#include <sys/time.h> | |
#include "gmp.h" | |
#define N 1000000 | |
#define UINT128_C(u) ((__uint128_t)u) | |
#define UINT128(h, l) (UINT128_C(h)<<64 | l) | |
void pu64(__uint64_t u) { printf("%" PRIu64, u); } | |
void pu640(__uint64_t u) { printf("%019" PRIu64, u); } | |
#define D19_ UINT64_C(10000000000000000000) | |
const __uint128_t d19_ = D19_; | |
const __uint128_t d38_ = (UINT128_C(D19_)*D19_); | |
const __uint128_t UINT128_MAX = UINT128(UINT64_MAX, UINT64_MAX); | |
void pu128(__uint128_t u) | |
{ | |
if (u < d19_) pu64(u); | |
else if (u < d38_) { pu64(u/d19_); pu640(u%d19_); } | |
else { pu64(u/d38_); u%=d38_; pu640(u/d19_); pu640(u%d19_); } | |
} | |
typedef __uint128_t uint256_t[2]; | |
#define UINT256(h, l) { l, h } | |
const uint256_t UINT256_MAX = { UINT128_MAX, UINT128_MAX }; | |
void q(long l) { printf("%016lX ", l); } | |
void p(long l) { printf("%lX", l); } | |
void P(__uint128_t i) { if (i>UINT64_MAX) p(i>>64); q(i & 0xFFFFFFFFFFFFFFFF); } | |
void _P_256(const uint256_t x) { if (x[1]>0) P(x[1]); P(x[0]); } | |
#define P_256(l) { printf("%s: ", #l); _P_256(l); } | |
void set_256_(uint256_t d, const uint256_t s) { d[0]=s[0]; d[1]=s[1]; } | |
void set_256(uint256_t d, __uint128_t l, __uint128_t h) { d[0]=l; d[1]=h; } | |
void add_128(uint256_t d, uint256_t x, __uint128_t a) | |
{ | |
d[0] = x[0] + a; | |
d[1] = (d[0] < a) ? x[1]+1 : x[1]; | |
} | |
void add_256(uint256_t d, uint256_t x, const uint256_t a) | |
{ | |
add_128(d, x, a[0]); | |
d[1] += a[1]; | |
} | |
void sub_128(uint256_t d, uint256_t x, __uint128_t a) | |
{ | |
if (x[0] < a) x[1] -= 1; | |
d[0] = x[0] - a; | |
} | |
void sub_256(uint256_t d, uint256_t x, const uint256_t a) | |
{ | |
sub_128(d, x, a[0]); | |
d[1] -= a[1]; | |
} | |
int lt_256(const uint256_t a, const uint256_t b) | |
{ | |
return (a[1]<b[1]) || ((a[1]==b[1]) && (a[0]<b[0])); | |
} | |
int le_256(const uint256_t a, const uint256_t b) | |
{ | |
return (a[1]<b[1]) || ((a[1]==b[1]) && (a[0]<=b[0])); | |
} | |
void shl_256(uint256_t d, long s) | |
{ | |
d[1] <<= s; | |
d[1] += (d[0] >> (128-s)); | |
d[0] <<= s; | |
} | |
void mul10(uint256_t d, uint256_t x) | |
{ | |
uint256_t t = { x[0], x[1] }; | |
shl_256(t, 2); | |
add_256(d, x, t); | |
shl_256(d, 1); | |
} | |
const uint256_t UINT256_MAX_10th = UINT256( UINT128(0x1999999999999999, 0x9999999999999999), UINT128(0x9999999999999999, 0x999999999999999A) ); | |
void pu256_(uint256_t v, uint256_t t, const uint256_t o) | |
{ | |
if (!lt_256(v, t) && le_256(o, UINT256_MAX_10th)) | |
{ | |
uint256_t nt, no = { t[0], t[1] }; | |
mul10(nt, t); | |
pu256_(v, nt, no); | |
} | |
char d = '0'; | |
while (le_256(o, v)) | |
{ | |
sub_256(v, v, o); | |
++d; | |
} | |
putchar(d); | |
} | |
void pu256(const uint256_t u) | |
{ | |
if ((u[1]==0) && (u[0]==0)) putchar('0'); | |
else | |
{ | |
uint256_t v = { u[0], u[1] }, t = UINT256( 0, 10 ), o = UINT256( 0, 1 ); | |
pu256_(v, t, o); | |
} | |
} | |
void sqr_128(uint256_t d, const __uint128_t x) | |
{ | |
if (x <= UINT64_MAX) | |
{ | |
d[0] = x*x; | |
d[1] = 0; | |
} | |
else | |
{ | |
__uint128_t l = x & UINT64_MAX; | |
__uint128_t h = x >> 64; | |
uint256_t yz, w; | |
set_256(d, l * l, 0); | |
set_256(yz, l * h, 0); | |
shl_256(yz, 64+1); | |
set_256(w, 0, h * h); | |
add_256(d, d, yz); | |
add_256(d, d, w); | |
} | |
} | |
int main(int argc, char *argv[] __attribute__((unused))) { | |
if (argc>1) { pu256(UINT256_MAX); puts(""); return 0; } | |
struct timespec t0, t1; | |
uint256_t s, b; | |
volatile __uint128_t a = 0x0006000005000000L; | |
a *= a; | |
set_256(s, 0, 0); | |
clock_gettime(CLOCK_REALTIME, &t0); | |
for(int i=0; i<N; ++i) | |
{ | |
sqr_128(b, a); | |
add_256(s, s, b); | |
} | |
clock_gettime(CLOCK_REALTIME, &t1); | |
printf("%ldns(1)\n", (t1.tv_sec-t0.tv_sec) * 1000000000 + t1.tv_nsec - t0.tv_nsec); | |
P_256(s); puts(""); | |
pu256(s); puts(""); | |
{ | |
mpz_t a,s,t; | |
mpz_init(a); | |
mpz_init_set_si(s, 0); | |
mpz_init_set_si(t, 0x0006000050000000L); | |
mpz_mul(a, t, t); | |
clock_gettime(CLOCK_REALTIME, &t0); | |
for(int i=0; i<N; ++i) | |
{ | |
mpz_mul(t, a, a); | |
mpz_add(s, s, t); | |
} | |
clock_gettime(CLOCK_REALTIME, &t1); | |
printf("%ldns(2)\n", (t1.tv_sec-t0.tv_sec) * 1000000000 + t1.tv_nsec - t0.tv_nsec); | |
gmp_printf ("%ZX\n", s); | |
gmp_printf ("%Zd\n", s); | |
} | |
{ | |
// a,b are from <0, 2^256 - 1> | |
ttmath::UInt<4> s; | |
ttmath::UInt<4> a; | |
a = 0x0006000050000000L; | |
a *= a; | |
s = 0; | |
clock_gettime(CLOCK_REALTIME, &t0); | |
for(int i=0; i<N; ++i) | |
s += a*a; | |
clock_gettime(CLOCK_REALTIME, &t1); | |
printf("%ldns(3)\n", (t1.tv_sec-t0.tv_sec) * 1000000000 + t1.tv_nsec - t0.tv_nsec); | |
std::cout << s << std::endl; | |
} | |
return 0; | |
} |
I added const declarations for function args where possible.
New function "pu256()" for printing 256bit number decimally is discussed here:
https://stackoverflow.com/a/67865257/5674289
if (argc>1) { pu256(UINT256_MAX); puts(""); return 0; }
$ ./sqr 1
115792089237316195423570985008687907853269984665640564039457584007913129639935
$
Calling "sqr" without args gives the benchmarking output for uint256_t/gmplib/ttmath:
$ ./sqr
6244884ns(1)
s: 4D3F65017DF941DD76B2D05E 2540BE40000000000000000000000000
8135125465365149450951398141698102514929168687243244953665536000000
25238723ns(2)
4D3F7417E0C1DD812D060540BE400000000000000000000000000000
8135149709954218948301673480404660644185484305479022816526336000000
17200108ns(3)
8135149709954218948301673480404660644185484305479022816526336000000
$
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
For discussion in https://stackoverflow.com/a/67839515/5674289
ttmath shows speedup 1.08 over gmplib.
I prefer my uint256_t showing speedup 5.76 over gmplib.
(on 64bit i7 "ttmath::UInt<4>" is 256bit number; https://ttmath.org)