Skip to content

Instantly share code, notes, and snippets.

@Hermann-SW
Last active June 7, 2021 02:36
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Hermann-SW/83c8ab9e10a0bb64d770af543ed08445 to your computer and use it in GitHub Desktop.
Save Hermann-SW/83c8ab9e10a0bb64d770af543ed08445 to your computer and use it in GitHub Desktop.
compares runtimes of 1 million additions of squares of 128bit numbers (my uint256_t / gmplib / ttmath)
/*
$ g++ -Ittmath -O3 -Wall -Wextra -pedantic -fomit-frame-pointer -m64 -mtune=skylake -march=broadwell -o sqr sqr.cpp -lgmp
$ for((i=0; i<5; ++i)); do ./sqr ; done | grep ns | sort -n
2786120ns(1)
2955138ns(1)
3052486ns(1)
3305607ns(1)
6627298ns(1)
14803914ns(3)
15377249ns(3)
15396184ns(3)
15749061ns(3)
16054087ns(2)
16415241ns(2)
16507217ns(2)
16900889ns(2)
17525356ns(3)
24720449ns(2)
$
*/
#include <ttmath/ttmath.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <inttypes.h>
#include <assert.h>
#include <time.h>
#include <sys/time.h>
#include "gmp.h"
#define N 1000000
#define UINT128_C(u) ((__uint128_t)u)
#define UINT128(h, l) (UINT128_C(h)<<64 | l)
void pu64(__uint64_t u) { printf("%" PRIu64, u); }
void pu640(__uint64_t u) { printf("%019" PRIu64, u); }
#define D19_ UINT64_C(10000000000000000000)
const __uint128_t d19_ = D19_;
const __uint128_t d38_ = (UINT128_C(D19_)*D19_);
const __uint128_t UINT128_MAX = UINT128(UINT64_MAX, UINT64_MAX);
void pu128(__uint128_t u)
{
if (u < d19_) pu64(u);
else if (u < d38_) { pu64(u/d19_); pu640(u%d19_); }
else { pu64(u/d38_); u%=d38_; pu640(u/d19_); pu640(u%d19_); }
}
typedef __uint128_t uint256_t[2];
#define UINT256(h, l) { l, h }
const uint256_t UINT256_MAX = { UINT128_MAX, UINT128_MAX };
void q(long l) { printf("%016lX ", l); }
void p(long l) { printf("%lX", l); }
void P(__uint128_t i) { if (i>UINT64_MAX) p(i>>64); q(i & 0xFFFFFFFFFFFFFFFF); }
void _P_256(const uint256_t x) { if (x[1]>0) P(x[1]); P(x[0]); }
#define P_256(l) { printf("%s: ", #l); _P_256(l); }
void set_256_(uint256_t d, const uint256_t s) { d[0]=s[0]; d[1]=s[1]; }
void set_256(uint256_t d, __uint128_t l, __uint128_t h) { d[0]=l; d[1]=h; }
void add_128(uint256_t d, uint256_t x, __uint128_t a)
{
d[0] = x[0] + a;
d[1] = (d[0] < a) ? x[1]+1 : x[1];
}
void add_256(uint256_t d, uint256_t x, const uint256_t a)
{
add_128(d, x, a[0]);
d[1] += a[1];
}
void sub_128(uint256_t d, uint256_t x, __uint128_t a)
{
if (x[0] < a) x[1] -= 1;
d[0] = x[0] - a;
}
void sub_256(uint256_t d, uint256_t x, const uint256_t a)
{
sub_128(d, x, a[0]);
d[1] -= a[1];
}
int lt_256(const uint256_t a, const uint256_t b)
{
return (a[1]<b[1]) || ((a[1]==b[1]) && (a[0]<b[0]));
}
int le_256(const uint256_t a, const uint256_t b)
{
return (a[1]<b[1]) || ((a[1]==b[1]) && (a[0]<=b[0]));
}
void shl_256(uint256_t d, long s)
{
d[1] <<= s;
d[1] += (d[0] >> (128-s));
d[0] <<= s;
}
void mul10(uint256_t d, uint256_t x)
{
uint256_t t = { x[0], x[1] };
shl_256(t, 2);
add_256(d, x, t);
shl_256(d, 1);
}
const uint256_t UINT256_MAX_10th = UINT256( UINT128(0x1999999999999999, 0x9999999999999999), UINT128(0x9999999999999999, 0x999999999999999A) );
void pu256_(uint256_t v, uint256_t t, const uint256_t o)
{
if (!lt_256(v, t) && le_256(o, UINT256_MAX_10th))
{
uint256_t nt, no = { t[0], t[1] };
mul10(nt, t);
pu256_(v, nt, no);
}
char d = '0';
while (le_256(o, v))
{
sub_256(v, v, o);
++d;
}
putchar(d);
}
void pu256(const uint256_t u)
{
if ((u[1]==0) && (u[0]==0)) putchar('0');
else
{
uint256_t v = { u[0], u[1] }, t = UINT256( 0, 10 ), o = UINT256( 0, 1 );
pu256_(v, t, o);
}
}
void sqr_128(uint256_t d, const __uint128_t x)
{
if (x <= UINT64_MAX)
{
d[0] = x*x;
d[1] = 0;
}
else
{
__uint128_t l = x & UINT64_MAX;
__uint128_t h = x >> 64;
uint256_t yz, w;
set_256(d, l * l, 0);
set_256(yz, l * h, 0);
shl_256(yz, 64+1);
set_256(w, 0, h * h);
add_256(d, d, yz);
add_256(d, d, w);
}
}
int main(int argc, char *argv[] __attribute__((unused))) {
if (argc>1) { pu256(UINT256_MAX); puts(""); return 0; }
struct timespec t0, t1;
uint256_t s, b;
volatile __uint128_t a = 0x0006000005000000L;
a *= a;
set_256(s, 0, 0);
clock_gettime(CLOCK_REALTIME, &t0);
for(int i=0; i<N; ++i)
{
sqr_128(b, a);
add_256(s, s, b);
}
clock_gettime(CLOCK_REALTIME, &t1);
printf("%ldns(1)\n", (t1.tv_sec-t0.tv_sec) * 1000000000 + t1.tv_nsec - t0.tv_nsec);
P_256(s); puts("");
pu256(s); puts("");
{
mpz_t a,s,t;
mpz_init(a);
mpz_init_set_si(s, 0);
mpz_init_set_si(t, 0x0006000050000000L);
mpz_mul(a, t, t);
clock_gettime(CLOCK_REALTIME, &t0);
for(int i=0; i<N; ++i)
{
mpz_mul(t, a, a);
mpz_add(s, s, t);
}
clock_gettime(CLOCK_REALTIME, &t1);
printf("%ldns(2)\n", (t1.tv_sec-t0.tv_sec) * 1000000000 + t1.tv_nsec - t0.tv_nsec);
gmp_printf ("%ZX\n", s);
gmp_printf ("%Zd\n", s);
}
{
// a,b are from <0, 2^256 - 1>
ttmath::UInt<4> s;
ttmath::UInt<4> a;
a = 0x0006000050000000L;
a *= a;
s = 0;
clock_gettime(CLOCK_REALTIME, &t0);
for(int i=0; i<N; ++i)
s += a*a;
clock_gettime(CLOCK_REALTIME, &t1);
printf("%ldns(3)\n", (t1.tv_sec-t0.tv_sec) * 1000000000 + t1.tv_nsec - t0.tv_nsec);
std::cout << s << std::endl;
}
return 0;
}
@Hermann-SW
Copy link
Author

Hermann-SW commented Jun 6, 2021

For discussion in https://stackoverflow.com/a/67839515/5674289
ttmath shows speedup 1.08 over gmplib.
I prefer my uint256_t showing speedup 5.76 over gmplib.

(on 64bit i7 "ttmath::UInt<4>" is 256bit number; https://ttmath.org)

@Hermann-SW
Copy link
Author

Hermann-SW commented Jun 7, 2021

I added const declarations for function args where possible.
New function "pu256()" for printing 256bit number decimally is discussed here:
https://stackoverflow.com/a/67865257/5674289

if (argc>1)  { pu256(UINT256_MAX); puts(""); return 0; }

$ ./sqr 1
115792089237316195423570985008687907853269984665640564039457584007913129639935
$

Calling "sqr" without args gives the benchmarking output for uint256_t/gmplib/ttmath:

$ ./sqr
6244884ns(1)
s: 4D3F65017DF941DD76B2D05E 2540BE40000000000000000000000000 
8135125465365149450951398141698102514929168687243244953665536000000
25238723ns(2)
4D3F7417E0C1DD812D060540BE400000000000000000000000000000
8135149709954218948301673480404660644185484305479022816526336000000
17200108ns(3)
8135149709954218948301673480404660644185484305479022816526336000000
$

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment