Hermann-SW/sqr.cpp

## sqr.cpp
/*
$ g++ -Ittmath -O3 -Wall -Wextra -pedantic -fomit-frame-pointer -m64 -mtune=skylake -march=broadwell -o sqr sqr.cpp -lgmp
$ for((i=0; i<5; ++i)); do ./sqr ; done | grep ns | sort -n
2786120ns(1)
2955138ns(1)
3052486ns(1)
3305607ns(1)
6627298ns(1)
14803914ns(3)
15377249ns(3)
15396184ns(3)
15749061ns(3)
16054087ns(2)
16415241ns(2)
16507217ns(2)
16900889ns(2)
17525356ns(3)
24720449ns(2)
$
*/
#include <ttmath/ttmath.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <inttypes.h>
#include <assert.h>
#include <time.h>
#include <sys/time.h>

#include "gmp.h"

#define N 1000000

#define UINT128_C(u)     ((__uint128_t)u)
#define UINT128(h, l)    (UINT128_C(h)<<64 | l)

void pu64(__uint64_t u)   { printf("%" PRIu64, u); }
void pu640(__uint64_t u)  { printf("%019" PRIu64, u); }

#define D19_ UINT64_C(10000000000000000000)
const __uint128_t d19_ = D19_;
const __uint128_t d38_ = (UINT128_C(D19_)*D19_);

const __uint128_t UINT128_MAX = UINT128(UINT64_MAX, UINT64_MAX);

void pu128(__uint128_t u)
{
       if (u < d19_) pu64(u);
  else if (u < d38_) { pu64(u/d19_); pu640(u%d19_); }
  else               { pu64(u/d38_); u%=d38_; pu640(u/d19_); pu640(u%d19_); }
}

typedef __uint128_t uint256_t[2];

#define UINT256(h, l) { l, h }

const uint256_t UINT256_MAX = { UINT128_MAX,  UINT128_MAX };

void q(long l)           { printf("%016lX ", l); }
void p(long l)           { printf("%lX", l); }
void P(__uint128_t i)    { if (i>UINT64_MAX) p(i>>64); q(i & 0xFFFFFFFFFFFFFFFF); }
void _P_256(const uint256_t x) { if (x[1]>0) P(x[1]); P(x[0]); }
#define P_256(l)         { printf("%s: ", #l); _P_256(l); }

void set_256_(uint256_t d, const uint256_t s)  { d[0]=s[0]; d[1]=s[1]; }
void set_256(uint256_t d, __uint128_t l, __uint128_t h)  { d[0]=l; d[1]=h; }

void add_128(uint256_t d, uint256_t x, __uint128_t a)
{
  d[0] = x[0] + a;
  d[1] = (d[0] < a) ? x[1]+1 : x[1];
}
void add_256(uint256_t d, uint256_t x, const uint256_t a)
{
  add_128(d, x, a[0]);
  d[1] += a[1];
}

void sub_128(uint256_t d, uint256_t x, __uint128_t a)
{
  if (x[0] < a)  x[1] -= 1;
  d[0] = x[0] - a;
}
void sub_256(uint256_t d, uint256_t x, const uint256_t a)
{
  sub_128(d, x, a[0]);
  d[1] -= a[1];
}

int lt_256(const uint256_t a, const uint256_t b)
{
  return (a[1]<b[1]) || ((a[1]==b[1]) && (a[0]<b[0]));
}

int le_256(const uint256_t a, const uint256_t b)
{
  return (a[1]<b[1]) || ((a[1]==b[1]) && (a[0]<=b[0]));
}

void shl_256(uint256_t d, long s)
{
  d[1] <<= s;
  d[1] += (d[0] >> (128-s));
  d[0] <<= s;
}

void mul10(uint256_t d, uint256_t x)
{
  uint256_t t = { x[0], x[1] };
  shl_256(t, 2);
  add_256(d, x, t);
  shl_256(d, 1);
}

const uint256_t UINT256_MAX_10th = UINT256( UINT128(0x1999999999999999, 0x9999999999999999), UINT128(0x9999999999999999, 0x999999999999999A) );

void pu256_(uint256_t v, uint256_t t, const uint256_t o)
{
  if (!lt_256(v, t) && le_256(o, UINT256_MAX_10th))
  {
    uint256_t nt, no = { t[0], t[1] };
    mul10(nt, t);
    pu256_(v, nt, no);
  }
  char d = '0';
  while (le_256(o, v))
  {
    sub_256(v, v, o);
    ++d;
  }
  putchar(d);
}

void pu256(const uint256_t u)
{
  if ((u[1]==0) && (u[0]==0))  putchar('0');
  else
  {
    uint256_t v = { u[0], u[1] }, t = UINT256( 0, 10 ), o = UINT256( 0, 1 );
    pu256_(v, t, o);
  }
}

void sqr_128(uint256_t d, const __uint128_t x)
{
  if (x <= UINT64_MAX)
  {
    d[0] = x*x;
    d[1] = 0;
  }
  else
  {
    __uint128_t l = x & UINT64_MAX;
    __uint128_t h = x >> 64;

    uint256_t yz, w;
    set_256(d,  l * l, 0);
    set_256(yz, l * h, 0);
    shl_256(yz, 64+1);
    set_256(w,  0, h * h);
    add_256(d, d, yz);
    add_256(d, d, w);
  }
}


int main(int argc, char *argv[] __attribute__((unused))) {
  if (argc>1)  { pu256(UINT256_MAX); puts(""); return 0; }

  struct timespec t0, t1;
  uint256_t s, b;
  volatile __uint128_t a = 0x0006000005000000L;
  a *= a;

  set_256(s, 0, 0);
  clock_gettime(CLOCK_REALTIME, &t0);
  for(int i=0; i<N; ++i)
  {
    sqr_128(b, a);
    add_256(s, s, b);
  }
  clock_gettime(CLOCK_REALTIME, &t1);
  printf("%ldns(1)\n", (t1.tv_sec-t0.tv_sec) * 1000000000 + t1.tv_nsec - t0.tv_nsec);
  P_256(s); puts("");
  pu256(s); puts("");

  {
    mpz_t a,s,t;

    mpz_init(a);
    mpz_init_set_si(s, 0);
    mpz_init_set_si(t, 0x0006000050000000L);
    mpz_mul(a, t, t);

    clock_gettime(CLOCK_REALTIME, &t0);
    for(int i=0; i<N; ++i)
    {
      mpz_mul(t, a, a);
      mpz_add(s, s, t);
    }
    clock_gettime(CLOCK_REALTIME, &t1);
    printf("%ldns(2)\n", (t1.tv_sec-t0.tv_sec) * 1000000000 + t1.tv_nsec - t0.tv_nsec);
    gmp_printf ("%ZX\n", s);
    gmp_printf ("%Zd\n", s);
  }

 {
  // a,b are from <0, 2^256 - 1>
  ttmath::UInt<4> s;
  ttmath::UInt<4> a;

  a = 0x0006000050000000L;
  a *= a;

  s = 0;
  clock_gettime(CLOCK_REALTIME, &t0);
  for(int i=0; i<N; ++i)
    s += a*a;
  clock_gettime(CLOCK_REALTIME, &t1);
  printf("%ldns(3)\n", (t1.tv_sec-t0.tv_sec) * 1000000000 + t1.tv_nsec - t0.tv_nsec);
  std::cout << s << std::endl;
 }

  return 0;
}
	/*
	$ g++ -Ittmath -O3 -Wall -Wextra -pedantic -fomit-frame-pointer -m64 -mtune=skylake -march=broadwell -o sqr sqr.cpp -lgmp
	$ for((i=0; i<5; ++i)); do ./sqr ; done \| grep ns \| sort -n
	2786120ns(1)
	2955138ns(1)
	3052486ns(1)
	3305607ns(1)
	6627298ns(1)
	14803914ns(3)
	15377249ns(3)
	15396184ns(3)
	15749061ns(3)
	16054087ns(2)
	16415241ns(2)
	16507217ns(2)
	16900889ns(2)
	17525356ns(3)
	24720449ns(2)
	$
	*/
	#include <ttmath/ttmath.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <stdint.h>
	#include <inttypes.h>
	#include <assert.h>
	#include <time.h>
	#include <sys/time.h>

	#include "gmp.h"

	#define N 1000000

	#define UINT128_C(u) ((__uint128_t)u)
	#define UINT128(h, l) (UINT128_C(h)<<64 \| l)

	void pu64(__uint64_t u) { printf("%" PRIu64, u); }
	void pu640(__uint64_t u) { printf("%019" PRIu64, u); }

	#define D19_ UINT64_C(10000000000000000000)
	const __uint128_t d19_ = D19_;
	const __uint128_t d38_ = (UINT128_C(D19_)*D19_);

	const __uint128_t UINT128_MAX = UINT128(UINT64_MAX, UINT64_MAX);

	void pu128(__uint128_t u)
	{
	if (u < d19_) pu64(u);
	else if (u < d38_) { pu64(u/d19_); pu640(u%d19_); }
	else { pu64(u/d38_); u%=d38_; pu640(u/d19_); pu640(u%d19_); }
	}

	typedef __uint128_t uint256_t[2];

	#define UINT256(h, l) { l, h }

	const uint256_t UINT256_MAX = { UINT128_MAX, UINT128_MAX };

	void q(long l) { printf("%016lX ", l); }
	void p(long l) { printf("%lX", l); }
	void P(__uint128_t i) { if (i>UINT64_MAX) p(i>>64); q(i & 0xFFFFFFFFFFFFFFFF); }
	void _P_256(const uint256_t x) { if (x[1]>0) P(x[1]); P(x[0]); }
	#define P_256(l) { printf("%s: ", #l); _P_256(l); }

	void set_256_(uint256_t d, const uint256_t s) { d[0]=s[0]; d[1]=s[1]; }
	void set_256(uint256_t d, __uint128_t l, __uint128_t h) { d[0]=l; d[1]=h; }

	void add_128(uint256_t d, uint256_t x, __uint128_t a)
	{
	d[0] = x[0] + a;
	d[1] = (d[0] < a) ? x[1]+1 : x[1];
	}
	void add_256(uint256_t d, uint256_t x, const uint256_t a)
	{
	add_128(d, x, a[0]);
	d[1] += a[1];
	}

	void sub_128(uint256_t d, uint256_t x, __uint128_t a)
	{
	if (x[0] < a) x[1] -= 1;
	d[0] = x[0] - a;
	}
	void sub_256(uint256_t d, uint256_t x, const uint256_t a)
	{
	sub_128(d, x, a[0]);
	d[1] -= a[1];
	}

	int lt_256(const uint256_t a, const uint256_t b)
	{
	return (a[1]<b[1]) \|\| ((a[1]==b[1]) && (a[0]<b[0]));
	}

	int le_256(const uint256_t a, const uint256_t b)
	{
	return (a[1]<b[1]) \|\| ((a[1]==b[1]) && (a[0]<=b[0]));
	}

	void shl_256(uint256_t d, long s)
	{
	d[1] <<= s;
	d[1] += (d[0] >> (128-s));
	d[0] <<= s;
	}

	void mul10(uint256_t d, uint256_t x)
	{
	uint256_t t = { x[0], x[1] };
	shl_256(t, 2);
	add_256(d, x, t);
	shl_256(d, 1);
	}

	const uint256_t UINT256_MAX_10th = UINT256( UINT128(0x1999999999999999, 0x9999999999999999), UINT128(0x9999999999999999, 0x999999999999999A) );

	void pu256_(uint256_t v, uint256_t t, const uint256_t o)
	{
	if (!lt_256(v, t) && le_256(o, UINT256_MAX_10th))
	{
	uint256_t nt, no = { t[0], t[1] };
	mul10(nt, t);
	pu256_(v, nt, no);
	}
	char d = '0';
	while (le_256(o, v))
	{
	sub_256(v, v, o);
	++d;
	}
	putchar(d);
	}

	void pu256(const uint256_t u)
	{
	if ((u[1]==0) && (u[0]==0)) putchar('0');
	else
	{
	uint256_t v = { u[0], u[1] }, t = UINT256( 0, 10 ), o = UINT256( 0, 1 );
	pu256_(v, t, o);
	}
	}

	void sqr_128(uint256_t d, const __uint128_t x)
	{
	if (x <= UINT64_MAX)
	{
	d[0] = x*x;
	d[1] = 0;
	}
	else
	{
	__uint128_t l = x & UINT64_MAX;
	__uint128_t h = x >> 64;

	uint256_t yz, w;
	set_256(d, l * l, 0);
	set_256(yz, l * h, 0);
	shl_256(yz, 64+1);
	set_256(w, 0, h * h);
	add_256(d, d, yz);
	add_256(d, d, w);
	}
	}


	int main(int argc, char *argv[] __attribute__((unused))) {
	if (argc>1) { pu256(UINT256_MAX); puts(""); return 0; }

	struct timespec t0, t1;
	uint256_t s, b;
	volatile __uint128_t a = 0x0006000005000000L;
	a *= a;

	set_256(s, 0, 0);
	clock_gettime(CLOCK_REALTIME, &t0);
	for(int i=0; i<N; ++i)
	{
	sqr_128(b, a);
	add_256(s, s, b);
	}
	clock_gettime(CLOCK_REALTIME, &t1);
	printf("%ldns(1)\n", (t1.tv_sec-t0.tv_sec) * 1000000000 + t1.tv_nsec - t0.tv_nsec);
	P_256(s); puts("");
	pu256(s); puts("");

	{
	mpz_t a,s,t;

	mpz_init(a);
	mpz_init_set_si(s, 0);
	mpz_init_set_si(t, 0x0006000050000000L);
	mpz_mul(a, t, t);

	clock_gettime(CLOCK_REALTIME, &t0);
	for(int i=0; i<N; ++i)
	{
	mpz_mul(t, a, a);
	mpz_add(s, s, t);
	}
	clock_gettime(CLOCK_REALTIME, &t1);
	printf("%ldns(2)\n", (t1.tv_sec-t0.tv_sec) * 1000000000 + t1.tv_nsec - t0.tv_nsec);
	gmp_printf ("%ZX\n", s);
	gmp_printf ("%Zd\n", s);
	}

	{
	// a,b are from <0, 2^256 - 1>
	ttmath::UInt<4> s;
	ttmath::UInt<4> a;

	a = 0x0006000050000000L;
	a *= a;

	s = 0;
	clock_gettime(CLOCK_REALTIME, &t0);
	for(int i=0; i<N; ++i)
	s += a*a;
	clock_gettime(CLOCK_REALTIME, &t1);
	printf("%ldns(3)\n", (t1.tv_sec-t0.tv_sec) * 1000000000 + t1.tv_nsec - t0.tv_nsec);
	std::cout << s << std::endl;
	}

	return 0;
	}