mrkn/bigmul1_toom3.patch

## bigmul1_toom3.patch
diff --git a/bignum.c b/bignum.c
index 152018e..046691e 100644
--- a/bignum.c
+++ b/bignum.c
@@ -22,6 +22,8 @@

 VALUE rb_cBignum;

+static VALUE big_three = Qnil;
+
 #if defined __MINGW32__
 #define USHORT _USHORT
 #endif
@@ -29,6 +31,7 @@ VALUE rb_cBignum;
 #define BDIGITS(x) (RBIGNUM_DIGITS(x))
 #define BITSPERDIG (SIZEOF_BDIGITS*CHAR_BIT)
 #define BIGRAD ((BDIGIT_DBL)1 << BITSPERDIG)
+#define BIGRAD_HALF ((BDIGIT)(BIGRAD >> 1))
 #define DIGSPERLONG (SIZEOF_LONG/SIZEOF_BDIGITS)
 #if HAVE_LONG_LONG
 # define DIGSPERLL (SIZEOF_LONG_LONG/SIZEOF_BDIGITS)
@@ -42,6 +45,31 @@ VALUE rb_cBignum;
 		     (BDIGITS(x)[0] == 0 && \
 		      (RBIGNUM_LEN(x) == 1 || bigzero_p(x))))

+#define BIGNUM_DEBUG 0
+#if BIGNUM_DEBUG
+#define ON_DEBUG(x) do { x; } while (0)
+static void
+dump_bignum(VALUE x)
+{
+    long i;
+    printf("%c0x0", RBIGNUM_SIGN(x) ? '+' : '-');
+    for (i = RBIGNUM_LEN(x); i--; ) {
+	printf("_%08"PRIxBDIGIT, BDIGITS(x)[i]);
+    }
+    printf(", len=%lu", RBIGNUM_LEN(x));
+    puts("");
+}
+
+static VALUE
+rb_big_dump(VALUE x)
+{
+    dump_bignum(x);
+    return x;
+}
+#else
+#define ON_DEBUG(x)
+#endif
+
 static int
 bigzero_p(VALUE x)
 {
@@ -1683,7 +1711,7 @@ bigsub(VALUE x, VALUE y)
     long i = RBIGNUM_LEN(x);
     BDIGIT *xds, *yds;

-    /* if x is larger than y, swap */
+    /* if x is smaller than y, swap */
     if (RBIGNUM_LEN(x) < RBIGNUM_LEN(y)) {
 	z = x; x = y; y = z;	/* swap x y */
     }
@@ -2019,7 +2047,7 @@ bigmul1_balance(VALUE x, VALUE y)

     xn = RBIGNUM_LEN(x);
     yn = RBIGNUM_LEN(y);
-    assert(2 * xn <= yn);
+    assert(2 * xn <= yn || 3 * xn <= 2*(yn+2));

     z = bignew(xn + yn, RBIGNUM_SIGN(x)==RBIGNUM_SIGN(y));
     t1 = bignew(xn, 1);
@@ -2059,10 +2087,15 @@ big_split(VALUE v, long n, volatile VALUE *ph, volatile VALUE *pl)
 	ln = n;
     }

-    while (--hn && !vds[hn + ln]);
-    h = bignew(hn += 2, 1);
-    MEMCPY(BDIGITS(h), vds + ln, BDIGIT, hn - 1);
-    BDIGITS(h)[hn - 1] = 0; /* margin for carry */
+    if (!hn) {
+	h = rb_uint2big(0);
+    }
+    else {
+	while (--hn && !vds[hn + ln]);
+	h = bignew(hn += 2, 1);
+	MEMCPY(BDIGITS(h), vds + ln, BDIGIT, hn - 1);
+	BDIGITS(h)[hn - 1] = 0; /* margin for carry */
+    }

     while (--ln && !vds[ln]);
     l = bignew(ln += 2, 1);
@@ -2169,6 +2202,277 @@ bigmul1_karatsuba(VALUE x, VALUE y)
     return z;
 }

+static void
+biglsh_bang(BDIGIT *xds, long xn, unsigned long shift)
+{
+    long const s1 = shift/BITSPERDIG;
+    int const s2 = (int)(shift%BITSPERDIG);
+    int const s3 = BITSPERDIG-s2;
+    BDIGIT* zds;
+    BDIGIT num;
+    long i;
+    if (s1 >= xn) {
+	MEMZERO(xds, BDIGIT, xn);
+	return;
+    }
+    zds = xds + xn - 1;
+    xn -= s1 + 1;
+    num = xds[xn]<<s2;
+    do {
+	*zds-- = num | xds[--xn]>>s3;
+	num = xds[xn]<<s2;
+    }
+    while (xn > 0);
+    *zds = num;
+    for (i = s1; i > 0; --i)
+	*zds-- = 0;
+}
+
+static void
+bigrsh_bang(BDIGIT* xds, long xn, unsigned long shift)
+{
+    long s1 = shift/BITSPERDIG;
+    int s2 = (int)(shift%BITSPERDIG);
+    int s3 = BITSPERDIG - s2;
+    int i;
+    BDIGIT num;
+    BDIGIT* zds;
+    if (s1 >= xn) {
+	MEMZERO(xds, BDIGIT, xn);
+	return;
+    }
+
+    i = 0;
+    zds = xds + s1;
+    num = *zds++>>s2;
+    do {
+	xds[i++] = (BDIGIT)(*zds<<s3) | num;
+	num = *zds++>>s2;
+    }
+    while (i < xn - s1 - 1);
+    xds[i] = num;
+    MEMZERO(xds + xn - s1, BDIGIT, s1);
+}
+
+#ifdef BIGNUM_DEBUG
+static VALUE
+rb_big_lsh_bang(VALUE x, VALUE n)
+{
+    unsigned long shift;
+    long m, xn, s1;
+    int s2;
+    BDIGIT* xds = BDIGITS(x);
+
+    shift = NUM2ULONG(n);
+    s1 = shift/BITSPERDIG;
+    s2 = (int)(shift%BITSPERDIG);
+
+    m = 0;
+    xn = RBIGNUM_LEN(x);
+    while (!xds[--xn]);
+    while ((xds[xn] & (BIGRAD_HALF>>m)) == 0) ++m;
+    ++xn;
+
+    if (s1 > RBIGNUM_LEN(x) - xn) xn += s1;
+    if (s2 > m) ++xn;
+    rb_big_resize(x, xn);
+
+    biglsh_bang(xds, xn, shift);
+
+    return x;
+}
+
+static VALUE
+rb_big_rsh_bang(VALUE x, VALUE n)
+{
+    bigrsh_bang(BDIGITS(x), RBIGNUM_LEN(x), NUM2ULONG(n));
+    return x;
+}
+#endif
+
+static void
+big_split3(VALUE v, long n, volatile VALUE* p0, volatile VALUE* p1, volatile VALUE* p2)
+{
+    VALUE v0, v12, v1, v2;
+
+    big_split(v, n, &v12, &v0);
+    big_split(v12, n, &v2, &v1);
+
+    *p0 = bigtrunc(v0);
+    *p1 = bigtrunc(v1);
+    *p2 = bigtrunc(v2);
+}
+
+static VALUE big_lshift(VALUE, unsigned long);
+static VALUE big_rshift(VALUE, unsigned long);
+static VALUE bigdivrem(VALUE, VALUE, volatile VALUE*, volatile VALUE*);
+
+static VALUE
+bigmul1_toom3(VALUE x, VALUE y)
+{
+    long i, n, xn, yn, zn;
+    VALUE x0, x1, x2, y0, y1, y2;
+    VALUE u0, u1, u2, u3, u4, v1, v2, v3;
+    VALUE z0, z1, z2, z3, z4, z, t;
+    BDIGIT* zds;
+
+    xn = RBIGNUM_LEN(x);
+    yn = RBIGNUM_LEN(y);
+    assert(xn <= yn);  /* assume y >= x */
+
+    n = (yn + 2) / 3;
+    big_split3(x, n, &x0, &x1, &x2);
+    if (x == y) {
+	y0 = x0; y1 = x1; y2 = x2;
+    }
+    else big_split3(y, n, &y0, &y1, &y2);
+
+    /*
+     * ref. http://en.wikipedia.org/wiki/Toom%E2%80%93Cook_multiplication
+     *
+     * x(b) = x0 * b^0 + x1 * b^1 + x2 * b^2
+     * y(b) = y0 * b^0 + y1 * b^1 + y2 * b^2
+     *
+     * z(b) = x(b) * y(b)
+     * z(b) = z0 * b^0 + z1 * b^1 + z2 * b^2 + z3 * b^3 + z4 * b^4
+     * where:
+     *   z0 = x0 * y0
+     *   z1 = x0 * y1 + x1 * y0
+     *   z2 = x0 * y2 + x1 * y1 + x2 * y0
+     *   z3 = x1 * y2 + x2 * y1
+     *   z4 = x2 * y2
+     *
+     * Toom3 method (a.k.a. Toom-Cook method):
+     * (Step1) calculating 5 points z(b0), z(b1), z(b2), z(b3), z(b4),
+     * where:
+     *   b0 = 0, b1 = 1, b2 = -1, b3 = -2, b4 = inf,
+     *   z(0)   = x(0)   * y(0)   = x0 * y0
+     *   z(1)   = x(1)   * y(1)   = (x0 + x1 + x2) * (y0 + y1 + y2)
+     *   z(-1)  = x(-1)  * y(-1)  = (x0 - x1 + x2) * (y0 - y1 + y2)
+     *   z(-2)  = x(-2)  * y(-2)  = (x0 - 2 * (x1 - 2 * x2)) * (y0 - 2 * (y1 - 2 * y2))
+     *   z(inf) = x(inf) * y(inf) = x2 * y2
+     *
+     * (Step2) interpolating z0, z1, z2, z3, z4, and z5.
+     *
+     * (Step3) Substituting base value into b of the polynomial z(b),
+     */
+
+    /*
+     * [Step1] calculating 5 points z(b0), z(b1), z(b2), z(b3), z(b4)
+     */
+
+    /* u1 <- x0 + x2 */
+    u1 = bigtrunc(bigadd(x0, x2, 1));
+
+    /* x(-1) : u2 <- u1 - x1 = x0 - x1 + x2 */
+    u2 = bigtrunc(bigsub(u1, x1));
+
+    /* x(1) : u1 <- u1 + x1 = x0 + x1 + x2 */
+    u1 = bigtrunc(bigadd(u1, x1, 1));
+
+    /* x(-2) : u3 <- 2 * (u2 + x2) - x0 = x0 - 2 * (x1 - 2 * x2) */
+    u3 = bigadd(u2, x2, 1);
+    if (BDIGITS(u3)[RBIGNUM_LEN(u3)-1] & BIGRAD_HALF) {
+	rb_big_resize(u3, RBIGNUM_LEN(u3) + 1);
+	BDIGITS(u3)[RBIGNUM_LEN(u3)-1] = 0;
+    }
+    biglsh_bang(BDIGITS(u3), RBIGNUM_LEN(u3), 1);
+    u3 = bigtrunc(bigadd(bigtrunc(u3), x0, 0));
+
+    if (x == y) {
+	v1 = u1; v2 = u2; v3 = u3;
+    }
+    else {
+	/* v1 <- y0 + y2 */
+	v1 = bigtrunc(bigadd(y0, y2, 1));
+
+	/* y(-1) : v2 <- v1 - y1 = y0 - y1 + y2 */
+	v2 = bigtrunc(bigsub(v1, y1));
+
+	/* y(1) : v1 <- v1 + y1 = y0 + y1 + y2 */
+	v1 = bigtrunc(bigadd(v1, y1, 1));
+
+	/* y(-2) : v3 <- 2 * (v2 + y2) - y0 = y0 - 2 * (y1 - 2 * y2) */
+	v3 = bigadd(v2, y2, 1);
+	if (BDIGITS(v3)[RBIGNUM_LEN(v3)-1] & BIGRAD_HALF) {
+	    rb_big_resize(v3, RBIGNUM_LEN(v3) + 1);
+	    BDIGITS(v3)[RBIGNUM_LEN(v3)-1] = 0;
+	}
+	biglsh_bang(BDIGITS(v3), RBIGNUM_LEN(v3), 1);
+	v3 = bigtrunc(bigadd(bigtrunc(v3), y0, 0));
+    }
+
+    /* z(0) : u0 <- x0 * y0 */
+    u0 = bigtrunc(bigmul0(x0, y0));
+
+    /* z(1) : u1 <- u1 * v1 */
+    u1 = bigtrunc(bigmul0(u1, v1));
+
+    /* z(-1) : u2 <- u2 * v2 */
+    u2 = bigtrunc(bigmul0(u2, v2));
+
+    /* z(-2) : u3 <- u3 * v3 */
+    u3 = bigtrunc(bigmul0(u3, v3));
+
+    /* z(inf) : u4 <- x2 * y2 */
+    u4 = bigtrunc(bigmul0(x2, y2));
+
+    /* for GC */
+    v1 = v2 = v3 = Qnil;
+
+    /*
+     * [Step2] interpolating z0, z1, z2, z3, z4, and z5.
+     */
+
+    /* z0 <- z(0) == u0 */
+    z0 = u0;
+
+    /* z4 <- z(inf) == u4 */
+    z4 = u4;
+
+    /* z3 <- (z(-2) - z(1)) / 3 == (u3 - u1) / 3 */
+    z3 = bigadd(u3, u1, 0);
+    bigdivrem(z3, big_three, &z3, NULL); /* TODO: optimize */
+    bigtrunc(z3);
+
+    /* z1 <- (z(1) - z(-1)) / 2 == (u1 - u2) / 2 */
+    z1 = bigtrunc(bigadd(u1, u2, 0));
+    bigrsh_bang(BDIGITS(z1), RBIGNUM_LEN(z1), 1);
+
+    /* z2 <- z(-1) - z(0) == u2 - u0 */
+    z2 = bigtrunc(bigadd(u2, u0, 0));
+
+    /* z3 <- (z2 - z3) / 2 + 2 * z(inf) == (z2 - z3) / 2 + 2 * u4 */
+    z3 = bigadd(z2, z3, 0);
+    bigrsh_bang(BDIGITS(z3), RBIGNUM_LEN(z3), 1);
+    t = big_lshift(u4, 1); /* TODO: combining with next addition */
+    z3 = bigtrunc(bigadd(z3, t, 1));
+
+    /* z2 <- z2 + z1 - z(inf) == z2 + z1 - u4 */
+    z2 = bigtrunc(bigadd(z2, z1, 1));
+    z2 = bigtrunc(bigadd(z2, u4, 0));
+
+    /* z1 <- z1 - z3 */
+    z1 = bigtrunc(bigadd(z1, z3, 0));
+
+    /*
+     * [Step3] Substituting base value into b of the polynomial z(b),
+     */
+
+    zn = 6*n + 1;
+    z = bignew(zn, RBIGNUM_SIGN(x)==RBIGNUM_SIGN(y));
+    zds = BDIGITS(z);
+    MEMCPY(zds, BDIGITS(z0), BDIGIT, RBIGNUM_LEN(z0));
+    MEMZERO(zds + RBIGNUM_LEN(z0), BDIGIT, zn - RBIGNUM_LEN(z0));
+    bigadd_core(zds +   n, zn -   n, BDIGITS(z1), big_real_len(z1), zds +   n, zn -   n);
+    bigadd_core(zds + 2*n, zn - 2*n, BDIGITS(z2), big_real_len(z2), zds + 2*n, zn - 2*n);
+    bigadd_core(zds + 3*n, zn - 3*n, BDIGITS(z3), big_real_len(z3), zds + 3*n, zn - 3*n);
+    bigadd_core(zds + 4*n, zn - 4*n, BDIGITS(z4), big_real_len(z4), zds + 4*n, zn - 4*n);
+    z = bignorm(z);
+
+    return bignorm(z);
+}
+
 /* efficient squaring (2 times faster than normal multiplication)
  * ref: Handbook of Applied Cryptography, Algorithm 14.16
  *      http://www.cacr.math.uwaterloo.ca/hac/about/chap14.pdf
@@ -2207,6 +2511,7 @@ bigsqr_fast(VALUE x)
 }

 #define KARATSUBA_MUL_DIGITS 70
+#define TOOM3_MUL_DIGITS 150


 /* determine whether a bignum is sparse or not by random sampling */
@@ -2222,19 +2527,6 @@ big_sparse_p(VALUE x)
     return (c <= 1) ? Qtrue : Qfalse;
 }

-#if 0
-static void
-dump_bignum(VALUE x)
-{
-    long i;
-    printf("0x0");
-    for (i = RBIGNUM_LEN(x); i--; ) {
-	printf("_%08x", BDIGITS(x)[i]);
-    }
-    puts("");
-}
-#endif
-
 static VALUE
 bigmul0(VALUE x, VALUE y)
 {
@@ -2267,8 +2559,13 @@ bigmul0(VALUE x, VALUE y)
     /* balance multiplication by slicing y when x is much smaller than y */
     if (2 * xn <= yn) return bigmul1_balance(x, y);

-    /* multiplication by karatsuba method */
-    return bigmul1_karatsuba(x, y);
+    if (xn < TOOM3_MUL_DIGITS) {
+	/* multiplication by karatsuba method */
+	return bigmul1_karatsuba(x, y);
+    }
+    else if (3*xn <= 2*(yn + 2))
+	return bigmul1_balance(x, y);
+    return bigmul1_toom3(x, y);
 }

 /*
@@ -2394,6 +2691,7 @@ bigdivrem(VALUE x, VALUE y, volatile VALUE *divp, volatile VALUE *modp)
 	if (divp) *divp = z;
 	return Qnil;
     }
+
     z = bignew(nx==ny?nx+2:nx+1, RBIGNUM_SIGN(x)==RBIGNUM_SIGN(y));
     zds = BDIGITS(z);
     if (nx==ny) zds[nx+1] = 0;
@@ -3516,5 +3814,16 @@ Init_Bignum(void)
     rb_define_method(rb_cBignum, "odd?", rb_big_odd_p, 0);
     rb_define_method(rb_cBignum, "even?", rb_big_even_p, 0);

+    ON_DEBUG(rb_define_const(rb_cBignum, "BITSPERDIG", LONG2NUM(BITSPERDIG)));
+    ON_DEBUG(rb_define_const(rb_cBignum, "RADIX", ULL2NUM(BIGRAD)));
+    ON_DEBUG(rb_define_const(rb_cBignum, "DIGMAX", ULL2NUM(BDIGMAX)));
+    ON_DEBUG(rb_define_method(rb_cBignum, "lsh!", rb_big_lsh_bang, 1));
+    ON_DEBUG(rb_define_method(rb_cBignum, "rsh!", rb_big_rsh_bang, 1));
+    ON_DEBUG(rb_define_method(rb_cBignum, "dump", rb_big_dump, 0));
+    ON_DEBUG(rb_define_singleton_method(rb_cBignum, "toom3_enabled=", rb_big_set_toom3, 1));
+
     power_cache_init();
+
+    big_three = rb_uint2big(3);
+    rb_gc_register_mark_object(big_three);
 }
diff --git a/include/ruby/defines.h b/include/ruby/defines.h
index 45841da..a379071 100644
--- a/include/ruby/defines.h
+++ b/include/ruby/defines.h
@@ -91,22 +91,44 @@ void xfree(void*);
 # define SIZEOF_BDIGITS SIZEOF_INT
 # define BDIGIT_DBL unsigned LONG_LONG
 # define BDIGIT_DBL_SIGNED LONG_LONG
+# define PRI_BDIGIT_PREFIX ""
+# define PRI_BDIGIT_DBL_PREFIX PRI_LL_PREFIX
 #elif SIZEOF_INT*2 <= SIZEOF_LONG
 # define BDIGIT unsigned int
 # define SIZEOF_BDIGITS SIZEOF_INT
 # define BDIGIT_DBL unsigned long
 # define BDIGIT_DBL_SIGNED long
+# define PRI_BDIGIT_PREFIX ""
+# define PRI_BDIGIT_DBL_PREFIX "l"
 #elif SIZEOF_SHORT*2 <= SIZEOF_LONG
 # define BDIGIT unsigned short
 # define SIZEOF_BDIGITS SIZEOF_SHORT
 # define BDIGIT_DBL unsigned long
 # define BDIGIT_DBL_SIGNED long
+# define PRI_BDIGIT_PREFIX "h"
+# define PRI_BDIGIT_DBL_PREFIX "l"
 #else
 # define BDIGIT unsigned short
 # define SIZEOF_BDIGITS (SIZEOF_LONG/2)
 # define BDIGIT_DBL unsigned long
 # define BDIGIT_DBL_SIGNED long
-#endif
+# define PRI_BDIGIT_PREFIX "h"
+# define PRI_BDIGIT_DBL_PREFIX "l"
+#endif
+
+#define PRIdBDIGIT PRI_BDIGIT_PREFIX"d"
+#define PRIiBDIGIT PRI_BDIGIT_PREFIX"i"
+#define PRIoBDIGIT PRI_BDIGIT_PREFIX"o"
+#define PRIuBDIGIT PRI_BDIGIT_PREFIX"u"
+#define PRIxBDIGIT PRI_BDIGIT_PREFIX"x"
+#define PRIXBDIGIT PRI_BDIGIT_PREFIX"X"
+
+#define PRIdBDIGIT_DBL PRI_BDIGIT_DBL_PREFIX"d"
+#define PRIiBDIGIT_DBL PRI_BDIGIT_DBL_PREFIX"i"
+#define PRIoBDIGIT_DBL PRI_BDIGIT_DBL_PREFIX"o"
+#define PRIuBDIGIT_DBL PRI_BDIGIT_DBL_PREFIX"u"
+#define PRIxBDIGIT_DBL PRI_BDIGIT_DBL_PREFIX"x"
+#define PRIXBDIGIT_DBL PRI_BDIGIT_DBL_PREFIX"X"

 #ifdef INFINITY
 # define HAVE_INFINITY
	diff --git a/bignum.c b/bignum.c
	index 152018e..046691e 100644
	--- a/bignum.c
	+++ b/bignum.c
	@@ -22,6 +22,8 @@

	VALUE rb_cBignum;

	+static VALUE big_three = Qnil;
	+
	#if defined __MINGW32__
	#define USHORT _USHORT
	#endif
	@@ -29,6 +31,7 @@ VALUE rb_cBignum;
	#define BDIGITS(x) (RBIGNUM_DIGITS(x))
	#define BITSPERDIG (SIZEOF_BDIGITS*CHAR_BIT)
	#define BIGRAD ((BDIGIT_DBL)1 << BITSPERDIG)
	+#define BIGRAD_HALF ((BDIGIT)(BIGRAD >> 1))
	#define DIGSPERLONG (SIZEOF_LONG/SIZEOF_BDIGITS)
	#if HAVE_LONG_LONG
	# define DIGSPERLL (SIZEOF_LONG_LONG/SIZEOF_BDIGITS)
	@@ -42,6 +45,31 @@ VALUE rb_cBignum;
	(BDIGITS(x)[0] == 0 && \
	(RBIGNUM_LEN(x) == 1 \|\| bigzero_p(x))))

	+#define BIGNUM_DEBUG 0
	+#if BIGNUM_DEBUG
	+#define ON_DEBUG(x) do { x; } while (0)
	+static void
	+dump_bignum(VALUE x)
	+{
	+ long i;
	+ printf("%c0x0", RBIGNUM_SIGN(x) ? '+' : '-');
	+ for (i = RBIGNUM_LEN(x); i--; ) {
	+ printf("_%08"PRIxBDIGIT, BDIGITS(x)[i]);
	+ }
	+ printf(", len=%lu", RBIGNUM_LEN(x));
	+ puts("");
	+}
	+
	+static VALUE
	+rb_big_dump(VALUE x)
	+{
	+ dump_bignum(x);
	+ return x;
	+}
	+#else
	+#define ON_DEBUG(x)
	+#endif
	+
	static int
	bigzero_p(VALUE x)
	{
	@@ -1683,7 +1711,7 @@ bigsub(VALUE x, VALUE y)
	long i = RBIGNUM_LEN(x);
	BDIGIT xds, yds;

	- /* if x is larger than y, swap */
	+ /* if x is smaller than y, swap */
	if (RBIGNUM_LEN(x) < RBIGNUM_LEN(y)) {
	z = x; x = y; y = z; /* swap x y */
	}
	@@ -2019,7 +2047,7 @@ bigmul1_balance(VALUE x, VALUE y)

	xn = RBIGNUM_LEN(x);
	yn = RBIGNUM_LEN(y);
	- assert(2 * xn <= yn);
	+ assert(2 * xn <= yn \|\| 3 * xn <= 2*(yn+2));

	z = bignew(xn + yn, RBIGNUM_SIGN(x)==RBIGNUM_SIGN(y));
	t1 = bignew(xn, 1);
	@@ -2059,10 +2087,15 @@ big_split(VALUE v, long n, volatile VALUE ph, volatile VALUE pl)
	ln = n;
	}

	- while (--hn && !vds[hn + ln]);
	- h = bignew(hn += 2, 1);
	- MEMCPY(BDIGITS(h), vds + ln, BDIGIT, hn - 1);
	- BDIGITS(h)[hn - 1] = 0; /* margin for carry */
	+ if (!hn) {
	+ h = rb_uint2big(0);
	+ }
	+ else {
	+ while (--hn && !vds[hn + ln]);
	+ h = bignew(hn += 2, 1);
	+ MEMCPY(BDIGITS(h), vds + ln, BDIGIT, hn - 1);
	+ BDIGITS(h)[hn - 1] = 0; /* margin for carry */
	+ }

	while (--ln && !vds[ln]);
	l = bignew(ln += 2, 1);
	@@ -2169,6 +2202,277 @@ bigmul1_karatsuba(VALUE x, VALUE y)
	return z;
	}

	+static void
	+biglsh_bang(BDIGIT *xds, long xn, unsigned long shift)
	+{
	+ long const s1 = shift/BITSPERDIG;
	+ int const s2 = (int)(shift%BITSPERDIG);
	+ int const s3 = BITSPERDIG-s2;
	+ BDIGIT* zds;
	+ BDIGIT num;
	+ long i;
	+ if (s1 >= xn) {
	+ MEMZERO(xds, BDIGIT, xn);
	+ return;
	+ }
	+ zds = xds + xn - 1;
	+ xn -= s1 + 1;
	+ num = xds[xn]<<s2;
	+ do {
	+ *zds-- = num \| xds[--xn]>>s3;
	+ num = xds[xn]<<s2;
	+ }
	+ while (xn > 0);
	+ *zds = num;
	+ for (i = s1; i > 0; --i)
	+ *zds-- = 0;
	+}
	+
	+static void
	+bigrsh_bang(BDIGIT* xds, long xn, unsigned long shift)
	+{
	+ long s1 = shift/BITSPERDIG;
	+ int s2 = (int)(shift%BITSPERDIG);
	+ int s3 = BITSPERDIG - s2;
	+ int i;
	+ BDIGIT num;
	+ BDIGIT* zds;
	+ if (s1 >= xn) {
	+ MEMZERO(xds, BDIGIT, xn);
	+ return;
	+ }
	+
	+ i = 0;
	+ zds = xds + s1;
	+ num = *zds++>>s2;
	+ do {
	+ xds[i++] = (BDIGIT)(*zds<<s3) \| num;
	+ num = *zds++>>s2;
	+ }
	+ while (i < xn - s1 - 1);
	+ xds[i] = num;
	+ MEMZERO(xds + xn - s1, BDIGIT, s1);
	+}
	+
	+#ifdef BIGNUM_DEBUG
	+static VALUE
	+rb_big_lsh_bang(VALUE x, VALUE n)
	+{
	+ unsigned long shift;
	+ long m, xn, s1;
	+ int s2;
	+ BDIGIT* xds = BDIGITS(x);
	+
	+ shift = NUM2ULONG(n);
	+ s1 = shift/BITSPERDIG;
	+ s2 = (int)(shift%BITSPERDIG);
	+
	+ m = 0;
	+ xn = RBIGNUM_LEN(x);
	+ while (!xds[--xn]);
	+ while ((xds[xn] & (BIGRAD_HALF>>m)) == 0) ++m;
	+ ++xn;
	+
	+ if (s1 > RBIGNUM_LEN(x) - xn) xn += s1;
	+ if (s2 > m) ++xn;
	+ rb_big_resize(x, xn);
	+
	+ biglsh_bang(xds, xn, shift);
	+
	+ return x;
	+}
	+
	+static VALUE
	+rb_big_rsh_bang(VALUE x, VALUE n)
	+{
	+ bigrsh_bang(BDIGITS(x), RBIGNUM_LEN(x), NUM2ULONG(n));
	+ return x;
	+}
	+#endif
	+
	+static void
	+big_split3(VALUE v, long n, volatile VALUE* p0, volatile VALUE* p1, volatile VALUE* p2)
	+{
	+ VALUE v0, v12, v1, v2;
	+
	+ big_split(v, n, &v12, &v0);
	+ big_split(v12, n, &v2, &v1);
	+
	+ *p0 = bigtrunc(v0);
	+ *p1 = bigtrunc(v1);
	+ *p2 = bigtrunc(v2);
	+}
	+
	+static VALUE big_lshift(VALUE, unsigned long);
	+static VALUE big_rshift(VALUE, unsigned long);
	+static VALUE bigdivrem(VALUE, VALUE, volatile VALUE, volatile VALUE);
	+
	+static VALUE
	+bigmul1_toom3(VALUE x, VALUE y)
	+{
	+ long i, n, xn, yn, zn;
	+ VALUE x0, x1, x2, y0, y1, y2;
	+ VALUE u0, u1, u2, u3, u4, v1, v2, v3;
	+ VALUE z0, z1, z2, z3, z4, z, t;
	+ BDIGIT* zds;
	+
	+ xn = RBIGNUM_LEN(x);
	+ yn = RBIGNUM_LEN(y);
	+ assert(xn <= yn); /* assume y >= x */
	+
	+ n = (yn + 2) / 3;
	+ big_split3(x, n, &x0, &x1, &x2);
	+ if (x == y) {
	+ y0 = x0; y1 = x1; y2 = x2;
	+ }
	+ else big_split3(y, n, &y0, &y1, &y2);
	+
	+ /*
	+ * ref. http://en.wikipedia.org/wiki/Toom%E2%80%93Cook_multiplication
	+ *
	+ * x(b) = x0 * b^0 + x1 * b^1 + x2 * b^2
	+ * y(b) = y0 * b^0 + y1 * b^1 + y2 * b^2
	+ *
	+ * z(b) = x(b) * y(b)
	+ * z(b) = z0 * b^0 + z1 * b^1 + z2 * b^2 + z3 * b^3 + z4 * b^4
	+ * where:
	+ * z0 = x0 * y0
	+ * z1 = x0 * y1 + x1 * y0
	+ * z2 = x0 * y2 + x1 * y1 + x2 * y0
	+ * z3 = x1 * y2 + x2 * y1
	+ * z4 = x2 * y2
	+ *
	+ * Toom3 method (a.k.a. Toom-Cook method):
	+ * (Step1) calculating 5 points z(b0), z(b1), z(b2), z(b3), z(b4),
	+ * where:
	+ * b0 = 0, b1 = 1, b2 = -1, b3 = -2, b4 = inf,
	+ * z(0) = x(0) * y(0) = x0 * y0
	+ * z(1) = x(1) * y(1) = (x0 + x1 + x2) * (y0 + y1 + y2)
	+ * z(-1) = x(-1) * y(-1) = (x0 - x1 + x2) * (y0 - y1 + y2)
	+ * z(-2) = x(-2) * y(-2) = (x0 - 2 * (x1 - 2 * x2)) * (y0 - 2 * (y1 - 2 * y2))
	+ * z(inf) = x(inf) * y(inf) = x2 * y2
	+ *
	+ * (Step2) interpolating z0, z1, z2, z3, z4, and z5.
	+ *
	+ * (Step3) Substituting base value into b of the polynomial z(b),
	+ */
	+
	+ /*
	+ * [Step1] calculating 5 points z(b0), z(b1), z(b2), z(b3), z(b4)
	+ */
	+
	+ /* u1 <- x0 + x2 */
	+ u1 = bigtrunc(bigadd(x0, x2, 1));
	+
	+ /* x(-1) : u2 <- u1 - x1 = x0 - x1 + x2 */
	+ u2 = bigtrunc(bigsub(u1, x1));
	+
	+ /* x(1) : u1 <- u1 + x1 = x0 + x1 + x2 */
	+ u1 = bigtrunc(bigadd(u1, x1, 1));
	+
	+ /* x(-2) : u3 <- 2 * (u2 + x2) - x0 = x0 - 2 * (x1 - 2 * x2) */
	+ u3 = bigadd(u2, x2, 1);
	+ if (BDIGITS(u3)[RBIGNUM_LEN(u3)-1] & BIGRAD_HALF) {
	+ rb_big_resize(u3, RBIGNUM_LEN(u3) + 1);
	+ BDIGITS(u3)[RBIGNUM_LEN(u3)-1] = 0;
	+ }
	+ biglsh_bang(BDIGITS(u3), RBIGNUM_LEN(u3), 1);
	+ u3 = bigtrunc(bigadd(bigtrunc(u3), x0, 0));
	+
	+ if (x == y) {
	+ v1 = u1; v2 = u2; v3 = u3;
	+ }
	+ else {
	+ /* v1 <- y0 + y2 */
	+ v1 = bigtrunc(bigadd(y0, y2, 1));
	+
	+ /* y(-1) : v2 <- v1 - y1 = y0 - y1 + y2 */
	+ v2 = bigtrunc(bigsub(v1, y1));
	+
	+ /* y(1) : v1 <- v1 + y1 = y0 + y1 + y2 */
	+ v1 = bigtrunc(bigadd(v1, y1, 1));
	+
	+ /* y(-2) : v3 <- 2 * (v2 + y2) - y0 = y0 - 2 * (y1 - 2 * y2) */
	+ v3 = bigadd(v2, y2, 1);
	+ if (BDIGITS(v3)[RBIGNUM_LEN(v3)-1] & BIGRAD_HALF) {
	+ rb_big_resize(v3, RBIGNUM_LEN(v3) + 1);
	+ BDIGITS(v3)[RBIGNUM_LEN(v3)-1] = 0;
	+ }
	+ biglsh_bang(BDIGITS(v3), RBIGNUM_LEN(v3), 1);
	+ v3 = bigtrunc(bigadd(bigtrunc(v3), y0, 0));
	+ }
	+
	+ /* z(0) : u0 <- x0 * y0 */
	+ u0 = bigtrunc(bigmul0(x0, y0));
	+
	+ /* z(1) : u1 <- u1 * v1 */
	+ u1 = bigtrunc(bigmul0(u1, v1));
	+
	+ /* z(-1) : u2 <- u2 * v2 */
	+ u2 = bigtrunc(bigmul0(u2, v2));
	+
	+ /* z(-2) : u3 <- u3 * v3 */
	+ u3 = bigtrunc(bigmul0(u3, v3));
	+
	+ /* z(inf) : u4 <- x2 * y2 */
	+ u4 = bigtrunc(bigmul0(x2, y2));
	+
	+ /* for GC */
	+ v1 = v2 = v3 = Qnil;
	+
	+ /*
	+ * [Step2] interpolating z0, z1, z2, z3, z4, and z5.
	+ */
	+
	+ /* z0 <- z(0) == u0 */
	+ z0 = u0;
	+
	+ /* z4 <- z(inf) == u4 */
	+ z4 = u4;
	+
	+ /* z3 <- (z(-2) - z(1)) / 3 == (u3 - u1) / 3 */
	+ z3 = bigadd(u3, u1, 0);
	+ bigdivrem(z3, big_three, &z3, NULL); /* TODO: optimize */
	+ bigtrunc(z3);
	+
	+ /* z1 <- (z(1) - z(-1)) / 2 == (u1 - u2) / 2 */
	+ z1 = bigtrunc(bigadd(u1, u2, 0));
	+ bigrsh_bang(BDIGITS(z1), RBIGNUM_LEN(z1), 1);
	+
	+ /* z2 <- z(-1) - z(0) == u2 - u0 */
	+ z2 = bigtrunc(bigadd(u2, u0, 0));
	+
	+ /* z3 <- (z2 - z3) / 2 + 2 * z(inf) == (z2 - z3) / 2 + 2 * u4 */
	+ z3 = bigadd(z2, z3, 0);
	+ bigrsh_bang(BDIGITS(z3), RBIGNUM_LEN(z3), 1);
	+ t = big_lshift(u4, 1); /* TODO: combining with next addition */
	+ z3 = bigtrunc(bigadd(z3, t, 1));
	+
	+ /* z2 <- z2 + z1 - z(inf) == z2 + z1 - u4 */
	+ z2 = bigtrunc(bigadd(z2, z1, 1));
	+ z2 = bigtrunc(bigadd(z2, u4, 0));
	+
	+ /* z1 <- z1 - z3 */
	+ z1 = bigtrunc(bigadd(z1, z3, 0));
	+
	+ /*
	+ * [Step3] Substituting base value into b of the polynomial z(b),
	+ */
	+
	+ zn = 6*n + 1;
	+ z = bignew(zn, RBIGNUM_SIGN(x)==RBIGNUM_SIGN(y));
	+ zds = BDIGITS(z);
	+ MEMCPY(zds, BDIGITS(z0), BDIGIT, RBIGNUM_LEN(z0));
	+ MEMZERO(zds + RBIGNUM_LEN(z0), BDIGIT, zn - RBIGNUM_LEN(z0));
	+ bigadd_core(zds + n, zn - n, BDIGITS(z1), big_real_len(z1), zds + n, zn - n);
	+ bigadd_core(zds + 2n, zn - 2n, BDIGITS(z2), big_real_len(z2), zds + 2n, zn - 2n);
	+ bigadd_core(zds + 3n, zn - 3n, BDIGITS(z3), big_real_len(z3), zds + 3n, zn - 3n);
	+ bigadd_core(zds + 4n, zn - 4n, BDIGITS(z4), big_real_len(z4), zds + 4n, zn - 4n);
	+ z = bignorm(z);
	+
	+ return bignorm(z);
	+}
	+
	/* efficient squaring (2 times faster than normal multiplication)
	* ref: Handbook of Applied Cryptography, Algorithm 14.16
	* http://www.cacr.math.uwaterloo.ca/hac/about/chap14.pdf
	@@ -2207,6 +2511,7 @@ bigsqr_fast(VALUE x)
	}

	#define KARATSUBA_MUL_DIGITS 70
	+#define TOOM3_MUL_DIGITS 150


	/* determine whether a bignum is sparse or not by random sampling */
	@@ -2222,19 +2527,6 @@ big_sparse_p(VALUE x)
	return (c <= 1) ? Qtrue : Qfalse;
	}

	-#if 0
	-static void
	-dump_bignum(VALUE x)
	-{
	- long i;
	- printf("0x0");
	- for (i = RBIGNUM_LEN(x); i--; ) {
	- printf("_%08x", BDIGITS(x)[i]);
	- }
	- puts("");
	-}
	-#endif
	-
	static VALUE
	bigmul0(VALUE x, VALUE y)
	{
	@@ -2267,8 +2559,13 @@ bigmul0(VALUE x, VALUE y)
	/* balance multiplication by slicing y when x is much smaller than y */
	if (2 * xn <= yn) return bigmul1_balance(x, y);

	- /* multiplication by karatsuba method */
	- return bigmul1_karatsuba(x, y);
	+ if (xn < TOOM3_MUL_DIGITS) {
	+ /* multiplication by karatsuba method */
	+ return bigmul1_karatsuba(x, y);
	+ }
	+ else if (3xn <= 2(yn + 2))
	+ return bigmul1_balance(x, y);
	+ return bigmul1_toom3(x, y);
	}

	/*
	@@ -2394,6 +2691,7 @@ bigdivrem(VALUE x, VALUE y, volatile VALUE divp, volatile VALUE modp)
	if (divp) *divp = z;
	return Qnil;
	}
	+
	z = bignew(nx==ny?nx+2:nx+1, RBIGNUM_SIGN(x)==RBIGNUM_SIGN(y));
	zds = BDIGITS(z);
	if (nx==ny) zds[nx+1] = 0;
	@@ -3516,5 +3814,16 @@ Init_Bignum(void)
	rb_define_method(rb_cBignum, "odd?", rb_big_odd_p, 0);
	rb_define_method(rb_cBignum, "even?", rb_big_even_p, 0);

	+ ON_DEBUG(rb_define_const(rb_cBignum, "BITSPERDIG", LONG2NUM(BITSPERDIG)));
	+ ON_DEBUG(rb_define_const(rb_cBignum, "RADIX", ULL2NUM(BIGRAD)));
	+ ON_DEBUG(rb_define_const(rb_cBignum, "DIGMAX", ULL2NUM(BDIGMAX)));
	+ ON_DEBUG(rb_define_method(rb_cBignum, "lsh!", rb_big_lsh_bang, 1));
	+ ON_DEBUG(rb_define_method(rb_cBignum, "rsh!", rb_big_rsh_bang, 1));
	+ ON_DEBUG(rb_define_method(rb_cBignum, "dump", rb_big_dump, 0));
	+ ON_DEBUG(rb_define_singleton_method(rb_cBignum, "toom3_enabled=", rb_big_set_toom3, 1));
	+
	power_cache_init();
	+
	+ big_three = rb_uint2big(3);
	+ rb_gc_register_mark_object(big_three);
	}
	diff --git a/include/ruby/defines.h b/include/ruby/defines.h
	index 45841da..a379071 100644
	--- a/include/ruby/defines.h
	+++ b/include/ruby/defines.h
	@@ -91,22 +91,44 @@ void xfree(void*);
	# define SIZEOF_BDIGITS SIZEOF_INT
	# define BDIGIT_DBL unsigned LONG_LONG
	# define BDIGIT_DBL_SIGNED LONG_LONG
	+# define PRI_BDIGIT_PREFIX ""
	+# define PRI_BDIGIT_DBL_PREFIX PRI_LL_PREFIX
	#elif SIZEOF_INT*2 <= SIZEOF_LONG
	# define BDIGIT unsigned int
	# define SIZEOF_BDIGITS SIZEOF_INT
	# define BDIGIT_DBL unsigned long
	# define BDIGIT_DBL_SIGNED long
	+# define PRI_BDIGIT_PREFIX ""
	+# define PRI_BDIGIT_DBL_PREFIX "l"
	#elif SIZEOF_SHORT*2 <= SIZEOF_LONG
	# define BDIGIT unsigned short
	# define SIZEOF_BDIGITS SIZEOF_SHORT
	# define BDIGIT_DBL unsigned long
	# define BDIGIT_DBL_SIGNED long
	+# define PRI_BDIGIT_PREFIX "h"
	+# define PRI_BDIGIT_DBL_PREFIX "l"
	#else
	# define BDIGIT unsigned short
	# define SIZEOF_BDIGITS (SIZEOF_LONG/2)
	# define BDIGIT_DBL unsigned long
	# define BDIGIT_DBL_SIGNED long
	-#endif
	+# define PRI_BDIGIT_PREFIX "h"
	+# define PRI_BDIGIT_DBL_PREFIX "l"
	+#endif
	+
	+#define PRIdBDIGIT PRI_BDIGIT_PREFIX"d"
	+#define PRIiBDIGIT PRI_BDIGIT_PREFIX"i"
	+#define PRIoBDIGIT PRI_BDIGIT_PREFIX"o"
	+#define PRIuBDIGIT PRI_BDIGIT_PREFIX"u"
	+#define PRIxBDIGIT PRI_BDIGIT_PREFIX"x"
	+#define PRIXBDIGIT PRI_BDIGIT_PREFIX"X"
	+
	+#define PRIdBDIGIT_DBL PRI_BDIGIT_DBL_PREFIX"d"
	+#define PRIiBDIGIT_DBL PRI_BDIGIT_DBL_PREFIX"i"
	+#define PRIoBDIGIT_DBL PRI_BDIGIT_DBL_PREFIX"o"
	+#define PRIuBDIGIT_DBL PRI_BDIGIT_DBL_PREFIX"u"
	+#define PRIxBDIGIT_DBL PRI_BDIGIT_DBL_PREFIX"x"
	+#define PRIXBDIGIT_DBL PRI_BDIGIT_DBL_PREFIX"X"

	#ifdef INFINITY
	# define HAVE_INFINITY