Skip to content

Instantly share code, notes, and snippets.

@rjeschke
Created June 22, 2012 09:37
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save rjeschke/2971698 to your computer and use it in GitHub Desktop.
Save rjeschke/2971698 to your computer and use it in GitHub Desktop.
128 bit floats in software (Java)
/*
* Copyright (C) 2012 René Jeschke <rene_jeschke@yahoo.de>
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package jfloat;
import java.math.BigDecimal;
import java.math.BigInteger;
/* This thing is WIP. Currently basic mul/div is working, without
* proper rounding, though. Everything is quick'n'dirty ...
*
* Proof-of-concept and playground.
*
* 128 Bit: (4 * UI32)
* 1 sign
* 15 bit exponent, bias: 16383
* 112 mantissa
*/
public class Float128
{
private final static BigDecimal BD_2 = new BigDecimal(2);
private final static BigDecimal MAN_DIV = new BigDecimal(new BigInteger("10000000000000000000000000000", 16));
public static int getSignBit(int[] num)
{
return num[0] & 0x80000000;
}
public static int getExponent(int[] num)
{
return ((num[0] >> 16) & 0x7fff) - 16383;
}
private static void carryAdd4(int[] a, int carry)
{
int c = carry;
if(c == 0)
return;
for(int i = 3; i >= 0; i--)
{
long t = (a[i] & 0xffffffffL) + c;
a[i] = (int)t;
c = (int)(t >> 32);
if(c == 0)
return;
}
}
private static int getSetBit8(int t)
{
if((t & 0x80) != 0)
return 8;
if((t & 0x40) != 0)
return 7;
if((t & 0x20) != 0)
return 6;
if((t & 0x10) != 0)
return 5;
if((t & 0x08) != 0)
return 4;
if((t & 0x04) != 0)
return 3;
if((t & 0x02) != 0)
return 2;
if((t & 0x01) != 0)
return 1;
return 0;
}
private static int getSetBit(int t)
{
if(t == 0)
return 0;
if((t & 0xff000000) != 0)
return getSetBit8(t >> 24) + 24;
if((t & 0x00ff0000) != 0)
return getSetBit8(t >> 16) + 16;
if((t & 0x0000ff00) != 0)
return getSetBit8(t >> 8) + 8;
if((t & 0x000000ff) != 0)
return getSetBit8(t);
return 0;
}
public static int[] div(int[] a, int[] b, int[] r)
{
final int sign = (a[0] ^ b[0]) & 0x80000000;
final int e0 = (a[0] >> 16) & 0x7fff;
final int e1 = (b[0] >> 16) & 0x7fff;
// We flush denormals automatically
if(e0 == 0)
{
r[0] = r[1] = r[2] = r[3] = 0;
return r;
}
// For now we throw up
if(e1 == 0)
{
throw new ArithmeticException("The mighty Neet did not yet define division by zero");
}
// k is either 7 or 8 (2^k bits of precision, so 7 should be enough)
// N = mant(a) >> 1
// D = mand(b) >> 1
// e = 1 - D
// Q = N
// for(i=0, k-1)
// Q = Q + Q * e
// e = e * e
// end
int[] e = new int[4];
int[] q = new int[4];
// 8.120 bit fixed point
e[0] = ((((b[0] & 0xffff) | 0x10000) << 7) | (b[1] >>> 25)) ^ 0xffffff;
e[1] = ((b[1] << 7) | (b[2] >>> 25)) ^ -1;
e[2] = ((b[2] << 7) | (b[3] >>> 25)) ^ -1;
e[3] = (b[3] << 7) ^ -1;
carryAdd4(e, 1);
q[0] = (((a[0] & 0xffff) | 0x10000) << 7) | (a[1] >>> 25);
q[1] = (a[1] << 7) | (a[2] >>> 25);
q[2] = (a[2] << 7) | (a[3] >>> 25);
q[3] = (a[3] << 7);
int[] ret = new int[8];
for(int n = 0; n < 7; n++)
{
long t;
mul8_120(q, e, ret);
ret[0] = (ret[0] << 8) | (ret[1] >>> 24);
ret[1] = (ret[1] << 8) | (ret[2] >>> 24);
ret[2] = (ret[2] << 8) | (ret[3] >>> 24);
ret[3] = (ret[3] << 8) | (ret[4] >>> 24);
t = (q[3] & 0xffffffffL) + (ret[3] & 0xffffffffL);
q[3] = (int)t;
t = (q[2] & 0xffffffffL) + (ret[2] & 0xffffffffL) + (t >>> 32);
q[2] = (int)t;
t = (q[1] & 0xffffffffL) + (ret[1] & 0xffffffffL) + (t >>> 32);
q[1] = (int)t;
t = (q[0] & 0xffffffffL) + (ret[0] & 0xffffffffL) + (t >>> 32);
q[0] = (int)t;
mul8_120(e, e, ret);
e[0] = (ret[0] << 8) | (ret[1] >>> 24);
e[1] = (ret[1] << 8) | (ret[2] >>> 24);
e[2] = (ret[2] << 8) | (ret[3] >>> 24);
e[3] = (ret[3] << 8) | (ret[4] >>> 24);
}
final int sb = getSetBit(q[0]);
final int exp = e0 - e1 + sb + 16383 - 25;
if(exp <= 0)
{
r[0] = r[1] = r[2] = r[3] = 0;
return r;
}
// TODO +/- inf
final int s0 = sb - 17;
final int s1 = 32 - s0;
r[0] = sign | (exp << 16) | ((q[0] >> s0) & 0xffff);
r[1] = (q[0] << s1) | (q[1] >>> s0);
r[2] = (q[1] << s1) | (q[2] >>> s0);
r[3] = (q[2] << s1) | (q[3] >>> s0);
return r;
}
public static int[] mul8_120(int[] a, int[] b, int[] ret)
{
long t, carry, sum;
t = (a[3] & 0xffffffffL) * (b[3] & 0xffffffffL);
carry = t >>> 32;
ret[7] = (int)t;
t = (a[2] & 0xffffffffL) * (b[3] & 0xffffffffL);
sum = (t & 0xffffffffL) + carry;
carry = t >>> 32;
t = (a[3] & 0xffffffffL) * (b[2] & 0xffffffffL);
carry += t >>> 32;
sum += t & 0xffffffffL;
carry += sum >>> 32;
ret[6] = (int)sum;
t = (a[1] & 0xffffffffL) * (b[3] & 0xffffffffL);
sum = (t & 0xffffffffL) + carry;
carry = t >>> 32;
t = (a[2] & 0xffffffffL) * (b[2] & 0xffffffffL);
carry += t >>> 32;
sum += t & 0xffffffffL;
t = (a[3] & 0xffffffffL) * (b[1] & 0xffffffffL);
sum += t & 0xffffffffL;
carry += (t >>> 32) + (sum >>> 32);
ret[5] = (int)sum;
t = (a[0] & 0xffffffffL) * (b[3] & 0xffffffffL);
sum = (t & 0xffffffffL) + carry;
carry = t >>> 32;
t = (a[1] & 0xffffffffL) * (b[2] & 0xffffffffL);
carry += t >>> 32;
sum += t & 0xffffffffL;
t = (a[2] & 0xffffffffL) * (b[1] & 0xffffffffL);
carry += t >>> 32;
sum += t & 0xffffffffL;
t = (a[3] & 0xffffffffL) * (b[0] & 0xffffffffL);
sum += t & 0xffffffffL;
carry += (t >>> 32) + (sum >>> 32);
ret[4] = (int)sum;
t = (a[0] & 0xffffffffL) * (b[2] & 0xffffffffL);
sum = (t & 0xffffffffL) + carry;
carry = t >>> 32;
t = (a[1] & 0xffffffffL) * (b[1] & 0xffffffffL);
carry += t >>> 32;
sum += t & 0xffffffffL;
t = (a[2] & 0xffffffffL) * (b[0] & 0xffffffffL);
sum += t & 0xffffffffL;
carry += (t >>> 32) + (sum >>> 32);
ret[3] = (int)sum;
t = (a[0] & 0xffffffffL) * (b[1] & 0xffffffffL);
sum = (t & 0xffffffffL) + carry;
carry = t >>> 32;
t = (a[1] & 0xffffffffL) * (b[0] & 0xffffffffL);
sum += t & 0xffffffffL;
carry += (t >>> 32) + (sum >>> 32);
ret[2] = (int)sum;
t = (a[0] & 0xffffffffL) * (b[0] & 0xffffffffL);
sum = (t & 0xffffffffL) + carry;
ret[1] = (int)sum;
ret[0] = (int)((t >>> 32) + (sum >>> 32));
return ret;
}
public static int[] mul(int[] a, int[] b, int[] r)
{
final int sign = (a[0] ^ b[0]) & 0x80000000;
final int e0 = (a[0] >> 16) & 0x7fff;
final int e1 = (b[0] >> 16) & 0x7fff;
// We flush denormals automatically
if(e0 == 0 || e1 == 0)
{
r[0] = r[1] = r[2] = r[3] = 0;
return r;
}
final int[] temp = mantMul(a, b, new int[8]);
final int s = (temp[0] >> 1);
final int exp = e0 + e1 - 16383 + s;
if(exp <= 0)
{
r[0] = r[1] = r[2] = r[3] = 0;
return r;
}
// TODO +/- inf
final int s0 = 16 - s;
final int s1 = 32 - s0;
r[0] = sign | (exp << 16) | (((temp[0] << s0) | (temp[1] >>> s1)) & 0xffff);
r[1] = (temp[1] << s0) | (temp[2] >>> s1);
r[2] = (temp[2] << s0) | (temp[3] >>> s1);
r[3] = (temp[3] << s0) | (temp[4] >>> s1);
return r;
}
private static int[] mantMul(int[] a, int[] b, int[] ret)
{
long t, carry, sum;
final int a0 = (a[0] & 0xffff) | 0x10000;
final int b0 = (b[0] & 0xffff) | 0x10000;
t = (a[3] & 0xffffffffL) * (b[3] & 0xffffffffL);
carry = t >>> 32;
ret[7] = (int)t;
t = (a[2] & 0xffffffffL) * (b[3] & 0xffffffffL);
sum = (t & 0xffffffffL) + carry;
carry = t >>> 32;
t = (a[3] & 0xffffffffL) * (b[2] & 0xffffffffL);
carry += t >>> 32;
sum += t & 0xffffffffL;
carry += sum >>> 32;
ret[6] = (int)sum;
t = (a[1] & 0xffffffffL) * (b[3] & 0xffffffffL);
sum = (t & 0xffffffffL) + carry;
carry = t >>> 32;
t = (a[2] & 0xffffffffL) * (b[2] & 0xffffffffL);
carry += t >>> 32;
sum += t & 0xffffffffL;
t = (a[3] & 0xffffffffL) * (b[1] & 0xffffffffL);
sum += t & 0xffffffffL;
carry += (t >>> 32) + (sum >>> 32);
ret[5] = (int)sum;
t = a0 * (b[3] & 0xffffffffL);
sum = (t & 0xffffffffL) + carry;
carry = t >>> 32;
t = (a[1] & 0xffffffffL) * (b[2] & 0xffffffffL);
carry += t >>> 32;
sum += t & 0xffffffffL;
t = (a[2] & 0xffffffffL) * (b[1] & 0xffffffffL);
carry += t >>> 32;
sum += t & 0xffffffffL;
t = (a[3] & 0xffffffffL) * b0;
sum += t & 0xffffffffL;
carry += (t >>> 32) + (sum >>> 32);
ret[4] = (int)sum;
t = a0 * (b[2] & 0xffffffffL);
sum = (t & 0xffffffffL) + carry;
carry = t >>> 32;
t = (a[1] & 0xffffffffL) * (b[1] & 0xffffffffL);
carry += t >>> 32;
sum += t & 0xffffffffL;
t = (a[2] & 0xffffffffL) * b0;
sum += t & 0xffffffffL;
carry += (t >>> 32) + (sum >>> 32);
ret[3] = (int)sum;
t = a0 * (b[1] & 0xffffffffL);
sum = (t & 0xffffffffL) + carry;
carry = t >>> 32;
t = (a[1] & 0xffffffffL) * b0;
sum += t & 0xffffffffL;
carry += (t >>> 32) + (sum >>> 32);
ret[2] = (int)sum;
t = (long)a0 * (long)b0;
sum = (t & 0xffffffffL) + carry;
ret[1] = (int)sum;
ret[0] = (int)((t >>> 32) + (sum >>> 32));
return ret;
}
public static int[] fromDouble(double v, int[] n)
{
long bits = Double.doubleToRawLongBits(v);
int sign = (int)((bits >> 63) & 1);
int exp = ((int)(bits >> 52) & 2047) - 1023;
int m0 = (int)(bits >> 36) & 0xffff;
int m1 = (int)(bits >> 4);
int m2 = (int)(bits & 15) << 28;
n[0] = (sign << 31) | ((exp + 16383) << 16) | m0;
n[1] = m1;
n[2] = m2;
n[3] = 0;
return n;
}
private static String mantissaToHex(int[] n)
{
return String.format(n[0] < 0 ? "-%x%08x%08x%08x" : "%x%08x%08x%08x", (n[0] & 0xffff) | 0x10000, n[1], n[2], n[3]);
}
public static BigDecimal toBigDecimal(int[] n)
{
final BigDecimal mant = new BigDecimal(new BigInteger(mantissaToHex(n), 16)).divide(MAN_DIV);
int exp = getExponent(n);
final BigDecimal em = exp < 0 ? BigDecimal.ONE.divide(BD_2.pow(-exp)) : BD_2.pow(exp);
return mant.multiply(em);
}
public static String toString(int[] n)
{
if((n[0] & 0x7fff0000) == 0)
return "0";
return toBigDecimal(n).toString();
}
public static String toHexString(int[] n)
{
StringBuilder sb = new StringBuilder();
for(int i = 0; i < n.length; i++)
sb.append(String.format(i == 0 ? "%08x" : "%08x", n[i]));
return sb.toString();
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment