Skip to content

Instantly share code, notes, and snippets.

@ptomato
Created November 28, 2023 23:55
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ptomato/d1ac949bbd3e5719766e770b7c0415bb to your computer and use it in GitHub Desktop.
Save ptomato/d1ac949bbd3e5719766e770b7c0415bb to your computer and use it in GitHub Desktop.
fma() and extra-precise remquo() implemented using JS Numbers
// The following are line-by-line ports of the fma() and remquo()
// implementations from Openlibm, the math implementation of the Julia language.
// fma()
// Copyright (c) 2005-2011 David Schultz <das@FreeBSD.ORG>
// 2-clause BSD license.
// remquo()
// Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
// Developed at SunSoft, a Sun Microsystems, Inc. business. Permission to use,
// copy, modify, and distribute this software is freely granted, provided that
// this notice is preserved.
import copysign from 'math-float64-copysign';
import frexp from 'math-float64-frexp';
import u32f64 from 'math-float64-from-words';
import ldexp from 'math-float64-ldexp';
import f64u32 from 'math-float64-to-words';
const MathAbs = Math.abs;
const MathLog2 = Math.log2;
const MathPow = Math.pow;
const NumberIsFinite = Number.isFinite;
const NumberIsNaN = Number.isNaN;
// Various floating-point info constants
const MANT_DIG = 53;
const DBL_MIN = MathPow(2, -1022);
// The dd functions represent a floating-point number with twice the precision
// of a JS Number. We maintain the invariant that "hi" stores the 53 high-order
// bits of the result.
/*
* Compute a + b exactly, returning the exact result in a dd record. We assume
* that both a and b are finite, but make no assumptions about their relative
* magnitudes.
*/
function ddAdd(a, b) {
const hi = a + b;
const s = hi - a;
const lo = a - (hi - s) + (b - s);
return { hi, lo };
}
/*
* Compute a * b exactly, returning the exact result in a dd record. We assume
* that both a and b are normalized, so no underflow or overflow will occur.
* In C, the system's floating-point rounding mode must be FE_TONEAREST, but
* that rounding mode is already mandated by the ECMAScript spec.
*/
const SPLIT = MathPow(2, 27) + 1;
function ddMul(a, b) {
let p = a * SPLIT;
let ha = a - p;
ha += p;
const la = a - ha;
p = b * SPLIT;
let hb = b - p;
hb += p;
const lb = b - hb;
p = ha * hb;
const q = ha * lb + la * hb;
const hi = p + q;
const lo = p - hi + q + la * lb;
return { hi, lo };
}
/*
* Compute a+b, with a small tweak: The least significant bit of the result is
* adjusted into a sticky bit summarizing all the bits that were lost to
* rounding. This adjustment negates the effects of double rounding when the
* result is added to another number with a higher exponent. For an explanation
* of round and sticky bits, see any reference on FPU design, e.g.,
*
* J. Coonen. An Implementation Guide to a Proposed Standard for
* Floating-Point Arithmetic. Computer, vol. 13, no. 1, Jan 1980.
*/
function addAdjusted(a, b) {
const sum = ddAdd(a, b);
if (sum.lo !== 0) {
let [hh, lh] = f64u32(sum.hi);
if (lh & 1 === 0) {
lh += copysign(1, sum.hi * sum.lo);
hh += lh >>> 32;
lh &= 0xffff_ffff;
sum.hi = u32f64(hh, lh);
}
}
return sum.hi;
}
/*
* Compute ldexp(a + b, scale) with a single rounding error. It is assumed that
* the result will be subnormal, and care is taken to ensure that double
* rounding does not occur.
*/
function addAndDenormalize(a, b, scale) {
const sum = ddAdd(a, b);
// If we are losing at least two bits of accuracy to denormalization, then the
// first lost bit becomes a round bit, and we adjust the lowest bit of sum.hi
// to make it a sticky bit summarizing all the bits in sum.lo. With the sticky
// bit adjusted, the hardware will break any ties in the correct direction.
//
// If we are losing only one bit to denormalization, however, we must break
// the ties manually.
if (sum.lo !== 0) {
let [hh, lh] = f64u32(sum.hi);
const bitsLost = -((hh >>> 20) & 0x7ff) - scale + 1;
if ((bitsLost !== 1) ^ (lh & 1)) {
lh += copysign(1, sum.hi * sum.lo);
hh += lh >>> 32;
lh &= 0xffff_ffff;
sum.hi = u32f64(hh, lh);
}
}
return ldexp(sum.hi, scale);
}
/*
* Fused multiply-add: Compute x * y + z with a single rounding error.
*
* We use scaling to avoid overflow/underflow, along with the canonical
* precision-doubling technique adapted from:
*
* Dekker, T. A Floating-Point Technique for Extending the Available
* Precision. Numer. Math. 18, 224-242 (1971).
*
* https://github.com/JuliaMath/openlibm/blob/12f5ffcc990e16f4120d4bf607185243f5affcb8/src/s_fma.c
*/
export function fma(x, y, z) {
// Handle special cases. The order of operations and the particular return
// values here are crucial in handling special cases involving infinities,
// NaNs, overflows, and signed zeroes correctly.
if (x === 0 || y === 0) return x * y + z;
if (z === 0) return x * y;
if (!NumberIsFinite(x) || !NumberIsFinite(y)) return x * y + z;
if (!NumberIsFinite(z)) return z;
const [xs, ex] = frexp(x);
const [ys, ey] = frexp(y);
let [zs, ez] = frexp(z);
let spread = ex + ey - ez;
// If x * y and z are many orders of magnitude apart, the scaling will
// overflow, so we handle these cases specially.
if (spread < -MANT_DIG) return z;
if (spread <= MANT_DIG * 2) {
zs = ldexp(zs, -spread);
} else {
zs = copysign(DBL_MIN, zs);
}
// Basic approach:
// (xy.hi, xy.lo) = x * y (exact)
// (r.hi, r.lo) = xy.hi + z (exact)
// adj = xy.lo + r.lo (inexact; low bit is sticky)
// result = r.hi + adj (correctly rounded)
const xy = ddMul(xs, ys);
const r = ddAdd(xy.hi, zs);
spread = ex + ey;
// When the addends cancel to 0, ensure that the result has the correct sign.
if (r.hi === 0) return xy.hi + zs + ldexp(xy.lo, spread);
const adj = addAdjusted(r.lo, xy.lo);
if (spread + ilogb(r.hi) > -1023) return ldexp(r.hi + adj, spread);
return addAndDenormalize(r.hi, adj, spread);
}
// ilogb(x) is the same as Math.log2(Math.abs(x)), but with integer sentinel
// return values for incalculable inputs, instead of NaN
const MAXINT32 = 2 ** 31 - 1;
const FP_ILOGB0 = -MAXINT32;
const FP_ILOGBNAN = MAXINT32;
export function ilogb(x) {
if (x === 0) return FP_ILOGB0;
if (NumberIsNaN(x)) return FP_ILOGBNAN;
if (!NumberIsFinite(x)) return MAXINT32;
return MathLog2(MathAbs(x)) | 0;
}
/*
* Return [rem, quo] with rem the IEEE remainder and quo the last n bits of the
* quotient, rounded to the nearest integer. We choose n = 31 because we wind up
* computing all the integer bits of the quotient anyway as a side-effect of
* computing the remainder by the shift and subtract method. In practice, this
* is far more bits than are needed to use remquo in reduction algorithms.
*
* https://github.com/JuliaMath/openlibm/blob/12f5ffcc990e16f4120d4bf607185243f5affcb8/src/s_remquo.c
*/
export function remquo(x, y) {
let [hx, lx] = f64u32(x);
let [hy, ly] = f64u32(y);
const sxy = (hx ^ hy) & 0x8000_0000;
const sx = hx & 0x8000_0000; // sign of x
hx ^= sx; // |x|
hy &= 0x7fff_ffff; // |y|
// Purge off exception values
if (y === 0 || !NumberIsFinite(x) || NumberIsNaN(y)) return [(x * y) / (x * y), NaN];
if (hx <= hy) {
// |x|<|y| return x or x - y
if (hx < hy || lx < ly) return remquoFixup(0, hx, lx, y, sx, sxy);
// |x|=|y| return x * 0
if (lx === ly) return [x * 0, 1];
}
const ix = ilogb(x);
let iy = ilogb(y);
// set up {hx,lx}, {hy,ly} and align y to x
if (ix >= -1022) {
hx = 0x0010_0000 | (0x000f_ffff & hx);
} else {
// subnormal x, shift x to normal
const n = -1022 - ix;
if (n <= 31) {
hx = (hx << n) | (lx >> (32 - n));
lx <<= n;
} else {
hx = lx << (n - 32);
lx = 0;
}
}
if (iy >= -1022) {
hy = 0x0010_0000 | (0x000f_ffff & hy);
} else {
// subnormal y, shift y to normal
const n = -1022 - iy;
if (n <= 31) {
hy = (hy << n) | (ly >> (32 - n));
ly <<= n;
} else {
hy = ly << (n - 32);
ly = 0;
}
}
// fix point fmod
let n = ix - iy;
let q = 0;
while (n--) {
let hz = hx - hy;
const lz = lx - ly;
if (lx < ly) hz -= 1;
if (hz < 0) {
hx = hx + hx + (lx >>> 31);
lx = lx + lx;
} else {
hx = hz + hz + (lz >>> 31);
lx = lz + lz;
q++;
}
q <<= 1;
}
let hz = hx - hy;
const lz = lx - ly;
if (lx < ly) hz -= 1;
if (hz >= 0) {
hx = hz;
lx = lz;
q++;
}
// convert back to floating value and restore the sign
if ((hx | lx) === 0) {
// return sign(x) * 0
const quo = sxy ? -q : q;
return [sx * 0, quo];
}
while (hx < 0x00100000) {
// normalize x
hx = hx + hx + (lx >>> 31);
lx = lx + lx;
iy -= 1;
}
if (iy >= -1022) {
// normalize output
hx = (hx - 0x00100000) | ((iy + 1023) << 20);
} else {
// subnormal output
n = -1022 - iy;
if (n <= 20) {
lx = (lx >> n) | (hx << (32 - n));
hx >>= n;
} else if (n <= 31) {
lx = (hx << (32 - n)) | (lx >>> n);
hx = sx;
} else {
lx = hx >> (n - 32);
hx = sx;
}
}
return remquoFixup(q, hx, lx, y, sx, sxy);
}
const TWOM1021 = MathPow(2, -1021);
function remquoFixup(q, hx, lx, y, sx, sxy) {
let x = u32f64(hx, lx);
y = MathAbs(y);
if (y < TWOM1021) {
if (x + x > y || (x + x === y && q & 1)) {
q++;
x -= y;
}
} else if (x > 0.5 * y || (x === 0.5 * y && q & 1)) {
q++;
x -= y;
}
[hx, lx] = f64u32(x);
x = u32f64(hx ^ sx, lx);
q &= 0x7fff_ffff;
const quo = sxy ? -q : q;
return [x, quo];
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment