ptomato/floatmath.mjs

## floatmath.mjs
// The following are line-by-line ports of the fma() and remquo()
// implementations from Openlibm, the math implementation of the Julia language.

// fma()
// Copyright (c) 2005-2011 David Schultz <das@FreeBSD.ORG>
//   2-clause BSD license.

// remquo()
// Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
//   Developed at SunSoft, a Sun Microsystems, Inc. business. Permission to use,
//   copy, modify, and distribute this software is freely granted, provided that
//   this notice is preserved.

import copysign from 'math-float64-copysign';
import frexp from 'math-float64-frexp';
import u32f64 from 'math-float64-from-words';
import ldexp from 'math-float64-ldexp';
import f64u32 from 'math-float64-to-words';

const MathAbs = Math.abs;
const MathLog2 = Math.log2;
const MathPow = Math.pow;
const NumberIsFinite = Number.isFinite;
const NumberIsNaN = Number.isNaN;

// Various floating-point info constants

const MANT_DIG = 53;
const DBL_MIN = MathPow(2, -1022);

// The dd functions represent a floating-point number with twice the precision
// of a JS Number. We maintain the invariant that "hi" stores the 53 high-order
// bits of the result.

/*
 * Compute a + b exactly, returning the exact result in a dd record. We assume
 * that both a and b are finite, but make no assumptions about their relative
 * magnitudes.
 */
function ddAdd(a, b) {
  const hi = a + b;
  const s = hi - a;
  const lo = a - (hi - s) + (b - s);
  return { hi, lo };
}

/*
 * Compute a * b exactly, returning the exact result in a dd record. We assume
 * that both a and b are normalized, so no underflow or overflow will occur.
 * In C, the system's floating-point rounding mode must be FE_TONEAREST, but
 * that rounding mode is already mandated by the ECMAScript spec.
 */
const SPLIT = MathPow(2, 27) + 1;
function ddMul(a, b) {
  let p = a * SPLIT;
  let ha = a - p;
  ha += p;
  const la = a - ha;

  p = b * SPLIT;
  let hb = b - p;
  hb += p;
  const lb = b - hb;

  p = ha * hb;
  const q = ha * lb + la * hb;

  const hi = p + q;
  const lo = p - hi + q + la * lb;
  return { hi, lo };
}

/*
 * Compute a+b, with a small tweak: The least significant bit of the result is
 * adjusted into a sticky bit summarizing all the bits that were lost to
 * rounding. This adjustment negates the effects of double rounding when the
 * result is added to another number with a higher exponent. For an explanation
 * of round and sticky bits, see any reference on FPU design, e.g.,
 *
 *     J. Coonen.  An Implementation Guide to a Proposed Standard for
 *     Floating-Point Arithmetic.  Computer, vol. 13, no. 1, Jan 1980.
 */
function addAdjusted(a, b) {
  const sum = ddAdd(a, b);
  if (sum.lo !== 0) {
    let [hh, lh] = f64u32(sum.hi);
    if (lh & 1 === 0) {
      lh += copysign(1, sum.hi * sum.lo);
      hh += lh >>> 32;
      lh &= 0xffff_ffff;
      sum.hi = u32f64(hh, lh);
    }
  }
  return sum.hi;
}

/*
 * Compute ldexp(a + b, scale) with a single rounding error. It is assumed that
 * the result will be subnormal, and care is taken to ensure that double
 * rounding does not occur.
 */
function addAndDenormalize(a, b, scale) {
  const sum = ddAdd(a, b);

  // If we are losing at least two bits of accuracy to denormalization, then the
  // first lost bit becomes a round bit, and we adjust the lowest bit of sum.hi
  // to make it a sticky bit summarizing all the bits in sum.lo. With the sticky
  // bit adjusted, the hardware will break any ties in the correct direction.
  //
  // If we are losing only one bit to denormalization, however, we must break
  // the ties manually.
  if (sum.lo !== 0) {
    let [hh, lh] = f64u32(sum.hi);
    const bitsLost = -((hh >>> 20) & 0x7ff) - scale + 1;
    if ((bitsLost !== 1) ^ (lh & 1)) {
      lh += copysign(1, sum.hi * sum.lo);
      hh += lh >>> 32;
      lh &= 0xffff_ffff;
      sum.hi = u32f64(hh, lh);
    }
  }

  return ldexp(sum.hi, scale);
}

/*
 * Fused multiply-add: Compute x * y + z with a single rounding error.
 *
 * We use scaling to avoid overflow/underflow, along with the canonical
 * precision-doubling technique adapted from:
 *
 *  Dekker, T.  A Floating-Point Technique for Extending the Available
 *  Precision.  Numer. Math. 18, 224-242 (1971).
 *
 * https://github.com/JuliaMath/openlibm/blob/12f5ffcc990e16f4120d4bf607185243f5affcb8/src/s_fma.c
 */
export function fma(x, y, z) {
  // Handle special cases. The order of operations and the particular return
  // values here are crucial in handling special cases involving infinities,
  // NaNs, overflows, and signed zeroes correctly.
  if (x === 0 || y === 0) return x * y + z;
  if (z === 0) return x * y;
  if (!NumberIsFinite(x) || !NumberIsFinite(y)) return x * y + z;
  if (!NumberIsFinite(z)) return z;

  const [xs, ex] = frexp(x);
  const [ys, ey] = frexp(y);
  let [zs, ez] = frexp(z);
  let spread = ex + ey - ez;

  // If x * y and z are many orders of magnitude apart, the scaling will
  // overflow, so we handle these cases specially.
  if (spread < -MANT_DIG) return z;
  if (spread <= MANT_DIG * 2) {
    zs = ldexp(zs, -spread);
  } else {
    zs = copysign(DBL_MIN, zs);
  }

  // Basic approach:
  //   (xy.hi, xy.lo) = x * y   (exact)
  //   (r.hi, r.lo) = xy.hi + z (exact)
  //   adj = xy.lo + r.lo       (inexact; low bit is sticky)
  //   result = r.hi + adj      (correctly rounded)
  const xy = ddMul(xs, ys);
  const r = ddAdd(xy.hi, zs);

  spread = ex + ey;

  // When the addends cancel to 0, ensure that the result has the correct sign.
  if (r.hi === 0) return xy.hi + zs + ldexp(xy.lo, spread);

  const adj = addAdjusted(r.lo, xy.lo);
  if (spread + ilogb(r.hi) > -1023) return ldexp(r.hi + adj, spread);
  return addAndDenormalize(r.hi, adj, spread);
}

// ilogb(x) is the same as Math.log2(Math.abs(x)), but with integer sentinel
// return values for incalculable inputs, instead of NaN
const MAXINT32 = 2 ** 31 - 1;
const FP_ILOGB0 = -MAXINT32;
const FP_ILOGBNAN = MAXINT32;
export function ilogb(x) {
  if (x === 0) return FP_ILOGB0;
  if (NumberIsNaN(x)) return FP_ILOGBNAN;
  if (!NumberIsFinite(x)) return MAXINT32;
  return MathLog2(MathAbs(x)) | 0;
}

/*
 * Return [rem, quo] with rem the IEEE remainder and quo the last n bits of the
 * quotient, rounded to the nearest integer. We choose n = 31 because we wind up
 * computing all the integer bits of the quotient anyway as a side-effect of
 * computing the remainder by the shift and subtract method. In practice, this
 * is far more bits than are needed to use remquo in reduction algorithms.
 *
 * https://github.com/JuliaMath/openlibm/blob/12f5ffcc990e16f4120d4bf607185243f5affcb8/src/s_remquo.c
 */
export function remquo(x, y) {
  let [hx, lx] = f64u32(x);
  let [hy, ly] = f64u32(y);
  const sxy = (hx ^ hy) & 0x8000_0000;
  const sx = hx & 0x8000_0000; // sign of x
  hx ^= sx; // |x|
  hy &= 0x7fff_ffff; // |y|

  // Purge off exception values
  if (y === 0 || !NumberIsFinite(x) || NumberIsNaN(y)) return [(x * y) / (x * y), NaN];
  if (hx <= hy) {
    // |x|<|y| return x or x - y
    if (hx < hy || lx < ly) return remquoFixup(0, hx, lx, y, sx, sxy);
    // |x|=|y| return x * 0
    if (lx === ly) return [x * 0, 1];
  }

  const ix = ilogb(x);
  let iy = ilogb(y);

  // set up {hx,lx}, {hy,ly} and align y to x
  if (ix >= -1022) {
    hx = 0x0010_0000 | (0x000f_ffff & hx);
  } else {
    // subnormal x, shift x to normal
    const n = -1022 - ix;
    if (n <= 31) {
      hx = (hx << n) | (lx >> (32 - n));
      lx <<= n;
    } else {
      hx = lx << (n - 32);
      lx = 0;
    }
  }
  if (iy >= -1022) {
    hy = 0x0010_0000 | (0x000f_ffff & hy);
  } else {
    // subnormal y, shift y to normal
    const n = -1022 - iy;
    if (n <= 31) {
      hy = (hy << n) | (ly >> (32 - n));
      ly <<= n;
    } else {
      hy = ly << (n - 32);
      ly = 0;
    }
  }

  // fix point fmod
  let n = ix - iy;
  let q = 0;
  while (n--) {
    let hz = hx - hy;
    const lz = lx - ly;
    if (lx < ly) hz -= 1;
    if (hz < 0) {
      hx = hx + hx + (lx >>> 31);
      lx = lx + lx;
    } else {
      hx = hz + hz + (lz >>> 31);
      lx = lz + lz;
      q++;
    }
    q <<= 1;
  }
  let hz = hx - hy;
  const lz = lx - ly;
  if (lx < ly) hz -= 1;
  if (hz >= 0) {
    hx = hz;
    lx = lz;
    q++;
  }

  // convert back to floating value and restore the sign
  if ((hx | lx) === 0) {
    // return sign(x) * 0
    const quo = sxy ? -q : q;
    return [sx * 0, quo];
  }
  while (hx < 0x00100000) {
    // normalize x
    hx = hx + hx + (lx >>> 31);
    lx = lx + lx;
    iy -= 1;
  }
  if (iy >= -1022) {
    // normalize output
    hx = (hx - 0x00100000) | ((iy + 1023) << 20);
  } else {
    // subnormal output
    n = -1022 - iy;
    if (n <= 20) {
      lx = (lx >> n) | (hx << (32 - n));
      hx >>= n;
    } else if (n <= 31) {
      lx = (hx << (32 - n)) | (lx >>> n);
      hx = sx;
    } else {
      lx = hx >> (n - 32);
      hx = sx;
    }
  }
  return remquoFixup(q, hx, lx, y, sx, sxy);
}
const TWOM1021 = MathPow(2, -1021);
function remquoFixup(q, hx, lx, y, sx, sxy) {
  let x = u32f64(hx, lx);
  y = MathAbs(y);
  if (y < TWOM1021) {
    if (x + x > y || (x + x === y && q & 1)) {
      q++;
      x -= y;
    }
  } else if (x > 0.5 * y || (x === 0.5 * y && q & 1)) {
    q++;
    x -= y;
  }
  [hx, lx] = f64u32(x);
  x = u32f64(hx ^ sx, lx);
  q &= 0x7fff_ffff;
  const quo = sxy ? -q : q;
  return [x, quo];
}
	// The following are line-by-line ports of the fma() and remquo()
	// implementations from Openlibm, the math implementation of the Julia language.

	// fma()
	// Copyright (c) 2005-2011 David Schultz <das@FreeBSD.ORG>
	// 2-clause BSD license.

	// remquo()
	// Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
	// Developed at SunSoft, a Sun Microsystems, Inc. business. Permission to use,
	// copy, modify, and distribute this software is freely granted, provided that
	// this notice is preserved.

	import copysign from 'math-float64-copysign';
	import frexp from 'math-float64-frexp';
	import u32f64 from 'math-float64-from-words';
	import ldexp from 'math-float64-ldexp';
	import f64u32 from 'math-float64-to-words';

	const MathAbs = Math.abs;
	const MathLog2 = Math.log2;
	const MathPow = Math.pow;
	const NumberIsFinite = Number.isFinite;
	const NumberIsNaN = Number.isNaN;

	// Various floating-point info constants

	const MANT_DIG = 53;
	const DBL_MIN = MathPow(2, -1022);

	// The dd functions represent a floating-point number with twice the precision
	// of a JS Number. We maintain the invariant that "hi" stores the 53 high-order
	// bits of the result.

	/*
	* Compute a + b exactly, returning the exact result in a dd record. We assume
	* that both a and b are finite, but make no assumptions about their relative
	* magnitudes.
	*/
	function ddAdd(a, b) {
	const hi = a + b;
	const s = hi - a;
	const lo = a - (hi - s) + (b - s);
	return { hi, lo };
	}

	/*
	* Compute a * b exactly, returning the exact result in a dd record. We assume
	* that both a and b are normalized, so no underflow or overflow will occur.
	* In C, the system's floating-point rounding mode must be FE_TONEAREST, but
	* that rounding mode is already mandated by the ECMAScript spec.
	*/
	const SPLIT = MathPow(2, 27) + 1;
	function ddMul(a, b) {
	let p = a * SPLIT;
	let ha = a - p;
	ha += p;
	const la = a - ha;

	p = b * SPLIT;
	let hb = b - p;
	hb += p;
	const lb = b - hb;

	p = ha * hb;
	const q = ha * lb + la * hb;

	const hi = p + q;
	const lo = p - hi + q + la * lb;
	return { hi, lo };
	}

	/*
	* Compute a+b, with a small tweak: The least significant bit of the result is
	* adjusted into a sticky bit summarizing all the bits that were lost to
	* rounding. This adjustment negates the effects of double rounding when the
	* result is added to another number with a higher exponent. For an explanation
	* of round and sticky bits, see any reference on FPU design, e.g.,
	*
	* J. Coonen. An Implementation Guide to a Proposed Standard for
	* Floating-Point Arithmetic. Computer, vol. 13, no. 1, Jan 1980.
	*/
	function addAdjusted(a, b) {
	const sum = ddAdd(a, b);
	if (sum.lo !== 0) {
	let [hh, lh] = f64u32(sum.hi);
	if (lh & 1 === 0) {
	lh += copysign(1, sum.hi * sum.lo);
	hh += lh >>> 32;
	lh &= 0xffff_ffff;
	sum.hi = u32f64(hh, lh);
	}
	}
	return sum.hi;
	}

	/*
	* Compute ldexp(a + b, scale) with a single rounding error. It is assumed that
	* the result will be subnormal, and care is taken to ensure that double
	* rounding does not occur.
	*/
	function addAndDenormalize(a, b, scale) {
	const sum = ddAdd(a, b);

	// If we are losing at least two bits of accuracy to denormalization, then the
	// first lost bit becomes a round bit, and we adjust the lowest bit of sum.hi
	// to make it a sticky bit summarizing all the bits in sum.lo. With the sticky
	// bit adjusted, the hardware will break any ties in the correct direction.
	//
	// If we are losing only one bit to denormalization, however, we must break
	// the ties manually.
	if (sum.lo !== 0) {
	let [hh, lh] = f64u32(sum.hi);
	const bitsLost = -((hh >>> 20) & 0x7ff) - scale + 1;
	if ((bitsLost !== 1) ^ (lh & 1)) {
	lh += copysign(1, sum.hi * sum.lo);
	hh += lh >>> 32;
	lh &= 0xffff_ffff;
	sum.hi = u32f64(hh, lh);
	}
	}

	return ldexp(sum.hi, scale);
	}

	/*
	* Fused multiply-add: Compute x * y + z with a single rounding error.
	*
	* We use scaling to avoid overflow/underflow, along with the canonical
	* precision-doubling technique adapted from:
	*
	* Dekker, T. A Floating-Point Technique for Extending the Available
	* Precision. Numer. Math. 18, 224-242 (1971).
	*
	* https://github.com/JuliaMath/openlibm/blob/12f5ffcc990e16f4120d4bf607185243f5affcb8/src/s_fma.c
	*/
	export function fma(x, y, z) {
	// Handle special cases. The order of operations and the particular return
	// values here are crucial in handling special cases involving infinities,
	// NaNs, overflows, and signed zeroes correctly.
	if (x === 0 \|\| y === 0) return x * y + z;
	if (z === 0) return x * y;
	if (!NumberIsFinite(x) \|\| !NumberIsFinite(y)) return x * y + z;
	if (!NumberIsFinite(z)) return z;

	const [xs, ex] = frexp(x);
	const [ys, ey] = frexp(y);
	let [zs, ez] = frexp(z);
	let spread = ex + ey - ez;

	// If x * y and z are many orders of magnitude apart, the scaling will
	// overflow, so we handle these cases specially.
	if (spread < -MANT_DIG) return z;
	if (spread <= MANT_DIG * 2) {
	zs = ldexp(zs, -spread);
	} else {
	zs = copysign(DBL_MIN, zs);
	}

	// Basic approach:
	// (xy.hi, xy.lo) = x * y (exact)
	// (r.hi, r.lo) = xy.hi + z (exact)
	// adj = xy.lo + r.lo (inexact; low bit is sticky)
	// result = r.hi + adj (correctly rounded)
	const xy = ddMul(xs, ys);
	const r = ddAdd(xy.hi, zs);

	spread = ex + ey;

	// When the addends cancel to 0, ensure that the result has the correct sign.
	if (r.hi === 0) return xy.hi + zs + ldexp(xy.lo, spread);

	const adj = addAdjusted(r.lo, xy.lo);
	if (spread + ilogb(r.hi) > -1023) return ldexp(r.hi + adj, spread);
	return addAndDenormalize(r.hi, adj, spread);
	}

	// ilogb(x) is the same as Math.log2(Math.abs(x)), but with integer sentinel
	// return values for incalculable inputs, instead of NaN
	const MAXINT32 = 2 ** 31 - 1;
	const FP_ILOGB0 = -MAXINT32;
	const FP_ILOGBNAN = MAXINT32;
	export function ilogb(x) {
	if (x === 0) return FP_ILOGB0;
	if (NumberIsNaN(x)) return FP_ILOGBNAN;
	if (!NumberIsFinite(x)) return MAXINT32;
	return MathLog2(MathAbs(x)) \| 0;
	}

	/*
	* Return [rem, quo] with rem the IEEE remainder and quo the last n bits of the
	* quotient, rounded to the nearest integer. We choose n = 31 because we wind up
	* computing all the integer bits of the quotient anyway as a side-effect of
	* computing the remainder by the shift and subtract method. In practice, this
	* is far more bits than are needed to use remquo in reduction algorithms.
	*
	* https://github.com/JuliaMath/openlibm/blob/12f5ffcc990e16f4120d4bf607185243f5affcb8/src/s_remquo.c
	*/
	export function remquo(x, y) {
	let [hx, lx] = f64u32(x);
	let [hy, ly] = f64u32(y);
	const sxy = (hx ^ hy) & 0x8000_0000;
	const sx = hx & 0x8000_0000; // sign of x
	hx ^= sx; // \|x\|
	hy &= 0x7fff_ffff; // \|y\|

	// Purge off exception values
	if (y === 0 \|\| !NumberIsFinite(x) \|\| NumberIsNaN(y)) return [(x * y) / (x * y), NaN];
	if (hx <= hy) {
	// \|x\|<\|y\| return x or x - y
	if (hx < hy \|\| lx < ly) return remquoFixup(0, hx, lx, y, sx, sxy);
	// \|x\|=\|y\| return x * 0
	if (lx === ly) return [x * 0, 1];
	}

	const ix = ilogb(x);
	let iy = ilogb(y);

	// set up {hx,lx}, {hy,ly} and align y to x
	if (ix >= -1022) {
	hx = 0x0010_0000 \| (0x000f_ffff & hx);
	} else {
	// subnormal x, shift x to normal
	const n = -1022 - ix;
	if (n <= 31) {
	hx = (hx << n) \| (lx >> (32 - n));
	lx <<= n;
	} else {
	hx = lx << (n - 32);
	lx = 0;
	}
	}
	if (iy >= -1022) {
	hy = 0x0010_0000 \| (0x000f_ffff & hy);
	} else {
	// subnormal y, shift y to normal
	const n = -1022 - iy;
	if (n <= 31) {
	hy = (hy << n) \| (ly >> (32 - n));
	ly <<= n;
	} else {
	hy = ly << (n - 32);
	ly = 0;
	}
	}

	// fix point fmod
	let n = ix - iy;
	let q = 0;
	while (n--) {
	let hz = hx - hy;
	const lz = lx - ly;
	if (lx < ly) hz -= 1;
	if (hz < 0) {
	hx = hx + hx + (lx >>> 31);
	lx = lx + lx;
	} else {
	hx = hz + hz + (lz >>> 31);
	lx = lz + lz;
	q++;
	}
	q <<= 1;
	}
	let hz = hx - hy;
	const lz = lx - ly;
	if (lx < ly) hz -= 1;
	if (hz >= 0) {
	hx = hz;
	lx = lz;
	q++;
	}

	// convert back to floating value and restore the sign
	if ((hx \| lx) === 0) {
	// return sign(x) * 0
	const quo = sxy ? -q : q;
	return [sx * 0, quo];
	}
	while (hx < 0x00100000) {
	// normalize x
	hx = hx + hx + (lx >>> 31);
	lx = lx + lx;
	iy -= 1;
	}
	if (iy >= -1022) {
	// normalize output
	hx = (hx - 0x00100000) \| ((iy + 1023) << 20);
	} else {
	// subnormal output
	n = -1022 - iy;
	if (n <= 20) {
	lx = (lx >> n) \| (hx << (32 - n));
	hx >>= n;
	} else if (n <= 31) {
	lx = (hx << (32 - n)) \| (lx >>> n);
	hx = sx;
	} else {
	lx = hx >> (n - 32);
	hx = sx;
	}
	}
	return remquoFixup(q, hx, lx, y, sx, sxy);
	}
	const TWOM1021 = MathPow(2, -1021);
	function remquoFixup(q, hx, lx, y, sx, sxy) {
	let x = u32f64(hx, lx);
	y = MathAbs(y);
	if (y < TWOM1021) {
	if (x + x > y \|\| (x + x === y && q & 1)) {
	q++;
	x -= y;
	}
	} else if (x > 0.5 * y \|\| (x === 0.5 * y && q & 1)) {
	q++;
	x -= y;
	}
	[hx, lx] = f64u32(x);
	x = u32f64(hx ^ sx, lx);
	q &= 0x7fff_ffff;
	const quo = sxy ? -q : q;
	return [x, quo];
	}