Skip to content

Instantly share code, notes, and snippets.

@simonbyrne
Last active November 18, 2023 02:39
  • Star 4 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
Star You must be signed in to star a gist
Save simonbyrne/d640ac1c1db3e1774cf2d405049beef3 to your computer and use it in GitHub Desktop.
Payne-Hanek reduction in Julia
import Base: TwicePrecision, significand_bits, significand_mask, exponent_mask, exponent_bias
# Bits of 1/2π
# 1/2π == sum(x / 0x1p64^i for i,x = enumerate(INV2PI))
# Can be obtained by:
#
# setprecision(BigFloat, 4096)
# I = 0.5/big(pi)
# for i = 1:19
# I *= 0x1p64
# k = trunc(UInt64, I)
# @printf "0x%016x,\n" k
# I -= k
# end
#
const INV2PI = UInt64[
0x28be_60db_9391_054a,
0x7f09_d5f4_7d4d_3770,
0x36d8_a566_4f10_e410,
0x7f94_58ea_f7ae_f158,
0x6dc9_1b8e_9093_74b8,
0x0192_4bba_8274_6487,
0x3f87_7ac7_2c4a_69cf,
0xba20_8d7d_4bae_d121,
0x3a67_1c09_ad17_df90,
0x4e64_758e_60d4_ce7d,
0x2721_17e2_ef7e_4a0e,
0xc7fe_25ff_f781_6603,
0xfbcb_c462_d682_9b47,
0xdb4d_9fb3_c9f2_c26d,
0xd3d1_8fd9_a797_fa8b,
0x5d49_eeb1_faf9_7c5e,
0xcf41_ce7d_e294_a4ba,
0x9afe_d7ec_47e3_5742,
0x1580_cc11_bf1e_daea]
"""
paynehanek(x::Float64)::Tuple{Int, TwicePrecision{Float64}}
Performs Payne-Hanek reduction: computes the remainder of `x` modulo π/2 for values of `x` where
2.0^53 ≤ abs(x) < Inf
Returns a tuple `(q,y)`:
- `q` is an integer taking values 0,1,2 or 3, corresponding to the last 4 bits of the integer
div(x, π/2, RoundNearest) % 4
(computed without intermediate rounding),
- `y` is the remainder stored as a `TwicePrecision{Float64}` value, corresponding to
rem(x, π/2, RoundNearest)
that is, with `abs(y) ≤ π/4`.
"""
function paynehanek(x::Float64)
# 1. Convert to form
#
# x = X * 2^k,
#
# where 2^(n-1) <= X < 2^n is an n-bit integer (n = 53, k = exponent(x)-52 )
u = reinterpret(UInt64, x)
X = (u & significand_mask(Float64)) | (one(UInt64) << significand_bits(Float64))
k = Int((u & exponent_mask(Float64)) >> significand_bits(Float64)) - exponent_bias(Float64) - significand_bits(Float64)
# 2. Let α = 1/2π, then:
#
# α*x mod 1 ≡ [(α*2^k mod 1)*X] mod 1
#
# so we can ignore the first k bits of α. Extract the next 3 64-bit parts of α.
#
# i.e. equivalent to
# setprecision(BigFloat,4096)
# α = 1/(2*big(pi))
# A = mod(ldexp(α,k), 1)
# z1 = ldexp(A,64)
# a1 = trunc(UInt64, z1)
# z2 = ldexp(z1-a1, 64)
# a2 = trunc(UInt64, z2)
# z3 = ldexp(z2-a2, 64)
# a3 = trunc(UInt64, z3)
idx = k >> 6
shift = k - (idx << 6)
if shift == 0
a1 = INV2PI[idx+1]
a2 = INV2PI[idx+2]
a3 = INV2PI[idx+3]
else
a1 = (INV2PI[idx+1] << shift) | (INV2PI[idx+2] >> (64 - shift))
a2 = (INV2PI[idx+2] << shift) | (INV2PI[idx+3] >> (64 - shift))
a3 = (INV2PI[idx+3] << shift) | (INV2PI[idx+4] >> (64 - shift))
end
# 3. Perform the multiplication:
#
# X. 0 0 0
# × 0. a1 a2 a3
# ==============
# _. w w _
#
# (i.e. ignoring integer and lowest bit parts of result)
w1 = UInt128(X*a1) << 64 # overflow becomes integer
w2 = widemul(X,a2)
w3 = widemul(X,a3) >> 64
w = w1 + w2 + w3 # quotient fraction after division by 2π
# adjust for sign of x
w = flipsign(w,x)
# 4. convert to quadrant, quotient fraction after division by π/2:
q = (((w>>125)%Int +1)>>1) # nearest quadrant
f = (w<<2) % Int128 # fraction part of quotient after division by π/2, taking values on [-0.5,0.5)
# 5. convert quotient fraction to split precision Float64
z_hi,z_lo = fromfraction(f)
# 6. multiply by π/2
pio2_hi = 1.5707963407039642
pio2_lo = -1.3909067614167116e-8
y_hi = (z_hi+z_lo)*(pio2_hi+pio2_lo)
y_lo = (((z_hi*pio2_hi - y_hi) + z_hi*pio2_lo) + z_lo*pio2_hi) + z_lo*pio2_lo
return q, TwicePrecision(y_hi, y_lo)
end
"""
fromfraction(f::Int128)
Computes a tuple of values `(y1,y2)` such that
y1 + y2 == f / 2^128
and the significand of `y1` has 27 trailing zeros.
"""
function fromfraction(f::Int128)
if f == 0
return (0.0,0.0)
end
# 1. get leading term truncated to 26 bits
s = ((f < 0) % UInt64) << 63 # sign bit
x = abs(f) % UInt128 # magnitude
n1 = 128-leading_zeros(x) # ndigits0z(x,2)
m1 = ((x >> (n1-26)) % UInt64) << 27
d1 = ((n1-128+1021) % UInt64) << 52
z1 = reinterpret(Float64, s | (d1 + m1))
# 2. compute remaining term
x2 = (x - (UInt128(m1) << (n1-53)))
if x2 == 0
return (z1, 0.0)
end
n2 = 128-leading_zeros(x2)
m2 = (x2 >> (n2-53)) % UInt64
d2 = ((n2-128+1021) % UInt64) << 52
z2 = reinterpret(Float64, s | (d2 + m2))
return (z1,z2)
end
# Additional notes:
# - worst case (Muller et al, 2005, p 183) is x = 6381956970095103.0 * 2.0^797
# for which abs(rem(x,π/2, RoundNearest)) ≥ 2^-60.9
# - error of w is ~ 2^-128
# - error of f is ~ 2^-126
# - error of z is ~ |z|*2^-(53+26) + 2^-126
# - error of y is ~ 3*|y|*2^-(53+26) + 1.6*2^-126
# (TODO: check these)
# Total error ~ 2^-12 ulps
a = setprecision(BigFloat, 4096) do
x = 6381956970095103.0 * 2.0^797
rem(big(x), big(pi)/2, RoundNearest)
end
q,y = paynehanek(x)
diff = Float64(abs((a - y.hi) - y.lo))/eps(Float64(a))
log2(diff)
@shibatch
Copy link

Hello,
I also implemented Payne-Hanek reduction in my project. shibatch/sleef#197
I have a question on your implementation.
I believe that Payne and Henek's method is also implemented in fdlibm.
Did you compare your method with that implementation in terms of speed and accuracy?
There is no explanation as to how good your implementation is.

Thanks.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment