Skip to content

Instantly share code, notes, and snippets.

@jiahao
Created October 18, 2013 21:13
Show Gist options
  • Save jiahao/7048359 to your computer and use it in GitHub Desktop.
Save jiahao/7048359 to your computer and use it in GitHub Desktop.
Hypergeometric random variate generators implemented in Julia
# Hypergeometric random variate generators
#
# (c) 2013 Jiahao Chen <jiahao@mit.edu>
#
# This file implements several random variate generators as described in the
# reference paper.
#
# Algorithm Function
# HBU randhg_hbu
# HIN randhg_hin
# HALIAS randhg_alias
# H2PE randhg_h2pe
#
# Reference:
# "Computer generation of hypergeometric random variates",
# V. Kachitvichyanukul and B. Schmeiser, J. Stat. Comput. Simul. 22 (2),
# 1985, 127-145, doi:10.1080/00949658508810839
using Distributions
#################
# Algorithm HBU #
#################
#Naive generation using sampling without replacement
function randhg_hbu_core(n1::Int, n2::Int, k::Int)
n = n1 + n2
T, T1 = n, n1
x = 0
for J=0:min(k, n-k)
u = rand()
if u <= (T1/T)
x += 1
if x==n1 return x end
T1 -= 1
end
T -= 1
end
x
end
#For large k, swap the arguments around using symmetry of the hypergeometric
#distribution
function randhg_hbu(n1::Int, n2::Int, k::Int)
2k > (n1 + n2) ? n1 - randhg_hbu_core(n1, n2, n1+n2-k) : randhg_hbu_core(n1, n2, k)
end
#################
# Algorithm HIN #
#################
#Inverse transformation
function randhg_hin_core(n1::Int, n2::Int, k::Int)
n = n1+n2
if k<n2
#p = factorial(n2)*factorial(n-k)/(factorial(n)*factorial(n2-k))
p = exp(lfact(n2)+lfact(n-k)-lfact(n)-lfact(n2-k))
x = 0
else
#p = factorial(n1)*factorial(k)/(factorial(n)*factorial(k-n2))
p = exp(lfact(n1)+lfact(k)-lfact(n)-lfact(k-n2))
x = k-n2
end
u = rand()
while u > p #Typo in paper
u -= p
p *= (n1-x)*(k-x)/((x+1)*(n2-k+1+x))
x += 1
if x == n error("Inverse transformation did not converge") end
end
x
end
function randhg_hin(n1::Int, n2::Int, k::Int)
if n1 <= n2
if 2k <= (n1 + n2)
return randhg_hin_core(n1, n2, k)
else
return n1-randhg_hin_core(n1, n2, n1+n2-k)
end
else
if 2k <= (n1 + n2)
return k-randhg_hin_core(n2, n1, k)
else
return k-n2+randhg_hin_core(n2, n1, n1+n2-k)
end
end
end
####################
# Algorithm HALIAS #
####################
# The alias method
#XXX This doesn't work
function randhg_halias(n1::Int, n2::Int, k::Int)
#Step 1
IL = max(0, k-n2)
IU = min(k, n2)
IL, IU = min(IL, IU), max(IL, IU)
m = IU-IL+1
#Step 2. Calculate the hypergeometric probabilities for all mass points I_L...I_U
probabilities = pmf(HyperGeometric(n1, n2, k), IL:IU) #This really should be lowercase G
#Step 3. Setup the alias table A and table of its cutoff values f
A, F, _ = arbran(probabilities)
#To generate one hypergeometric random variate:
#Step 4.
u = rand() * m
i = int(ceil(u))
#Step 5.
x = (u>F[i]) ? A[i] : i
#Step 6.
x+IL-1
end
# Compute the aliases and cutoff values for the desired probability distribution.
# Input:
# e: The desired probability values
# n: number of variables
# Output:
# a: alias values
# f: cutoff values
# p: reconstructed probabilities
# Citation:
# "An Efficient Method for Generating Discrete Random Variables with General
# Distributions"
# A. J. Walker, ACM Trans. Math. Software 3 (3), 1977, 253-256
# doi:10.1145/355744.355749
function arbran{X<:Real}(e::Vector{X})
n = length(e)
error = .1e-5
#Initialize output
a = [1:n]
f = zeros(n)
b = e - 1/n
#Find largest positive and negative differences and their positions in b
c, k= minimum(b), indmin(b)
d, l= maximum(b), indmax(b)
#Test whether the sum of differences in b have become significant
if sum(abs(b)) >= error
#Assign alias and cutoff values
a[k], f[k], b[k], b[l] = l, 1+c*n, 0, c+d
end
#Computation of alias and cutoff values complete
#Now reconstruct the probabilities
p = f/n
for i=1:n, j=1:n
if (a[j]==i) p[i] += (1-f[j])/n end
end
a, f, p
end
##################
# Algorithm H2PE #
##################
# H2PE (Hypergeometric 2-Point Exponential) generator
#
# The new algorithm developed in Kachitvichyanukul and Schmeiser (1981).
#
# This takes an optional argument do_approx which controls whether or not to
# use the one-sided approximations in calculating the rescaled hypergeometric
# probability mass function, f. By default, do_approx=true, and is a faithful
# implementation of the H2PE algorithm; do_approx=false uses the exact method
# always.
function randhg_h2pe(n1✽::Int, n2✽::Int, k✽::Int, do_approx::Bool)
USE_HIN_THRESH = 10
#Step 0. Set up constants as function of n1✽, n2✽, k✽
#0.0. Determine parameter set for more efficient generation
n = n1✽ + n2✽
n1, n2 = n1✽>n2✽ ? (n2✽, n1✽) : (n1✽, n2✽)
k = 2k✽<=n ? k✽ : n-k✽
#0.1 Set up constants
M = floor((k+1)*(n1+1)/(n+2))
if (M-max(0,k-n2))<USE_HIN_THRESH return randhg_hin(n1✽, n2✽, k✽) end
A = lfact(M) + lfact(n1-M) + lfact(k-M) + lfact(n2-k+M)
D = 1.5*sqrt((n-k)*k*n1*n2/((n-1)*n*n)) + 0.5
xL, xR = M-D+0.5, M+D+0.5
kL = exp(A-lfact(xL)-lfact(n1-xL)-lfact(k-xL)-lfact(n2-k+xL))
#The paper has a sign error in the last term of kR
kR = exp(A-lfact(xR-1)-lfact(n1-xR+1)-lfact(k-xR+1)-lfact(n2-k+xR-1))
λL = -log(xL*(n2-k+xL)) + log((n1-xL+1)*(k-xL+1))
λR = -log((n1-xR+1)*(k-xR+1)) + log(xR*(n2-k+xR))
p1 = 2D
p2 = p1 + kL/λL #Typo in paper
p3 = p2 + kR/λR
y = 0 #Need to define outside to obey Julia scoping rules
#Step 1. Begin logic to generate one hypergeometric variate y.
# Generate u~U(0,p3) for selecting the region,
# v~U(0,1) for the accept/reject decision.
# If region 1 is selected, generate a uniform variate between xL and xR.
while true #Keep sampling until we find an acceptable variate
u, v = rand()*p3, rand()
if u > p1
#Step 2. Region 2, left exponential tail
if u <= p2
y = int(floor(xL + log(v)/λL))
if y < max(0, k-n2) continue end
v *= (u-p1)*λL
else
#Step 3. Region 3. right exponential tail
y = int(floor(xR - log(v)/λR))
if y > min(n1, k) continue end
v *= (u-p2)*λR
end
else
y = int(floor(xL + u))
end
#Step 4. Acceptance/rejection comparison
#4.0 test for appropriate method of evaluating f(y)
if !(do_approx && (M >=100 && y > 50)) #Use exact method
#4.1 Evaluate f(y) recursively via
#\[
# f(y+1) = f(y) \frac{(n1-y)(k-y)}{(y+1)(n2-k+y+1}
#\]
#Start the search from the mode
f = 1.0
if M < y
for i = M:y
f *= (n1-i+1)*(k-i+1)/(i*(n2-k+i))
end
else
if M > y
for i=y:M
f *= i*(n2-k+i)/((n1-i)*(k-i))
end
end
end
if v>f continue else break end
else #Use approximate method
#4.2 Squeezing. Check the value of ln(v) against upper and lower bound of
#ln(f).
x, y1, yM = y, y+1, y-M
yn, yk, nk = n1-y+1, k-y+1, n2-k+y1
R, S, T = -yM/y1, yM/yn, yM/yk
E, G = -yM/nk, yn * yk / (y1 * nk) - 1
DG = G>=0 ? 1 : 1 + G
GU = G*(1 + G*(-0.5+G/3.0))
GL = GU - G^4/(4DG)
XM, Xn, Xk = M+0.5, n1-M+0.5, k-M+0.5
nM = n2-k+XM
Ub = XM*R*(1+R*(-0.5+R/3.0))
+ Xn*S*(1+S*(-0.5+S/3.0))
+ Xk*T*(1+T*(-0.5+T/3.0))
+ nM*E*(1+E*(-0.5+E/3.0)) + y*GU - M*GL + 0.0034
Av = log(v)
if Av > Ub continue end
DR = XM * R^4
if R < 0 DR /= 1+R end
DS = Xn * S^4
if S < 0 DS /= 1+S end
DT = Xk * T^4
if T < 0 DT /= 1+T end
DE = nM * E^4
if E < 0 DE /= 1+E end
if Av < Ub-0.25*(DR + DS + DT + DE) + (y+M)*(GL-GU)-0.0078 break end
#4.3 Final acceptance/rejection test
if Av <= A-lfact(y)-lfact(n1-y)-lfact(k-y)-lfact(n2-k+y) break end
end #one-sided approximations
end #keep sampling until break condition
#Step 5. Return appropriate random variate
if 2k✽ < n #5.1
return n1✽<=n2✽ ? y : k✽-y
else #5.2
return n1✽<=n2✽ ? n1✽-y : k✽-n2✽-y
end
end
#The default is to do the approximation
randhg_h2pe(n1::Int, n2::Int, k::Int)=randhg_h2pe(n1, n2, k::Int, true)
#######################################################
# A very simple test driver to compare the algorithms #
#######################################################
NN = 100000 #Number of trials
for (n1, n2, k) in [(20, 20, 20), (100, 100, 20), (1000, 1000, 20), (10000,
100, 1000), (100, 100000, 10000), (1000, 10000, 1000), (10000, 1000, 1000),
(1000, 100000, 10000), (10000, 100, 10000)] #The parameters used in Table 1
@printf("\nGenerating %d random hypergeometric variates with n1=%d, n2=%d, k=%d\n",
NN, n1, n2, k)
@printf("Algorithm mean \tstd. dev.\t runtime\n")
x1 = @elapsed z1 = [randhg_hbu(n1, n2, k) for t=1:NN]
@printf("HBU\t%10.5f\t%10.5f\t (%10.5f seconds)\n", mean(z1), std(z1), x1)
x2 = @elapsed z2 = [randhg_hin(n1, n2, k) for t=1:NN]
@printf("HIN\t%10.5f\t%10.5f\t (%10.5f seconds)\n", mean(z2), std(z2), x2)
#x3 = @elapsed z3 = [randhg_halias(n1, n2, k) for t=1:NN]
#@printf("HALIAS\t%10.5f\t%10.5f\t (%10.5f seconds)\n", mean(z3), std(z3), x3)
x3 = @elapsed z3 = [randhg_h2pe(n1, n2, k) for t=1:NN]
@printf("H2PE\t%10.5f\t%10.5f\t (%10.5f seconds)\n", mean(z3), std(z3), x3)
#Exact variant of H2PE
x4 = @elapsed z4 = [randhg_h2pe(n1, n2, k, false) for t=1:NN]
@printf("H2PEx\t%10.5f\t%10.5f\t (%10.5f seconds)\n", mean(z4), std(z4), x4)
#The rng in Distributions
Z = HyperGeometric(n1, n2, k)
x5 = @elapsed z5 = [rand(Z) for t=1:NN]
@printf("Julia\t%10.5f\t%10.5f\t (%10.5f seconds)\n", mean(z5), std(z5), x5)
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment