jiahao/Hypergeometric.jl

## Hypergeometric.jl
# Hypergeometric random variate generators
#
# (c) 2013 Jiahao Chen <jiahao@mit.edu>
#
# This file implements several random variate generators as described in the
# reference paper.
#
# Algorithm   Function
# HBU         randhg_hbu
# HIN         randhg_hin
# HALIAS      randhg_alias
# H2PE        randhg_h2pe
#
# Reference:
#   "Computer generation of hypergeometric random variates",
#   V. Kachitvichyanukul and B. Schmeiser, J. Stat. Comput. Simul. 22 (2),
#   1985, 127-145, doi:10.1080/00949658508810839

using Distributions

#################
# Algorithm HBU #
#################

#Naive generation using sampling without replacement
function randhg_hbu_core(n1::Int, n2::Int, k::Int)
  n = n1 + n2
  T, T1 = n, n1
  x = 0
  for J=0:min(k, n-k)
    u = rand()
    if u <= (T1/T)
      x += 1
      if x==n1 return x end
      T1 -= 1
    end
    T -= 1
  end
  x
end

#For large k, swap the arguments around using symmetry of the hypergeometric
#distribution
function randhg_hbu(n1::Int, n2::Int, k::Int)
  2k > (n1 + n2) ? n1 - randhg_hbu_core(n1, n2, n1+n2-k) : randhg_hbu_core(n1, n2, k)
end

#################
# Algorithm HIN #
#################

#Inverse transformation
function randhg_hin_core(n1::Int, n2::Int, k::Int)
  n = n1+n2
  if k<n2
    #p = factorial(n2)*factorial(n-k)/(factorial(n)*factorial(n2-k))
    p = exp(lfact(n2)+lfact(n-k)-lfact(n)-lfact(n2-k))
    x = 0
  else
    #p = factorial(n1)*factorial(k)/(factorial(n)*factorial(k-n2))
    p = exp(lfact(n1)+lfact(k)-lfact(n)-lfact(k-n2))
    x = k-n2
  end
  u = rand()
  while u > p #Typo in paper
    u -= p
    p *= (n1-x)*(k-x)/((x+1)*(n2-k+1+x))
    x += 1
    if x == n error("Inverse transformation did not converge") end
  end
  x
end

function randhg_hin(n1::Int, n2::Int, k::Int)
  if n1 <= n2
    if 2k <= (n1 + n2)
      return randhg_hin_core(n1, n2, k)
    else
      return n1-randhg_hin_core(n1, n2, n1+n2-k)
    end
  else
    if 2k <= (n1 + n2)
      return k-randhg_hin_core(n2, n1, k)
    else
      return k-n2+randhg_hin_core(n2, n1, n1+n2-k)
    end
  end
end

####################
# Algorithm HALIAS #
####################

# The alias method
#XXX This doesn't work
function randhg_halias(n1::Int, n2::Int, k::Int)
  #Step 1
  IL = max(0, k-n2)
  IU = min(k, n2)
  IL, IU = min(IL, IU), max(IL, IU)
  m = IU-IL+1
  #Step 2. Calculate the hypergeometric probabilities for all mass points I_L...I_U
  probabilities = pmf(HyperGeometric(n1, n2, k), IL:IU) #This really should be lowercase G
  #Step 3. Setup the alias table A and table of its cutoff values f
  A, F, _ = arbran(probabilities)
  #To generate one hypergeometric random variate:
  #Step 4.
  u = rand() * m
  i = int(ceil(u))
  #Step 5.
  x = (u>F[i]) ? A[i] : i
  #Step 6.
  x+IL-1
end

# Compute the aliases and cutoff values for the desired probability distribution.
# Input:
#   e: The desired probability values
#   n: number of variables
# Output:
#   a: alias values
#   f: cutoff values
#   p: reconstructed probabilities
# Citation:
#   "An Efficient Method for Generating Discrete Random Variables with General
#   Distributions"
#   A. J. Walker, ACM Trans. Math. Software 3 (3), 1977, 253-256
#   doi:10.1145/355744.355749
function arbran{X<:Real}(e::Vector{X})
  n = length(e)
  error = .1e-5
  #Initialize output
  a = [1:n]
  f = zeros(n)
  b = e - 1/n
  #Find largest positive and negative differences and their positions in b
  c, k= minimum(b), indmin(b)
  d, l= maximum(b), indmax(b)
  #Test whether the sum of differences in b have become significant
  if sum(abs(b)) >= error
    #Assign alias and cutoff values
    a[k], f[k], b[k], b[l] = l, 1+c*n, 0, c+d
  end
  #Computation of alias and cutoff values complete
  #Now reconstruct the probabilities
  p = f/n
  for i=1:n, j=1:n
    if (a[j]==i) p[i] += (1-f[j])/n end
  end
  a, f, p
end

##################
# Algorithm H2PE #
##################

# H2PE (Hypergeometric 2-Point Exponential) generator
#
# The new algorithm developed in Kachitvichyanukul and Schmeiser (1981).
#
# This takes an optional argument do_approx which controls whether or not to
# use the one-sided approximations in calculating the rescaled hypergeometric
# probability mass function, f. By default, do_approx=true, and is a faithful
# implementation of the H2PE algorithm; do_approx=false uses the exact method
# always.
function randhg_h2pe(n1✽::Int, n2✽::Int, k✽::Int, do_approx::Bool)
  USE_HIN_THRESH = 10
  #Step 0. Set up constants as function of n1✽, n2✽, k✽
  #0.0. Determine parameter set for more efficient generation
  n = n1✽ + n2✽
  n1, n2 = n1✽>n2✽ ? (n2✽, n1✽) : (n1✽, n2✽)
  k = 2k✽<=n ? k✽ : n-k✽
  #0.1 Set up constants
  M = floor((k+1)*(n1+1)/(n+2))
  if (M-max(0,k-n2))<USE_HIN_THRESH return randhg_hin(n1✽, n2✽, k✽) end
  A = lfact(M) + lfact(n1-M) + lfact(k-M) + lfact(n2-k+M)
  D = 1.5*sqrt((n-k)*k*n1*n2/((n-1)*n*n)) + 0.5
  xL, xR = M-D+0.5, M+D+0.5
  kL = exp(A-lfact(xL)-lfact(n1-xL)-lfact(k-xL)-lfact(n2-k+xL))
  #The paper has a sign error in the last term of kR
  kR = exp(A-lfact(xR-1)-lfact(n1-xR+1)-lfact(k-xR+1)-lfact(n2-k+xR-1))
  λL = -log(xL*(n2-k+xL)) + log((n1-xL+1)*(k-xL+1))
  λR = -log((n1-xR+1)*(k-xR+1)) + log(xR*(n2-k+xR))
  p1 = 2D
  p2 = p1 + kL/λL #Typo in paper
  p3 = p2 + kR/λR
  y = 0 #Need to define outside to obey Julia scoping rules
  #Step 1. Begin logic to generate one hypergeometric variate y.
  # Generate u~U(0,p3) for selecting the region,
  #          v~U(0,1) for the accept/reject decision.
  # If region 1 is selected, generate a uniform variate between xL and xR.
  while true #Keep sampling until we find an acceptable variate
    u, v = rand()*p3, rand()
    if u > p1
      #Step 2. Region 2, left exponential tail
      if u <= p2
        y = int(floor(xL + log(v)/λL))
        if y < max(0, k-n2) continue end
        v *= (u-p1)*λL
      else
        #Step 3. Region 3. right exponential tail
        y = int(floor(xR - log(v)/λR))
        if y > min(n1, k) continue end
        v *= (u-p2)*λR
      end
    else
      y = int(floor(xL + u))
    end

    #Step 4. Acceptance/rejection comparison
    #4.0 test for appropriate method of evaluating f(y)
    if !(do_approx && (M >=100 && y > 50)) #Use exact method
      #4.1 Evaluate f(y) recursively via
      #\[
      # f(y+1) = f(y) \frac{(n1-y)(k-y)}{(y+1)(n2-k+y+1}
      #\]
      #Start the search from the mode
      f = 1.0
      if M < y
        for i = M:y
          f *= (n1-i+1)*(k-i+1)/(i*(n2-k+i))
        end
      else
        if M > y
          for i=y:M
            f *= i*(n2-k+i)/((n1-i)*(k-i))
          end
        end
      end
      if v>f continue else break end
    else #Use approximate method
      #4.2 Squeezing. Check the value of ln(v) against upper and lower bound of
      #ln(f).
      x, y1, yM = y, y+1, y-M
      yn, yk, nk = n1-y+1, k-y+1, n2-k+y1
      R, S, T = -yM/y1, yM/yn, yM/yk
      E, G = -yM/nk, yn * yk / (y1 * nk) - 1
      DG = G>=0 ? 1 : 1 + G
      GU = G*(1 + G*(-0.5+G/3.0))
      GL = GU - G^4/(4DG)
      XM, Xn, Xk = M+0.5, n1-M+0.5, k-M+0.5
      nM = n2-k+XM
      Ub = XM*R*(1+R*(-0.5+R/3.0))
         + Xn*S*(1+S*(-0.5+S/3.0))
         + Xk*T*(1+T*(-0.5+T/3.0))
         + nM*E*(1+E*(-0.5+E/3.0)) + y*GU - M*GL + 0.0034
      Av = log(v)
      if Av > Ub continue end
      DR = XM * R^4
      if R < 0 DR /= 1+R end
      DS = Xn * S^4
      if S < 0 DS /= 1+S end
      DT = Xk * T^4
      if T < 0 DT /= 1+T end
      DE = nM * E^4
      if E < 0 DE /= 1+E end
      if Av < Ub-0.25*(DR + DS + DT + DE) + (y+M)*(GL-GU)-0.0078 break end

      #4.3 Final acceptance/rejection test
      if Av <= A-lfact(y)-lfact(n1-y)-lfact(k-y)-lfact(n2-k+y) break end
    end #one-sided approximations
  end #keep sampling until break condition

  #Step 5. Return appropriate random variate
  if 2k✽ < n #5.1
    return n1✽<=n2✽ ? y : k✽-y
  else #5.2
    return n1✽<=n2✽ ? n1✽-y : k✽-n2✽-y
  end
end

#The default is to do the approximation
randhg_h2pe(n1::Int, n2::Int, k::Int)=randhg_h2pe(n1, n2, k::Int, true)

#######################################################
# A very simple test driver to compare the algorithms #
#######################################################

NN = 100000 #Number of trials
for (n1, n2, k) in [(20, 20, 20), (100, 100, 20), (1000, 1000, 20), (10000,
    100, 1000), (100, 100000, 10000), (1000, 10000, 1000), (10000, 1000, 1000),
    (1000, 100000, 10000), (10000, 100, 10000)] #The parameters used in Table 1
  @printf("\nGenerating %d random hypergeometric variates with n1=%d, n2=%d, k=%d\n",
    NN, n1, n2, k)
  @printf("Algorithm   mean      \tstd. dev.\t runtime\n")
  x1 = @elapsed z1 = [randhg_hbu(n1, n2, k) for t=1:NN]
  @printf("HBU\t%10.5f\t%10.5f\t (%10.5f seconds)\n", mean(z1), std(z1), x1)
  x2 = @elapsed z2 = [randhg_hin(n1, n2, k) for t=1:NN]
  @printf("HIN\t%10.5f\t%10.5f\t (%10.5f seconds)\n", mean(z2), std(z2), x2)
  #x3 = @elapsed z3 = [randhg_halias(n1, n2, k) for t=1:NN]
  #@printf("HALIAS\t%10.5f\t%10.5f\t (%10.5f seconds)\n", mean(z3), std(z3), x3)
  x3 = @elapsed z3 = [randhg_h2pe(n1, n2, k) for t=1:NN]
  @printf("H2PE\t%10.5f\t%10.5f\t (%10.5f seconds)\n", mean(z3), std(z3), x3)
  #Exact variant of H2PE
  x4 = @elapsed z4 = [randhg_h2pe(n1, n2, k, false) for t=1:NN]
  @printf("H2PEx\t%10.5f\t%10.5f\t (%10.5f seconds)\n", mean(z4), std(z4), x4)
  #The rng in Distributions
  Z = HyperGeometric(n1, n2, k)
  x5 = @elapsed z5 = [rand(Z) for t=1:NN]
  @printf("Julia\t%10.5f\t%10.5f\t (%10.5f seconds)\n", mean(z5), std(z5), x5)
end
	# Hypergeometric random variate generators
	#
	# (c) 2013 Jiahao Chen <jiahao@mit.edu>
	#
	# This file implements several random variate generators as described in the
	# reference paper.
	#
	# Algorithm Function
	# HBU randhg_hbu
	# HIN randhg_hin
	# HALIAS randhg_alias
	# H2PE randhg_h2pe
	#
	# Reference:
	# "Computer generation of hypergeometric random variates",
	# V. Kachitvichyanukul and B. Schmeiser, J. Stat. Comput. Simul. 22 (2),
	# 1985, 127-145, doi:10.1080/00949658508810839

	using Distributions

	#################
	# Algorithm HBU #
	#################

	#Naive generation using sampling without replacement
	function randhg_hbu_core(n1::Int, n2::Int, k::Int)
	n = n1 + n2
	T, T1 = n, n1
	x = 0
	for J=0:min(k, n-k)
	u = rand()
	if u <= (T1/T)
	x += 1
	if x==n1 return x end
	T1 -= 1
	end
	T -= 1
	end
	x
	end

	#For large k, swap the arguments around using symmetry of the hypergeometric
	#distribution
	function randhg_hbu(n1::Int, n2::Int, k::Int)
	2k > (n1 + n2) ? n1 - randhg_hbu_core(n1, n2, n1+n2-k) : randhg_hbu_core(n1, n2, k)
	end

	#################
	# Algorithm HIN #
	#################

	#Inverse transformation
	function randhg_hin_core(n1::Int, n2::Int, k::Int)
	n = n1+n2
	if k<n2
	#p = factorial(n2)factorial(n-k)/(factorial(n)factorial(n2-k))
	p = exp(lfact(n2)+lfact(n-k)-lfact(n)-lfact(n2-k))
	x = 0
	else
	#p = factorial(n1)factorial(k)/(factorial(n)factorial(k-n2))
	p = exp(lfact(n1)+lfact(k)-lfact(n)-lfact(k-n2))
	x = k-n2
	end
	u = rand()
	while u > p #Typo in paper
	u -= p
	p = (n1-x)(k-x)/((x+1)*(n2-k+1+x))
	x += 1
	if x == n error("Inverse transformation did not converge") end
	end
	x
	end

	function randhg_hin(n1::Int, n2::Int, k::Int)
	if n1 <= n2
	if 2k <= (n1 + n2)
	return randhg_hin_core(n1, n2, k)
	else
	return n1-randhg_hin_core(n1, n2, n1+n2-k)
	end
	else
	if 2k <= (n1 + n2)
	return k-randhg_hin_core(n2, n1, k)
	else
	return k-n2+randhg_hin_core(n2, n1, n1+n2-k)
	end
	end
	end

	####################
	# Algorithm HALIAS #
	####################

	# The alias method
	#XXX This doesn't work
	function randhg_halias(n1::Int, n2::Int, k::Int)
	#Step 1
	IL = max(0, k-n2)
	IU = min(k, n2)
	IL, IU = min(IL, IU), max(IL, IU)
	m = IU-IL+1
	#Step 2. Calculate the hypergeometric probabilities for all mass points I_L...I_U
	probabilities = pmf(HyperGeometric(n1, n2, k), IL:IU) #This really should be lowercase G
	#Step 3. Setup the alias table A and table of its cutoff values f
	A, F, _ = arbran(probabilities)
	#To generate one hypergeometric random variate:
	#Step 4.
	u = rand() * m
	i = int(ceil(u))
	#Step 5.
	x = (u>F[i]) ? A[i] : i
	#Step 6.
	x+IL-1
	end

	# Compute the aliases and cutoff values for the desired probability distribution.
	# Input:
	# e: The desired probability values
	# n: number of variables
	# Output:
	# a: alias values
	# f: cutoff values
	# p: reconstructed probabilities
	# Citation:
	# "An Efficient Method for Generating Discrete Random Variables with General
	# Distributions"
	# A. J. Walker, ACM Trans. Math. Software 3 (3), 1977, 253-256
	# doi:10.1145/355744.355749
	function arbran{X<:Real}(e::Vector{X})
	n = length(e)
	error = .1e-5
	#Initialize output
	a = [1:n]
	f = zeros(n)
	b = e - 1/n
	#Find largest positive and negative differences and their positions in b
	c, k= minimum(b), indmin(b)
	d, l= maximum(b), indmax(b)
	#Test whether the sum of differences in b have become significant
	if sum(abs(b)) >= error
	#Assign alias and cutoff values
	a[k], f[k], b[k], b[l] = l, 1+c*n, 0, c+d
	end
	#Computation of alias and cutoff values complete
	#Now reconstruct the probabilities
	p = f/n
	for i=1:n, j=1:n
	if (a[j]==i) p[i] += (1-f[j])/n end
	end
	a, f, p
	end

	##################
	# Algorithm H2PE #
	##################

	# H2PE (Hypergeometric 2-Point Exponential) generator
	#
	# The new algorithm developed in Kachitvichyanukul and Schmeiser (1981).
	#
	# This takes an optional argument do_approx which controls whether or not to
	# use the one-sided approximations in calculating the rescaled hypergeometric
	# probability mass function, f. By default, do_approx=true, and is a faithful
	# implementation of the H2PE algorithm; do_approx=false uses the exact method
	# always.
	function randhg_h2pe(n1✽::Int, n2✽::Int, k✽::Int, do_approx::Bool)
	USE_HIN_THRESH = 10
	#Step 0. Set up constants as function of n1✽, n2✽, k✽
	#0.0. Determine parameter set for more efficient generation
	n = n1✽ + n2✽
	n1, n2 = n1✽>n2✽ ? (n2✽, n1✽) : (n1✽, n2✽)
	k = 2k✽<=n ? k✽ : n-k✽
	#0.1 Set up constants
	M = floor((k+1)*(n1+1)/(n+2))
	if (M-max(0,k-n2))<USE_HIN_THRESH return randhg_hin(n1✽, n2✽, k✽) end
	A = lfact(M) + lfact(n1-M) + lfact(k-M) + lfact(n2-k+M)
	D = 1.5sqrt((n-k)kn1n2/((n-1)nn)) + 0.5
	xL, xR = M-D+0.5, M+D+0.5
	kL = exp(A-lfact(xL)-lfact(n1-xL)-lfact(k-xL)-lfact(n2-k+xL))
	#The paper has a sign error in the last term of kR
	kR = exp(A-lfact(xR-1)-lfact(n1-xR+1)-lfact(k-xR+1)-lfact(n2-k+xR-1))
	λL = -log(xL(n2-k+xL)) + log((n1-xL+1)(k-xL+1))
	λR = -log((n1-xR+1)(k-xR+1)) + log(xR(n2-k+xR))
	p1 = 2D
	p2 = p1 + kL/λL #Typo in paper
	p3 = p2 + kR/λR
	y = 0 #Need to define outside to obey Julia scoping rules
	#Step 1. Begin logic to generate one hypergeometric variate y.
	# Generate u~U(0,p3) for selecting the region,
	# v~U(0,1) for the accept/reject decision.
	# If region 1 is selected, generate a uniform variate between xL and xR.
	while true #Keep sampling until we find an acceptable variate
	u, v = rand()*p3, rand()
	if u > p1
	#Step 2. Region 2, left exponential tail
	if u <= p2
	y = int(floor(xL + log(v)/λL))
	if y < max(0, k-n2) continue end
	v = (u-p1)λL
	else
	#Step 3. Region 3. right exponential tail
	y = int(floor(xR - log(v)/λR))
	if y > min(n1, k) continue end
	v = (u-p2)λR
	end
	else
	y = int(floor(xL + u))
	end

	#Step 4. Acceptance/rejection comparison
	#4.0 test for appropriate method of evaluating f(y)
	if !(do_approx && (M >=100 && y > 50)) #Use exact method
	#4.1 Evaluate f(y) recursively via
	#\[
	# f(y+1) = f(y) \frac{(n1-y)(k-y)}{(y+1)(n2-k+y+1}
	#\]
	#Start the search from the mode
	f = 1.0
	if M < y
	for i = M:y
	f = (n1-i+1)(k-i+1)/(i*(n2-k+i))
	end
	else
	if M > y
	for i=y:M
	f = i(n2-k+i)/((n1-i)*(k-i))
	end
	end
	end
	if v>f continue else break end
	else #Use approximate method
	#4.2 Squeezing. Check the value of ln(v) against upper and lower bound of
	#ln(f).
	x, y1, yM = y, y+1, y-M
	yn, yk, nk = n1-y+1, k-y+1, n2-k+y1
	R, S, T = -yM/y1, yM/yn, yM/yk
	E, G = -yM/nk, yn * yk / (y1 * nk) - 1
	DG = G>=0 ? 1 : 1 + G
	GU = G(1 + G(-0.5+G/3.0))
	GL = GU - G^4/(4DG)
	XM, Xn, Xk = M+0.5, n1-M+0.5, k-M+0.5
	nM = n2-k+XM
	Ub = XMR(1+R*(-0.5+R/3.0))
	+ XnS(1+S*(-0.5+S/3.0))
	+ XkT(1+T*(-0.5+T/3.0))
	+ nME(1+E(-0.5+E/3.0)) + yGU - M*GL + 0.0034
	Av = log(v)
	if Av > Ub continue end
	DR = XM * R^4
	if R < 0 DR /= 1+R end
	DS = Xn * S^4
	if S < 0 DS /= 1+S end
	DT = Xk * T^4
	if T < 0 DT /= 1+T end
	DE = nM * E^4
	if E < 0 DE /= 1+E end
	if Av < Ub-0.25(DR + DS + DT + DE) + (y+M)(GL-GU)-0.0078 break end

	#4.3 Final acceptance/rejection test
	if Av <= A-lfact(y)-lfact(n1-y)-lfact(k-y)-lfact(n2-k+y) break end
	end #one-sided approximations
	end #keep sampling until break condition

	#Step 5. Return appropriate random variate
	if 2k✽ < n #5.1
	return n1✽<=n2✽ ? y : k✽-y
	else #5.2
	return n1✽<=n2✽ ? n1✽-y : k✽-n2✽-y
	end
	end

	#The default is to do the approximation
	randhg_h2pe(n1::Int, n2::Int, k::Int)=randhg_h2pe(n1, n2, k::Int, true)

	#######################################################
	# A very simple test driver to compare the algorithms #
	#######################################################

	NN = 100000 #Number of trials
	for (n1, n2, k) in [(20, 20, 20), (100, 100, 20), (1000, 1000, 20), (10000,
	100, 1000), (100, 100000, 10000), (1000, 10000, 1000), (10000, 1000, 1000),
	(1000, 100000, 10000), (10000, 100, 10000)] #The parameters used in Table 1
	@printf("\nGenerating %d random hypergeometric variates with n1=%d, n2=%d, k=%d\n",
	NN, n1, n2, k)
	@printf("Algorithm mean \tstd. dev.\t runtime\n")
	x1 = @elapsed z1 = [randhg_hbu(n1, n2, k) for t=1:NN]
	@printf("HBU\t%10.5f\t%10.5f\t (%10.5f seconds)\n", mean(z1), std(z1), x1)
	x2 = @elapsed z2 = [randhg_hin(n1, n2, k) for t=1:NN]
	@printf("HIN\t%10.5f\t%10.5f\t (%10.5f seconds)\n", mean(z2), std(z2), x2)
	#x3 = @elapsed z3 = [randhg_halias(n1, n2, k) for t=1:NN]
	#@printf("HALIAS\t%10.5f\t%10.5f\t (%10.5f seconds)\n", mean(z3), std(z3), x3)
	x3 = @elapsed z3 = [randhg_h2pe(n1, n2, k) for t=1:NN]
	@printf("H2PE\t%10.5f\t%10.5f\t (%10.5f seconds)\n", mean(z3), std(z3), x3)
	#Exact variant of H2PE
	x4 = @elapsed z4 = [randhg_h2pe(n1, n2, k, false) for t=1:NN]
	@printf("H2PEx\t%10.5f\t%10.5f\t (%10.5f seconds)\n", mean(z4), std(z4), x4)
	#The rng in Distributions
	Z = HyperGeometric(n1, n2, k)
	x5 = @elapsed z5 = [rand(Z) for t=1:NN]
	@printf("Julia\t%10.5f\t%10.5f\t (%10.5f seconds)\n", mean(z5), std(z5), x5)
	end