Skip to content

Instantly share code, notes, and snippets.

@gpoulter
Created April 10, 2012 13:19
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save gpoulter/2351314 to your computer and use it in GitHub Desktop.
Save gpoulter/2351314 to your computer and use it in GitHub Desktop.
Multi-hypergeometric distribution
#!/usr/bin/python
"""Calculates the distribution of a weighted sum of the components of
a multivariate hypergeometric random variable, for the special case of
three components with weights -1, 0 and +1 - although the generating
function can handle any weights and number of components. Given a
value t, it also calculates the p value of t under the null hypothesis
that it was generated as the weighted sum of the multivariate
hypergeometric variable.
Usage:
python multihypergeo.py f_1 f_2 f_3 n t
Where:
f_1, f_2, f_3 = Population size for each of the three components,
across all of the data.
n = Make n observations without replacement, resulting in x_1, x_2
and x_3 observations of the three outcomes, having weights w_i of -1,
0 and +1.
t = The weighted sum of the n observations: t = -1*x_1 + 0*x_2 + 1*x_3,
whose p-value is to be calculated.
MULTIVARIATE HYPERGEOMETRIC DISTRIBUTION:
See http://www.math.uah.edu/stat/urn/MultiHypergeometric.xhtml
If an urn has s types of balls in it, with populations f_1,...,f_s,
and n balls are drawn from the urn without replacement, then random
variable for the number of each type of ball drawn, X(f,n) = (X_1,
X_2, ... X_s), is multivariate hypergeometrically distributed.
SPECIFIC CASE:
In this case, the urn has s=3 different types of balls in it, with
weights w_i of -1, 0, +1. There are f_1, f_2 and f_3 of each type
present in the urn. There are m=f_1+f_2+f_3 balls in total. n balls
are drawn from the urn without replacement, and the result is
multivariate hypergeometrically distributed X(f,n) =(X_1,X_2,X_2). The
weighted sum of the draws is the random variable Y = -1*X_1 +
0*X_2 + 1*X_3.
The distribution of Y is calculated by enumerating P(X=x) over all
possible outcomes x=(x_1,x_2,x_3). For each outcome we calculate
y=-1*x_1+0*x_2+1*x_3, and contribute P(X=x) to the total for P(Y=y).
The p value of some value t under the null hypothesis that it was
generated by the random variable Y is the smaller of P(Y >= t) and
P(Y <= t) - evaluated by summing over the discrete probabilities in
the calculated distribution of Y.
"""
from __future__ import division
from collections import defaultdict
import sys
def factorial(N):
"""Factorial of N."""
if N == 0: return 1.0
r = 1.0
# Perform the multiplications 1 * 2 * 3 * 4 * ... * N
for i in range(2, N+1):
r *= i
return r
def C(n,r):
"""Number of ways to choose r out of n items."""
nCr = 1
# Because C(n,r) == C(n,n-r), and loop is over r, so we want r to be small
if (r*2) > n:
r = n-r
for i in range(r-1, -1, -1): # r-1 ... 0
nCr = (nCr*(n-i)) // (r-i)
return nCr
def weighted_sum_of_multivariate_hypergeometric(w, f, n):
"""Construct the the sum-of-multivariate-hypergeometrics distribution
@param w: Weights w=(w_1,w_2,...w_s) for observations in each
component of the random variable X=(X_1,X_2,...,X_s). The
weighted sum random variable Y is the dot product Y = w*X.
@param f: Population size of the components (f_1,f_2,...,f_s). The
total population size is m=f_1+f_2+...+f_s
@param n: Make n observations without replacement.
@return: (values,mass) corresponding to the values and probability masses
of the discrete distribution Y=d*X, being the weighted sum of X.
"""
s = len(w)
assert len(f) == s
# m is the size of the population (f_1+f_2+...+f_s)
m = sum(f)
# Y[y] is P[Y=y], the probability of observing Y=y
Y = defaultdict(float)
# maxfree[i] is the maximum number of balls that can
# be drawn in total from components >= i
maxfree = list(f)
for i in range(s-2, -1, -1): # s-2 ... 0
maxfree[i] += maxfree[i+1]
maxfree += [0]
def enumerate_probabilities(i, free, y, g):
"""Recursively enumerate all of the possible outcomes. Add each outcomes
probability to its weighted sum.
@param i: Component of population to enumerate.
@param free: Observations (out of original n) remaining to be made.
@param y: Weighted sum of the observations.
@param g: Probability of the observed outcome.
"""
if free == 0:
# Add the probability of this outcome to total probability of
# observing the weighted sum y
Y[y] += g
else:
# Constraints should ensure that we use up all the observations in time.
assert i < s
# k = number of balls drawn from this component
# free-maxfree[i+1] = minimum number of balls that could be drawn from this component.
# min(f[i],free) = maximum number of balls that could be drawn from this component.
# w[i]*k = weighted contribution of balls drawn from this component
# C(f[i],k) = number of ways to choose k balls from f[i] in the component.
for k in range(max(0, free-maxfree[i+1]), min(f[i], free)+1):
enumerate_probabilities(i+1, free-k, y+(w[i]*k), g*C(f[i],k))
enumerate_probabilities(i=0, free=n, y=0, g=1)
values = sorted(Y.keys())
# Finish the probability using the shared C(m,n) denominator
# http://www.math.uah.edu/stat/urn/MultiHypergeometric.xhtml
mass = [Y[k]/C(m,n) for k in values]
return (values, mass)
def discrete_pvalue(values, mass, t):
"""Calculate the p-value of a test value t in a discrete
distribution. That is, the sum of the probability mass that
corresponds to values at least as extreme as t.
@param values: Values of the discrete distribution.
@param mass: Distribution mass associated with each value.
@param t: Value to test for significance.
"""
print "The discrete distribution is: "
for (v,d) in zip(values,mass):
print v, d
print "Sum of probability mass (should be 1.0): %f" % sum(mass)
print ("Mean (expected value): %f" %
sum(v*d for (v,d) in zip(values,mass)))
print "Calculating the p-value of %s" % str(t)
try:
i = values.index(t)
pval_left = sum(mass[j] for j in range(0, i+1))
pval_right = sum(mass[j] for j in range(i, len(values)))
pval = min(pval_right, pval_left)
print "1-tailed p-value: ", pval
except ValueError, e:
print "Test value %d is not one of the values in the distribution." % t
def plot_density(values, mass):
"""For a discrete distribtion, plut probability mass on Y axis against
values of the distribution on the X axis."""
from matplotlib import pylab
pylab.plot(values, mass)
print "Press any key to exit the program."
raw_input()
def main(args):
"""Calculate p-values under the null hypothesis of a weighted sum
of a multivariate hypergeometric random variable.
"""
try:
args = [int(v) for v in args[1:]]
f = args[0:3]
n = args[3]
t = args[4]
except Exception:
print __doc__
# Weights for occurences in each component
w = [-1, 0, 1]
(values,mass) = weighted_sum_of_multivariate_hypergeometric(w, f, n)
discrete_pvalue(values, mass, t)
#plot_density(values, mass)
if __name__ == "__main__":
main(sys.argv)
# Let X be multivariate hypergeometrically distributed with s
# categories having assigned values x_1,...x_s having f_1,...f_s
# instances in the urn. m is the sum of all the f_i.
# Let Y be the sum of of n observations from X. Y is discrete random
# variable of s' values y_1,...y_s' with frequency g_1,...g_s'.
# Y will range from n*x_0 to n*x_s, and s' is given by
# | Outer Sum from 1..n of the set { x_0,...,x_s } |
from __future__ import division
import sys
# Factorial of N
def factorial( N ):
if N == 0:
return 1.0
else:
r = 1.0
for i in range( 2, N+1 ):
r *= i
return r
# Number of combinations of r items taken from n
def combinations( n, r ):
nCr = 1
if (r*2) > n:
r = n-r
for i in range( r-1, -1, -1 ):
nCr = ( nCr * ( n - i ) ) // ( r - i )
return nCr
# Enumerate probability mass pieces from the sum-of-multivariate-hypergeometrics distribution
# Y, g, s, m, n, f, x are described at the top
# i is the number of the category from X that we are processing
# prob[Y] is the probability of observing sum Y
# free is the number of observations (out of n) that remain to be used
# maxfree[i] is the maximum allowed value of free at point i
# k is the number of observations to use from category i
def enumerate_mhyper_sum( i, prob, free, maxfree, Y, g, s, m, n, f, x ):
if free == 0:
if Y not in prob: prob[Y] = 0
prob[Y] += g
else:
for k in range( max( 0, free-maxfree[i+1] ), min( f[i], free ) + 1 ):
enumerate_mhyper_sum( i+1, prob, free-k, maxfree, Y+(x[i]*k), g*combinations(f[i],k), s, m, n, f, x )
# Construct the sum-of-multivariate-hypergeometrics distribution
# Return a discrete distribution such that values[i] has a frequency
# of mass[i]
def construct_mhyper_sum( x, f, n ):
s = len(x)
m = sum(f)
prob = dict()
maxfree = list( f )
for i in range( s-2, -1, -1 ):
maxfree[i] += maxfree[i+1]
maxfree += [ 0 ]
enumerate_mhyper_sum( 0, prob, n, maxfree, 0, 1, s, m, n, f, x )
values = []
mass = []
for k in sorted( prob.keys() ):
values.append( k )
mass.append( prob[k] / combinations(m,n) )
return ( values, mass )
n = int(sys.argv[4])
x = [ -1, 0, 1 ]
f = [ int(sys.argv[1]),int(sys.argv[2]) , int(sys.argv[3]) ]
t = int(sys.argv[5])
#print f
(values,mass) = construct_mhyper_sum( x, f, n )
#for (v,d) in zip(values,mass):
# print v, d
#print "Probability Mass Sums to: %f" % sum(mass)
#print "Population Mean (expected sum of %d observations) is: %f" % (n, sum( v*d for (v,d) in zip(values,mass) ) )
pval = 0
#print t #debug
for num in range(len(values)):
if t == values[num]:
if t >= 0:
for calc in range( num, len(values) ):
# print mass[calc] #debug
pval = pval + mass[calc]
else:
for calc in range( 0, num+1 ):
pval = pval + mass[calc]
#print mass[calc] #debug
print "1-tailed: ", pval
print "2-tailed: ", pval*2
#from matplotlib import pylab as p
#p.plot( vals, density )
#input()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment