public
Last active

Issue with multivariate Polya distribution estimation

  • Download Gist
polya_issue.py
Python
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38
import numpy as np
from scipy.special import gammaln
 
def log_multivariate_polya(X, alpha):
N = X.sum()
A = alpha.sum()
log_likelihood = gammaln(N+1) - gammaln(X+1).sum()
log_likelihood += gammaln(A) - gammaln(alpha).sum()
log_likelihood += gammaln(X + alpha).sum() - gammaln(N + A)
return log_likelihood
 
def logmean(loga):
return reduce(np.logaddexp, loga) - np.log(loga.size)
 
def log_multivariate_polya_montecarlo(X, alpha, iterations=1e5):
Theta = np.random.dirichlet(alpha, size=int(iterations))
logps = gammaln(X.sum() + 1) - gammaln(X + 1).sum()
logps += (X * np.log(Theta)).sum(1)
return logmean(logps)
 
if __name__ == '__main__':
 
np.random.seed(0)
 
X = np.array([0,50,50])
alpha = np.array([1,10,10])
print "X:", X
print "alpha:", alpha
print "analytic:", log_multivariate_polya(X, alpha)
print "montecarlo", log_multivariate_polya_montecarlo(X, alpha)
 
print
X = np.array([100,0,0])
alpha = np.array([1,10,10])
print "X:", X
print "alpha:", alpha
print "analytic:", log_multivariate_polya(X, alpha)
print "montecarlo", log_multivariate_polya_montecarlo(X, alpha)

This is the output if you run this code:


X: [ 0 50 50]
alpha: [ 1 10 10]
analytic: -5.22892710577
montecarlo -5.23470053651

X: [100 0 0]
alpha: [ 1 10 10]
analytic: -51.737395965
montecarlo -93.5266543113


Why there is such a big disagreement between the two implementations in the second case?

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.