Last active
June 6, 2022 12:45
-
-
Save jseabold/6617976 to your computer and use it in GitHub Desktop.
Zero-Inflated Poisson Estimation
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
from scipy import special, stats | |
from statsmodels.base.model import GenericLikelihoodModel | |
def vuong_test(p1, p2): | |
r""" | |
Vuong-test for non-nested models. | |
Parameters | |
---------- | |
p1 : array-like | |
f1(Y=y_i | x_i) | |
p2 : array-like | |
f2(Y=y_i | x_i) | |
Notes | |
----- | |
This is hard-coded for testing Poisson vs. Zero-inflated. E.g., | |
it does not account for | |
Let f_j(y_i|x_i) denote the predicted probability that random variable Y | |
equals y_i under the assumption that the distribution is f_j(y_i|x_i) for | |
j = 1,2. Let | |
.. math:: | |
m_i = log(\frac{f_1(y_i|x_i)}{f_2(y_i|x_i)}) | |
The test statistic from Vuong to test the hypothesis of Model 1 vs. | |
Model 2 is | |
.. math:: | |
v = \frac{\sqrt{n}(1/n \sum_{i=1}^{n}m_i)}{\sqrt{1/n \sum_{i=1}^{n}(m_i - \bar{m})^2}} | |
This statistic has a limiting standard normal distribution. Values of | |
v greater than ~2, indicate that model 1 is preferred. Values of V | |
less than ~-2 indicate the model 2 is preferred. Values of |V| < ~2 are | |
inconclusive. | |
References | |
---------- | |
Greene, W. Econometric Analysis. | |
Vuong, Q.H. 1989 "Likelihood ratio tests for model selection and | |
non-nested hypotheses." Econometrica. 57: 307-333. | |
""" | |
m = np.log(p1) - np.log(p2) | |
n = len(m) | |
v = n ** .5 * m.mean() / m.std() | |
return v, stats.norm.sf(np.abs(v)) | |
def poisson_pmf(x, mu): | |
return mu ** x / special.gamma(x + 1) * np.exp(-mu) | |
def trunc_poisson_pmf(x, mu): | |
# 10x faster than poisson.pmf(x, mu)/(1 - poisson.cdf(0, mu)) | |
return mu ** x * np.exp(-mu) / (special.gamma(x + 1) * (1 - np.exp(-mu))) | |
def trunc_poisson_logpmf(x, mu): | |
# 10x faster than poisson.logpmf(x, mu) - np.log1(-poisson.cdf(0, mu)) | |
if not np.all(x > 0): | |
raise ValueError("x must be positive counts") | |
if not mu > 0: | |
raise ValueError("mu must be a postive real") | |
x = np.asarray(x) | |
pdf = x * np.log(mu) - mu - special.gammaln(x + 1) - np.log1p(-np.exp(-mu)) | |
return pdf | |
class ZIPoisson(GenericLikelihoodModel): | |
def __init__(self, y, X, inflateX=None, missing='none'): | |
#TODO: inflateX needs to go through the pandas data rigamarole | |
if inflateX is None: | |
inflateX = np.ones((len(y), 1)) | |
else: | |
inflateX = np.asarray(inflateX) | |
super(ZIPoisson, self).__init__(y, X, inflateX=inflateX, | |
missing=missing) | |
self.link = None # chosen during fit | |
self.k_exog = X.shape[1] | |
self.k_inflate = inflateX.shape[1] | |
def loglikeobs(self, params): | |
y = self.endog | |
zero_idx = np.nonzero(y == 0)[0] | |
nonzero_idx = np.nonzero(y)[0] | |
X = self.exog[nonzero_idx] | |
inflateX = self.inflateX | |
# separate params into zero and non-zero | |
params_X = params[:self.k_exog] | |
params_inflate = params[self.k_exog:] | |
# linear predictions | |
XB_nonzero = np.dot(X, params_X) # + offset_nonzero | |
inflateXB = np.dot(inflateX, params_inflate) # + offset_zero | |
# probability of zero event | |
link = self.link or stats.logistic.cdf # in case not set in fit yet | |
p = link(inflateXB) | |
# mu is lambda in the literature, mu in stats.poisson | |
# log(lambda_i) = XB_i | |
mu = np.exp(np.dot(self.exog, params_X)) | |
# cf. Lambert (1992) Zero-Inflated Poisson Regression... | |
p_zero = p[zero_idx] | |
p_nonzero = p[nonzero_idx] | |
y_nonzero = y[nonzero_idx] | |
llf_zero = np.log(p_zero + (1 - p_zero)*np.exp(-mu[zero_idx])) | |
llf_nonzero = (np.log(1 - p_nonzero) - mu[nonzero_idx] + | |
XB_nonzero*y_nonzero - | |
special.gammaln(y_nonzero + 1)) | |
# not the most memory-friendly way to do this but certainly the | |
# laziest | |
llf_obs = np.zeros(len(y)) | |
llf_obs[zero_idx] = llf_zero | |
llf_obs[nonzero_idx] = llf_nonzero | |
return llf_obs | |
def loglike(self, params): | |
""" | |
Parameters | |
---------- | |
params : ndarray | |
Params is expected to contain the parameters for both the | |
exogenous variables and the parameters for modeling the excess | |
zeros in that order. | |
Notes | |
----- | |
The dependent variable Y is assumed to be distributed such that | |
Y_i ~ 0 with probability p_i and Y_i ~ Poisson(:math:`\mu_i`) with | |
probability 1 - p_i so that Y_i = 0 with probability | |
p_i + (1 - p_i)*exp(-\mu_i) and Y_i = k with probability | |
(1 - p_i)*exp(:math:`-\mu_i`)*:math:`mu_i`**k/k! for k = 1, 2, ... | |
where log(:math:`mu`) = dot(X, model_params) | |
and | |
logit(p) = log(p/(1-p)) = dot(inflateX, inflate_params) | |
or | |
probit(p) = norm.ppf(p) = dot(inflateX, inplate_params) | |
depending on the link function specified in the fit method. | |
References | |
----------:: | |
Lambert, D. (1992) "Zero-Inflated Poisson Regression, With an | |
Application to Defects in Manufacturing." Technometrics, | |
34, 1. | |
""" | |
# setup | |
y = self.endog | |
zero_idx = np.nonzero(y == 0)[0] | |
nonzero_idx = np.nonzero(y)[0] | |
X = self.exog[nonzero_idx] | |
inflateX = self.inflateX | |
# separate params into zero and non-zero | |
params_X = params[:self.k_exog] | |
params_inflate = params[self.k_exog:] | |
# linear predictions | |
XB_nonzero = np.dot(X, params_X) # + offset_nonzero | |
inflateXB = np.dot(inflateX, params_inflate) # + offset_zero | |
# probability of zero event | |
link = self.link or stats.logistic.cdf # in case not set in fit yet | |
p = link(inflateXB) | |
# mu is lambda in the literature, mu in stats.poisson | |
# log(lambda_i) = XB_i | |
mu = np.exp(np.dot(self.exog, params_X)) | |
# cf. Lambert (1992) Zero-Inflated Poisson Regression... | |
p_zero = p[zero_idx] | |
p_nonzero = p[nonzero_idx] | |
y_nonzero = y[nonzero_idx] | |
llf_zero = sum(np.log(p_zero + (1 - p_zero)*np.exp(-mu[zero_idx]))) | |
llf_nonzero = sum(np.log(1 - p_nonzero) - mu[nonzero_idx] + | |
XB_nonzero*y_nonzero - | |
special.gammaln(y_nonzero + 1)) | |
return llf_zero + llf_nonzero | |
def fit(self, start_params=None, link="logit", method='bfgs', maxiter=100, | |
full_output=True, disp=1, fargs=(), callback=None, | |
retall=False, **kwargs): | |
""" | |
Parameters | |
---------- | |
link : str {"logit", "probit"} | |
The link function for modeling the excess zeros. | |
""" | |
link = link.lower() | |
if link == "logit": | |
link = stats.logistic.cdf | |
elif link == "probit": | |
link = stats.norm.cdf | |
else: #pragma: no cover | |
raise ValueError("link %s not understood") | |
self.link = link | |
if start_params is None: | |
start_params = np.zeros(self.exog.shape[1] + | |
self.inflateX.shape[1]) + .1 | |
zip_fit = super(ZIPoisson, self).fit(start_params=start_params, | |
method=method, maxiter=maxiter, full_output=full_output, | |
disp=disp, callback=callback, **kwargs) | |
#TODO: make a results class and wrapper | |
# zip_fit = ZIPResults(self, zip_fit) | |
# return ZIPResultsWrapper(zip_fit) | |
zip_fit.model_params = zip_fit.params[:self.exog.shape[1]] | |
zip_fit.inflate_params = zip_fit.params[self.exog.shape[1]:] | |
return zip_fit | |
def test_params(): | |
from statsmodels.tools.tools import webuse | |
dta = webuse("fish") | |
y = dta["count"] | |
dta["const"] = 1. | |
X = dta[["const", "persons", "livebait"]] | |
inflateX = dta[["const", "child", "camper"]] | |
mod = ZIPoisson(y, X, inflateX) | |
res = mod.fit() | |
# from stata | |
params = np.array([-2.1784716320442, .80688527006274, 1.7572893795292, | |
-.49228715725108, 1.6025705051289, -1.0156983048851]) | |
np.testing.assert_almost_equal(res.params, params, 4) | |
np.testing.assert_almost_equal(res.llf, -850.7014181090408, 4) | |
if __name__ == "__main__": | |
import statsmodels.api as sm | |
from statsmodels.tools.tools import webuse | |
dta = webuse("fish") | |
y = dta["count"] | |
dta["const"] = 1. | |
X = dta[["const", "persons", "livebait"]] | |
inflateX = dta[["const", "child", "camper"]] | |
mod = ZIPoisson(y, X, inflateX) | |
res = mod.fit() | |
# from stata | |
params = np.array([-2.1784716320442, .80688527006274, 1.7572893795292, | |
-.49228715725108, 1.6025705051289, -1.0156983048851]) | |
p1 = np.exp(res.model.loglikeobs(res.params)) | |
res2 = sm.Poisson(y, X).fit(disp=0) | |
p2 = np.exp(stats.poisson.logpmf(res2.model.endog, | |
res2.predict(linear=False))) | |
vuong_test(p1, p2) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment