@jseabold
Last active June 6, 2022 12:45
Zero-Inflated Poisson Estimation
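
"""
Zero-inflated Poisson (ZIP) regression implemented on top of statsmodels'
GenericLikelihoodModel, plus a Vuong test for comparing the ZIP fit against
a plain Poisson fit.
"""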
import numpy as np
from scipy import special, stats
from statsmodels.base.model import GenericLikelihoodModel


def vuong_test(p1, p2):
    r"""
    Vuong test for non-nested models.

    Parameters
    ----------
    p1 : array-like
        f1(Y=y_i | x_i)
    p2 : array-like
        f2(Y=y_i | x_i)

    Returns
    -------
    v : float
        The Vuong test statistic.
    pval : float
        The probability that a standard normal variate exceeds |v|.

    Notes
    -----
    This is hard-coded for testing Poisson vs. zero-inflated Poisson and is
    not a general implementation.

    Let f_j(y_i|x_i) denote the predicted probability that random variable Y
    equals y_i under the assumption that the distribution is f_j(y_i|x_i) for
    j = 1, 2. Let

    .. math::

       m_i = \log\left(\frac{f_1(y_i|x_i)}{f_2(y_i|x_i)}\right)

    The test statistic from Vuong for testing the hypothesis of Model 1 vs.
    Model 2 is

    .. math::

       v = \frac{\sqrt{n}(1/n \sum_{i=1}^{n}m_i)}{\sqrt{1/n \sum_{i=1}^{n}(m_i - \bar{m})^2}}

    This statistic has a limiting standard normal distribution. Values of
    v greater than ~2 indicate that model 1 is preferred. Values of v
    less than ~-2 indicate that model 2 is preferred. Values of |v| < ~2
    are inconclusive.

    References
    ----------
    Greene, W. Econometric Analysis.
    Vuong, Q.H. 1989. "Likelihood ratio tests for model selection and
        non-nested hypotheses." Econometrica 57: 307-333.
    """
    m = np.log(p1) - np.log(p2)
    n = len(m)
    v = n ** .5 * m.mean() / m.std()
    return v, stats.norm.sf(np.abs(v))
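

# Sketch (not part of the original gist): how the Vuong test is typically
# wired up for this model. `zip_res` is a fitted ZIPoisson result and
# `poisson_res` a fitted statsmodels Poisson result on the same data; both
# arguments to `vuong_test` are per-observation predicted probabilities
# evaluated at the observed counts. The helper name is hypothetical.
def _vuong_zip_vs_poisson(zip_res, poisson_res):
    p_zip = np.exp(zip_res.model.loglikeobs(zip_res.params))
    p_poisson = np.exp(stats.poisson.logpmf(poisson_res.model.endog,
                                            poisson_res.predict()))
    return vuong_test(p_zip, p_poisson)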


def poisson_pmf(x, mu):
    return mu ** x / special.gamma(x + 1) * np.exp(-mu)


def trunc_poisson_pmf(x, mu):
    # zero-truncated Poisson pmf
    # 10x faster than poisson.pmf(x, mu) / (1 - poisson.cdf(0, mu))
    return mu ** x * np.exp(-mu) / (special.gamma(x + 1) * (1 - np.exp(-mu)))


def trunc_poisson_logpmf(x, mu):
    # zero-truncated Poisson log-pmf
    # 10x faster than poisson.logpmf(x, mu) - np.log1p(-poisson.cdf(0, mu))
    if not np.all(x > 0):
        raise ValueError("x must be positive counts")
    if not mu > 0:
        raise ValueError("mu must be a positive real")
    x = np.asarray(x)
    pdf = x * np.log(mu) - mu - special.gammaln(x + 1) - np.log1p(-np.exp(-mu))
    return pdf
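

# Quick sanity check (illustrative; the values of `mu` and `upper` below are
# arbitrary): the zero-truncated pmf should sum to ~1 over the positive
# integers, and the log-pmf should agree with log(pmf).
def _check_trunc_poisson(mu=2.5, upper=100):
    k = np.arange(1, upper + 1)
    np.testing.assert_allclose(trunc_poisson_pmf(k, mu).sum(), 1.0)
    np.testing.assert_allclose(trunc_poisson_logpmf(k, mu),
                               np.log(trunc_poisson_pmf(k, mu)))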


class ZIPoisson(GenericLikelihoodModel):
    def __init__(self, y, X, inflateX=None, missing='none'):
        #TODO: inflateX needs to go through the pandas data rigamarole
        if inflateX is None:
            inflateX = np.ones((len(y), 1))
        else:
            inflateX = np.asarray(inflateX)
        super(ZIPoisson, self).__init__(y, X, inflateX=inflateX,
                                        missing=missing)
        self.link = None  # chosen during fit
        self.k_exog = X.shape[1]
        self.k_inflate = inflateX.shape[1]

    def loglikeobs(self, params):
        """
        Log-likelihood contribution of each observation; see `loglike`.
        """
        y = self.endog
        zero_idx = np.nonzero(y == 0)[0]
        nonzero_idx = np.nonzero(y)[0]
        X = self.exog[nonzero_idx]
        inflateX = self.inflateX

        # separate params into count (Poisson) and inflation parts
        params_X = params[:self.k_exog]
        params_inflate = params[self.k_exog:]

        # linear predictions
        XB_nonzero = np.dot(X, params_X)  # + offset_nonzero
        inflateXB = np.dot(inflateX, params_inflate)  # + offset_zero

        # probability of a zero-inflation event
        link = self.link or stats.logistic.cdf  # in case not set in fit yet
        p = link(inflateXB)

        # mu is lambda in the literature, mu in stats.poisson
        # log(lambda_i) = XB_i
        mu = np.exp(np.dot(self.exog, params_X))

        # cf. Lambert (1992) Zero-Inflated Poisson Regression...
        p_zero = p[zero_idx]
        p_nonzero = p[nonzero_idx]
        y_nonzero = y[nonzero_idx]
        llf_zero = np.log(p_zero + (1 - p_zero) * np.exp(-mu[zero_idx]))
        llf_nonzero = (np.log(1 - p_nonzero) - mu[nonzero_idx] +
                       XB_nonzero * y_nonzero -
                       special.gammaln(y_nonzero + 1))

        # not the most memory-friendly way to do this, but certainly the
        # laziest
        llf_obs = np.zeros(len(y))
        llf_obs[zero_idx] = llf_zero
        llf_obs[nonzero_idx] = llf_nonzero
        return llf_obs

    def loglike(self, params):
        r"""
        Parameters
        ----------
        params : ndarray
            The parameters for the exogenous variables followed by the
            parameters for modeling the excess zeros, in that order.

        Notes
        -----
        The dependent variable Y is assumed to be distributed such that
        Y_i ~ 0 with probability p_i and Y_i ~ Poisson(:math:`\mu_i`) with
        probability 1 - p_i, so that Y_i = 0 with probability
        p_i + (1 - p_i)*exp(:math:`-\mu_i`) and Y_i = k with probability
        (1 - p_i)*exp(:math:`-\mu_i`)*:math:`\mu_i`**k/k! for k = 1, 2, ...
        where

        log(:math:`\mu`) = dot(X, model_params)

        and

        logit(p) = log(p/(1-p)) = dot(inflateX, inflate_params)

        or

        probit(p) = norm.ppf(p) = dot(inflateX, inflate_params)

        depending on the link function specified in the fit method.

        References
        ----------
        Lambert, D. (1992) "Zero-Inflated Poisson Regression, With an
            Application to Defects in Manufacturing." Technometrics, 34, 1.
        """
        # setup
        y = self.endog
        zero_idx = np.nonzero(y == 0)[0]
        nonzero_idx = np.nonzero(y)[0]
        X = self.exog[nonzero_idx]
        inflateX = self.inflateX

        # separate params into count (Poisson) and inflation parts
        params_X = params[:self.k_exog]
        params_inflate = params[self.k_exog:]

        # linear predictions
        XB_nonzero = np.dot(X, params_X)  # + offset_nonzero
        inflateXB = np.dot(inflateX, params_inflate)  # + offset_zero

        # probability of a zero-inflation event
        link = self.link or stats.logistic.cdf  # in case not set in fit yet
        p = link(inflateXB)

        # mu is lambda in the literature, mu in stats.poisson
        # log(lambda_i) = XB_i
        mu = np.exp(np.dot(self.exog, params_X))

        # cf. Lambert (1992) Zero-Inflated Poisson Regression...
        p_zero = p[zero_idx]
        p_nonzero = p[nonzero_idx]
        y_nonzero = y[nonzero_idx]
        llf_zero = np.sum(np.log(p_zero + (1 - p_zero) * np.exp(-mu[zero_idx])))
        llf_nonzero = np.sum(np.log(1 - p_nonzero) - mu[nonzero_idx] +
                             XB_nonzero * y_nonzero -
                             special.gammaln(y_nonzero + 1))
        return llf_zero + llf_nonzero
def fit(self, start_params=None, link="logit", method='bfgs', maxiter=100,
full_output=True, disp=1, fargs=(), callback=None,
retall=False, **kwargs):
"""
Parameters
----------
link : str {"logit", "probit"}
The link function for modeling the excess zeros.
"""
link = link.lower()
if link == "logit":
link = stats.logistic.cdf
elif link == "probit":
link = stats.norm.cdf
else: #pragma: no cover
raise ValueError("link %s not understood")
self.link = link
if start_params is None:
start_params = np.zeros(self.exog.shape[1] +
self.inflateX.shape[1]) + .1
zip_fit = super(ZIPoisson, self).fit(start_params=start_params,
method=method, maxiter=maxiter, full_output=full_output,
disp=disp, callback=callback, **kwargs)
#TODO: make a results class and wrapper
# zip_fit = ZIPResults(self, zip_fit)
# return ZIPResultsWrapper(zip_fit)
zip_fit.model_params = zip_fit.params[:self.exog.shape[1]]
zip_fit.inflate_params = zip_fit.params[self.exog.shape[1]:]
return zip_fit
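

# Simulation sketch (assumed data-generating process, not part of the original
# gist): draw zero-inflated Poisson data with a logit inflation model and check
# that ZIPoisson roughly recovers the true coefficients. All numbers below are
# arbitrary choices for illustration.
def _simulate_zip_example(nobs=5000, seed=12345):
    rng = np.random.RandomState(seed)
    X = np.column_stack([np.ones(nobs), rng.uniform(-1, 1, size=nobs)])
    inflateX = np.column_stack([np.ones(nobs), rng.uniform(-1, 1, size=nobs)])
    beta = np.array([1.0, 0.5])     # count-model (Poisson) coefficients
    gamma = np.array([-1.0, 0.75])  # inflation (logit) coefficients
    mu = np.exp(np.dot(X, beta))
    p = stats.logistic.cdf(np.dot(inflateX, gamma))
    y = np.where(rng.uniform(size=nobs) < p, 0, rng.poisson(mu))
    res = ZIPoisson(y, X, inflateX).fit(disp=0)
    return res.model_params, res.inflate_params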


def test_params():
    from statsmodels.tools.tools import webuse
    dta = webuse("fish")
    y = dta["count"]
    dta["const"] = 1.
    X = dta[["const", "persons", "livebait"]]
    inflateX = dta[["const", "child", "camper"]]
    mod = ZIPoisson(y, X, inflateX)
    res = mod.fit()
    # reference values from Stata
    params = np.array([-2.1784716320442, .80688527006274, 1.7572893795292,
                       -.49228715725108, 1.6025705051289, -1.0156983048851])
    np.testing.assert_almost_equal(res.params, params, 4)
    np.testing.assert_almost_equal(res.llf, -850.7014181090408, 4)
if __name__ == "__main__":
import statsmodels.api as sm
from statsmodels.tools.tools import webuse
dta = webuse("fish")
y = dta["count"]
dta["const"] = 1.
X = dta[["const", "persons", "livebait"]]
inflateX = dta[["const", "child", "camper"]]
mod = ZIPoisson(y, X, inflateX)
res = mod.fit()
# from stata
params = np.array([-2.1784716320442, .80688527006274, 1.7572893795292,
-.49228715725108, 1.6025705051289, -1.0156983048851])
p1 = np.exp(res.model.loglikeobs(res.params))
res2 = sm.Poisson(y, X).fit(disp=0)
p2 = np.exp(stats.poisson.logpmf(res2.model.endog,
res2.predict(linear=False)))
vuong_test(p1, p2)