Skip to content

Instantly share code, notes, and snippets.

@josef-pkt
Created December 9, 2013 02:44
Show Gist options
  • Save josef-pkt/7866696 to your computer and use it in GitHub Desktop.
cluster robust standard errors for Negative Binomial
# -*- coding: utf-8 -*-
"""
Created on Sun Dec 08 19:14:58 2013
Author: Josef Perktold, based on example by John Beieler
Not verified!
"""
import numpy as np
import pandas as pd
import statsmodels.api as sm
from patsy import dmatrices
from statsmodels.stats.sandwich_covariance import cov_cluster
import statsmodels.stats.sandwich_covariance as sc
# Load the dyad-level terrorism data and drop rows with missing values.
dat_raw = pd.read_csv('negbin_full_data.csv')
dat = dat_raw.dropna()
#dat = dat_raw
# Build response vector y and design matrix X (with intercept) from the
# patsy formula; returned as DataFrames so column labels survive.
y, X = dmatrices('terrorCounts~rivalry+jointDem1+logcapratio+contiguity',
data=dat, return_type='dataframe')
# Fit a plain Poisson model first as a baseline / sanity check.
res_poi = sm.Poisson(y, X, missing='drop').fit()

# Use previously estimated params as start values to run faster during
# experimentation; the last entry is the NB dispersion parameter (alpha).
start_params = np.array([-5.35096895, 2.34653792, 1.32354659,
                         -0.15224597, 1.01714021, 64.78548562])

mod = sm.NegativeBinomial(y, X, missing='drop')
res0 = mod.fit(start_params=start_params, maxiter=60, method='bfgs')
#start_params = np.array(res0.params)

# see if newton works
res = mod.fit(start_params=start_params, maxiter=60, method='newton')

# Check eigenvalues of cov_params: huge / negative eigenvalues indicate a
# singular or indefinite Hessian (the 'bfgs' fit ends up singular here,
# see the saved results at the bottom of the file).
# NOTE: prints parenthesized so the script is valid Python 2 and 3
# (all prints take a single argument, so behavior is unchanged).
print(np.linalg.eigvals(res0.cov_params()))
print(np.linalg.eigvals(res.cov_params()))

print("\nbse non-robust 'newton'")
print(np.asarray(res.bse))
bses = pd.DataFrame(res.bse, columns=['non-robust'])

print("\nbse cluster robust 'newton'")
# this is not working yet
#clustered = cov_cluster(res, dat['dyadid'])

# NegativeBinomial has jac as scoreobs based on approx_fprime_cs
jac = res.model.scoreobs(np.array(res.params))
# fake a results instance that has the required attributes
class Holder(object):
    """Minimal attribute bag: every keyword argument given to the
    constructor becomes an instance attribute of the same name."""
    def __init__(self, **kwds):
        for attr_name, attr_value in kwds.items():
            setattr(self, attr_name, attr_value)
# Fake a results instance carrying the attributes cov_cluster reads:
# a model whose `exog` is the per-observation score, unit residuals
# (so score * resid == score), and the normalized parameter covariance.
model_fake = Holder(exog=jac)
res_fake = Holder(model=model_fake, resid=np.ones(jac.shape[0]),
                  normalized_cov_params=res.normalized_cov_params)

# BUGFIX: use np.array (forced copy) instead of np.asarray — when the
# pandas column is already int dtype, asarray can return a view and the
# in-place shift below would silently mutate `dat`. int needed for
# np.bincount inside the sandwich code.
gr = np.array(dat['dyadid'], dtype=int)
gr -= gr.min()  # relabel groups to start at 0

clustered = cov_cluster(res_fake, gr)
bse_clu = np.sqrt(np.diag(clustered))
print(bse_clu)

# Calculate the sandwich directly: H = xxi * S * xxi', with S the
# cluster cross-section of the scores.
S = sc.S_crosssection(jac, gr)
# The next part is _HCCM, which shouldn't require results
xxi = res.normalized_cov_params
H = np.dot(np.dot(xxi, S), xxi.T)
bse_clu2 = np.sqrt(np.diag(H))
print(bse_clu2)

# Add the small-sample correction G/(G-1) * (N-1)/(N-k).
nobs, k_vars = X.shape
n_groups = len(np.unique(gr))
corr_fact = n_groups / (n_groups - 1.) * ((nobs - 1.) / float(nobs - k_vars))
bse_clu3 = np.sqrt(np.diag(H * corr_fact))
print(bse_clu3)
# bse_clu (via cov_cluster on the fake results) should match the
# corrected direct computation bse_clu3
bses['robust_corr'] = bse_clu
bses['robust_no_corr'] = bse_clu2
bses['robust_corr_'] = bse_clu3
print('\n')
print(bses)
# Saved console output from a previous run, kept as a module-level
# string so the reference numbers travel with the script.
''' Results
Note: "bfgs" ends up with singular Hessian, "newton" doesn't
[ 4.84419405e+23 -2.99759693e+53 -6.64613998e+35 -1.12701301e+36
9.00058591e+34 -1.86552394e+34]
[ 1.91432527e+01 2.34646505e-02 1.52971738e-02 1.87594821e-03
1.12834253e-03 9.22005732e-03]
bse non-robust 'newton'
[ 0.08170064 0.13790239 0.11111216 0.03587395 0.10845392 4.37528876]
bse cluster robust 'newton'
[ 0.18103571 0.46702637 0.27775385 0.06913378 0.25524573 9.47252558]
[ 0.18099112 0.46691133 0.27768544 0.06911675 0.25518286 9.4701923 ]
[ 0.18103433 0.4670228 0.27775173 0.06913326 0.25524379 9.4724533 ]
non-robust robust_corr robust_no_corr robust_corr_
Intercept 0.081701 0.181036 0.180991 0.181034
rivalry 0.137902 0.467026 0.466911 0.467023
jointDem1 0.111112 0.277754 0.277685 0.277752
logcapratio 0.035874 0.069134 0.069117 0.069133
contiguity 0.108454 0.255246 0.255183 0.255244
alpha 4.375289 9.472526 9.470192 9.472453
'''
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment