-
-
Save rspeare/77061e6e317896be29c6de9a85db301d to your computer and use it in GitHub Desktop.
from sklearn import linear_model | |
import numpy as np | |
import scipy.stats as stat | |
class LogisticReg: | |
""" | |
Wrapper Class for Logistic Regression which has the usual sklearn instance | |
in an attribute self.model, and pvalues, z scores and estimated | |
errors for each coefficient in | |
self.z_scores | |
self.p_values | |
self.sigma_estimates | |
as well as the negative hessian of the log Likelihood (Fisher information) | |
self.F_ij | |
""" | |
def __init__(self,*args,**kwargs):#,**kwargs): | |
self.model = linear_model.LogisticRegression(*args,**kwargs)#,**args) | |
def fit(self,X,y): | |
self.model.fit(X,y) | |
#### Get p-values for the fitted model #### | |
denom = (2.0*(1.0+np.cosh(self.model.decision_function(X)))) | |
denom = np.tile(denom,(X.shape[1],1)).T | |
F_ij = np.dot((X/denom).T,X) ## Fisher Information Matrix | |
Cramer_Rao = np.linalg.inv(F_ij) ## Inverse Information Matrix | |
sigma_estimates = np.sqrt(np.diagonal(Cramer_Rao)) | |
z_scores = self.model.coef_[0]/sigma_estimates # z-score for eaach model coefficient | |
p_values = [stat.norm.sf(abs(x))*2 for x in z_scores] ### two tailed test for p-values | |
self.z_scores = z_scores | |
self.p_values = p_values | |
self.sigma_estimates = sigma_estimates | |
self.F_ij = F_ij | |
I edited code a bit so it now can work with multinomial logistic regression. The extended class should calculate statistics for each pair of classes of the dependent variable.
I also added a function to output the results of calculation properly.
class MNLogisticReg(linear_model.LogisticRegression):
def __init__(self, *args,**kwargs):#,**kwargs):
self.model = linear_model.LogisticRegression(*args,**kwargs)#,**args)
if 'fit_intercept' in kwargs.keys():
self._fit_intercept = kwargs['fit_intercept']
def fit(self,X,y):
self.model.fit(X,y)
#### Get p-values for the fitted model ####
denom = (2.0*(1.0+np.cosh(self.model.decision_function(X))))
p_values = []
z_scores = []
self.columns = list(X.columns)
if self._fit_intercept:
X = np.hstack([np.ones((X.shape[0], 1)), X])
for i in range(denom.shape[1]):
d = denom[:,i]
if self._fit_intercept:
self.coef = np.column_stack((self.model.intercept_, self.model.coef_))
else:
self.coef = self.model.coef_
d = np.tile(d,(X.shape[1],1)).T
F_ij = np.dot((X/d).T,X) ## Fisher Information Matrix
Cramer_Rao = np.linalg.inv(F_ij) ## Inverse Information Matrix
sigma_estimates = np.sqrt(np.diagonal(Cramer_Rao))
z_score = (self.coef[i]/sigma_estimates) # z-score for each model coefficient
z_scores.append(z_score)
p_vals = [stat.norm.sf(abs(i))*2 for i in z_score] ### two tailed test for p-values
p_values.append(p_vals)
self.z_scores = np.array(z_scores)
self.p_values = np.array(p_values)
self.sigma_estimates = sigma_estimates
self.F_ij = F_ij
# A function to create an output in form of pandas dataframe, with regressors and intercept in
# rows and coefficients in columns. Coefficients, p-values and z-scores are calculated for each
# pair of classes in the dependent variable
def printstats(self):
data = None
for i in range(self.coef.shape[0]):
if data is None:
data = np.vstack(( self.coef[i,:], self.p_values[i,:], self.z_scores[i,:])).T
else:
d0 = np.vstack(( self.coef[i,:], self.p_values[i,:], self.z_scores[i,:])).T
data = np.hstack((data,d0))
# data is reshaped in the correct order
regr = []
for item in list(itertools.combinations(list(dep_acute.unique()), 2)):
regr.append('{} vs {}'.format(item[0], item[1]))
functions = ['coef', 'P-value', 'Z-score']
column_names = [([i] + [j]) for i in regr for j in functions]
index = pd.MultiIndex.from_tuples(column_names)
predictors = self.columns
if self._fit_intercept:
ind = ['intercept'] + predictors
else:
ind = predictors
self.stats = pd.DataFrame(data, columns = index, index = ind)
return self.stats
@biohouston Thanks a whole lot this was super helpful. Also I think this is calling a special implementation of multinomial? I did linear_model.LogisticRegression(multi_class='multinomial', max_iter=500, solver='lbfgs') and only got n series of coefs/p-values (n = groups of y classes) but your awesome code seems to generate C(n, 2) series? :)
I edited code a bit so it now can work with multinomial logistic regression. The extended class should calculate statistics for each pair of classes of the dependent variable.
I also added a function to output the results of calculation properly.class MNLogisticReg(linear_model.LogisticRegression): def __init__(self, *args,**kwargs):#,**kwargs): self.model = linear_model.LogisticRegression(*args,**kwargs)#,**args) if 'fit_intercept' in kwargs.keys(): self._fit_intercept = kwargs['fit_intercept'] def fit(self,X,y): self.model.fit(X,y) #### Get p-values for the fitted model #### denom = (2.0*(1.0+np.cosh(self.model.decision_function(X)))) p_values = [] z_scores = [] self.columns = list(X.columns) if self._fit_intercept: X = np.hstack([np.ones((X.shape[0], 1)), X]) for i in range(denom.shape[1]): d = denom[:,i] if self._fit_intercept: self.coef = np.column_stack((self.model.intercept_, self.model.coef_)) else: self.coef = self.model.coef_ d = np.tile(d,(X.shape[1],1)).T F_ij = np.dot((X/d).T,X) ## Fisher Information Matrix Cramer_Rao = np.linalg.inv(F_ij) ## Inverse Information Matrix sigma_estimates = np.sqrt(np.diagonal(Cramer_Rao)) z_score = (self.coef[i]/sigma_estimates) # z-score for each model coefficient z_scores.append(z_score) p_vals = [stat.norm.sf(abs(i))*2 for i in z_score] ### two tailed test for p-values p_values.append(p_vals) self.z_scores = np.array(z_scores) self.p_values = np.array(p_values) self.sigma_estimates = sigma_estimates self.F_ij = F_ij # A function to create an output in form of pandas dataframe, with regressors and intercept in # rows and coefficients in columns. Coefficients, p-values and z-scores are calculated for each # pair of classes in the dependent variable def printstats(self): data = None for i in range(self.coef.shape[0]): if data is None: data = np.vstack(( self.coef[i,:], self.p_values[i,:], self.z_scores[i,:])).T else: d0 = np.vstack(( self.coef[i,:], self.p_values[i,:], self.z_scores[i,:])).T data = np.hstack((data,d0)) # data is reshaped in the correct order regr = [] for item in list(itertools.combinations(list(dep_acute.unique()), 2)): regr.append('{} vs {}'.format(item[0], item[1])) functions = ['coef', 'P-value', 'Z-score'] column_names = [([i] + [j]) for i in regr for j in functions] index = pd.MultiIndex.from_tuples(column_names) predictors = self.columns if self._fit_intercept: ind = ['intercept'] + predictors else: ind = predictors self.stats = pd.DataFrame(data, columns = index, index = ind) return self.stats
Hi @biohousten , your code was very helpful, but I am getting an error claiming 'dep_acute' is not defined, can you help me understand where this variable comes from? Cheers
It boggles my mind that it is so freaking complicated to get a p-value in sklearn. My God, what is the freaking point of doing logistic regression if you can't assess whether your p-values are significant or not? Why is this not a basic function argument?
Hi @rspeare , I tried the following code as per your guidance.
but now I am having unbound local error: local variable 'F_ij' referenced before assignment.
Can you please help me on this. Thanks