from sklearn import linear_model
import numpy as np
import scipy.stats as stat

class LogisticReg:
    """
    Wrapper class for Logistic Regression which has the usual sklearn instance
    in an attribute self.model, and p-values, z-scores and estimated
    errors for each coefficient in

    self.z_scores
    self.p_values
    self.sigma_estimates

    as well as the negative Hessian of the log-likelihood (Fisher information)

    self.F_ij
    """

    def __init__(self, *args, **kwargs):
        self.model = linear_model.LogisticRegression(*args, **kwargs)

    def fit(self, X, y):
        self.model.fit(X, y)
        #### Get p-values for the fitted model ####
        # 2*(1+cosh(f)) equals 1/(p*(1-p)) for p = sigmoid(f), so dividing X by
        # it applies the per-sample logistic variance weight p*(1-p).
        denom = (2.0 * (1.0 + np.cosh(self.model.decision_function(X))))
        denom = np.tile(denom, (X.shape[1], 1)).T
        F_ij = np.dot((X / denom).T, X)   ## Fisher information matrix, X^T W X
        Cramer_Rao = np.linalg.inv(F_ij)  ## Inverse information matrix
        sigma_estimates = np.sqrt(np.diagonal(Cramer_Rao))
        z_scores = self.model.coef_[0] / sigma_estimates  # z-score for each model coefficient
        p_values = [stat.norm.sf(abs(x)) * 2 for x in z_scores]  ### two-tailed test for p-values

        self.z_scores = z_scores
        self.p_values = p_values
        self.sigma_estimates = sigma_estimates
        self.F_ij = F_ij
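For context, denom above is 1/(p(1-p)) for the fitted probabilities p, so F_ij is the usual X^T W X observed information. An illustrative way to exercise the class on synthetic data (the data and settings here are assumptions, not part of the gist):

import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(500, 3))
y = (X[:, 0] + 0.5 * X[:, 1] + rng.normal(size=500) > 0).astype(int)

# A large C means almost no L2 penalty, so the Wald errors are closer to the
# unpenalized maximum-likelihood errors the formula above assumes.
lr = LogisticReg(C=1e9)
lr.fit(X, y)
print(lr.p_values)   # one two-tailed p-value per coefficient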
Hey @Akanksha594 and @wkangong and @Mikeal001:
One thing to try is adding a tiny amount to the diagonal of the matrix before inversion, e.g.:

eps = 1e-4
F_ij = np.dot((X / denom).T, X) + np.eye(F_ij.shape[0]) * eps  ## Fisher Information Matrix
Hi, rspeare! Thank you so much for your code. I used it in research in comparative genomics and now I'm in the process of writing the paper for publication. I want to know if you're OK with me citing this code directly with you as author, or if you'd prefer me to cite the original method on logistic regression and the binomial likelihood.
Cheers, Diego.
Hey @dncortez! Really excited to hear you used this for research in genomics! No need to cite this gist; perhaps an older source / paper on p-values for generalized linear models would be a good reference.
-Rob
I have tried to use your code, but I get errors. The whole code and the error are shown below.

inputs_train.shape
(373028, 104)

loan_data_targets_train.shape
(373028, 1)
Now fitting it with p-values
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

class LogisticRegression_with_p_values:  # this is a new class of reg
    def __init__(self, *args, **kwargs):
        self.model = linear_model.LogisticRegression(*args, **kwargs)

    def fit(self, X, y):
        self.model.fit(X, y)
        #### Get p-values for the fitted model ####
        denom = (2.0 * (1.0 + np.cosh(self.model.decision_function(X))))
        denom = np.tile(denom, (X.shape[1], 1)).T
        F_ij = np.dot((X / denom).T, X)   ## Fisher information matrix
        Cramer_Rao = np.linalg.inv(F_ij)  ## Inverse information matrix
        sigma_estimates = np.sqrt(np.diagonal(Cramer_Rao))
        z_scores = self.model.coef_[0] / sigma_estimates  # z-score for each model coefficient
        p_values = [stat.norm.sf(abs(x)) * 2 for x in z_scores]  ### two-tailed test for p-values
        self.coef_ = self.model.coef_
        self.intercept_ = self.model.intercept_
        self.p_values = p_values  # p-values are stored in this attribute

reg = LogisticRegression_with_p_values()
reg.fit(inputs_train, loan_data_targets_train)
RESULTS
LinAlgError Traceback (most recent call last)
<ipython-input-...> in <module>
----> 1 reg.fit(inputs_train, loan_data_targets_train)
2 # Estimates the coefficients of the object from the 'LogisticRegression' class
3 # with inputs (independent variables) contained in the first dataframe
4 # and targets (dependent variables) contained in the second dataframe.
<ipython-input-...> in fit(self, X, y)
20 denom = np.tile(denom,(X.shape[1],1)).T
21 F_ij = np.dot((X / denom).T,X) ## Fisher Information Matrix
---> 22 Cramer_Rao = np.linalg.inv(F_ij) ## Inverse Information Matrix
23 sigma_estimates = np.sqrt(np.diagonal(Cramer_Rao))
24 z_scores = self.model.coef_[0] / sigma_estimates # z-score for eaach model coefficient
~\Anaconda3\lib\site-packages\numpy\linalg\linalg.py in inv(a)
549 signature = 'D->D' if isComplexType(t) else 'd->d'
550 extobj = get_linalg_error_extobj(_raise_linalgerror_singular)
--> 551 ainv = _umath_linalg.inv(a, signature=signature, extobj=extobj)
552 return wrap(ainv.astype(result_t, copy=False))
553
~\Anaconda3\lib\site-packages\numpy\linalg\linalg.py in _raise_linalgerror_singular(err, flag)
95
96 def _raise_linalgerror_singular(err, flag):
---> 97 raise LinAlgError("Singular matrix")
98
99 def _raise_linalgerror_nonposdef(err, flag):
LinAlgError: Singular matrix
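A note on the error itself: "Singular matrix" means the Fisher information matrix X^T W X has no inverse, which in a design with 104 columns almost always points to linearly dependent columns (for example, a complete set of one-hot dummies for a categorical variable). A quick check, reusing the inputs_train name from the report above (a sketch, not part of the original code):

import numpy as np

X = np.asarray(inputs_train)
# If the rank is below the number of columns, some columns are exact linear
# combinations of others and X^T W X cannot be inverted.
print(np.linalg.matrix_rank(X), X.shape[1])

Dropping one dummy per categorical variable (or applying the diagonal jitter suggested above) usually resolves it.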
Hi @rspeare, I tried the following code as per your guidance:

eps = 1e-4
F_ij = np.dot((X / denom).T, X) + np.eye(F_ij.shape[0]) * eps  ## Fisher Information Matrix

but now I am getting an UnboundLocalError: local variable 'F_ij' referenced before assignment. Can you please help me with this? Thanks
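For anyone hitting the same thing: the UnboundLocalError is in the suggested snippet itself, which reads F_ij on the right-hand side before the variable exists. A minimal fix, assuming the line sits inside fit() where the original F_ij assignment was, is to size the identity matrix from X instead:

eps = 1e-4
# np.eye(X.shape[1]) has the same shape F_ij will have; F_ij itself is not
# defined yet at this point, which is what raised the UnboundLocalError.
F_ij = np.dot((X / denom).T, X) + np.eye(X.shape[1]) * eps  ## Fisher information matrix

The jitter is a small Tikhonov-style regularization: it guarantees the inverse exists, but it slightly biases the standard errors, so removing the collinear columns is the cleaner fix.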
I edited the code a bit so it can now work with multinomial logistic regression. The extended class should calculate statistics for each pair of classes of the dependent variable.
I also added a function to output the results of the calculation properly.

import itertools
import pandas as pd

class MNLogisticReg(linear_model.LogisticRegression):
    def __init__(self, *args, **kwargs):
        self.model = linear_model.LogisticRegression(*args, **kwargs)
        # Default matches sklearn's fit_intercept=True; the original only set
        # this attribute when the kwarg was passed explicitly.
        self._fit_intercept = kwargs.get('fit_intercept', True)

    def fit(self, X, y):
        self.model.fit(X, y)
        #### Get p-values for the fitted model ####
        denom = (2.0 * (1.0 + np.cosh(self.model.decision_function(X))))
        p_values = []
        z_scores = []
        self.columns = list(X.columns)
        if self._fit_intercept:
            X = np.hstack([np.ones((X.shape[0], 1)), X])
        for i in range(denom.shape[1]):
            d = denom[:, i]
            if self._fit_intercept:
                self.coef = np.column_stack((self.model.intercept_, self.model.coef_))
            else:
                self.coef = self.model.coef_
            d = np.tile(d, (X.shape[1], 1)).T
            F_ij = np.dot((X / d).T, X)        ## Fisher information matrix
            Cramer_Rao = np.linalg.inv(F_ij)   ## Inverse information matrix
            sigma_estimates = np.sqrt(np.diagonal(Cramer_Rao))
            z_score = (self.coef[i] / sigma_estimates)  # z-score for each model coefficient
            z_scores.append(z_score)
            p_vals = [stat.norm.sf(abs(z)) * 2 for z in z_score]  ### two-tailed test for p-values
            p_values.append(p_vals)
        self.z_scores = np.array(z_scores)
        self.p_values = np.array(p_values)
        self.sigma_estimates = sigma_estimates
        self.F_ij = F_ij

    # A function to create an output in the form of a pandas DataFrame, with
    # regressors and intercept in rows and coefficients in columns.
    # Coefficients, p-values and z-scores are calculated for each pair of
    # classes in the dependent variable.
    def printstats(self):
        data = None
        for i in range(self.coef.shape[0]):
            if data is None:
                data = np.vstack((self.coef[i, :], self.p_values[i, :], self.z_scores[i, :])).T
            else:
                d0 = np.vstack((self.coef[i, :], self.p_values[i, :], self.z_scores[i, :])).T
                data = np.hstack((data, d0))
        # data is reshaped in the correct order
        regr = []
        # NOTE: dep_acute is the author's own dependent-variable Series; it is
        # not defined in this snippet (see the discussion below).
        for item in itertools.combinations(list(dep_acute.unique()), 2):
            regr.append('{} vs {}'.format(item[0], item[1]))
        functions = ['coef', 'P-value', 'Z-score']
        column_names = [([i] + [j]) for i in regr for j in functions]
        index = pd.MultiIndex.from_tuples(column_names)
        predictors = self.columns
        if self._fit_intercept:
            ind = ['intercept'] + predictors
        else:
            ind = predictors
        self.stats = pd.DataFrame(data, columns=index, index=ind)
        return self.stats
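An illustrative usage sketch (the names X_df and y are assumptions; X must be a pandas DataFrame because the class reads X.columns, and dep_acute must hold the dependent-variable Series for printstats() to work):

mn = MNLogisticReg(multi_class='multinomial', solver='lbfgs', max_iter=500)
mn.fit(X_df, y)
dep_acute = y             # printstats() reads this module-level Series
print(mn.printstats())    # rows: intercept + predictors; columns: (class pair, stat)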
@biohouston Thanks a whole lot, this was super helpful. Also, I think this is calling a special implementation of multinomial? I did linear_model.LogisticRegression(multi_class='multinomial', max_iter=500, solver='lbfgs') and only got n series of coefs/p-values (n = number of y classes), but your awesome code seems to generate C(n, 2) series? :)
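On that count: the fit() loop above runs once per column of decision_function(X), i.e. once per class, so it stores n series of statistics. The C(n, 2) impression comes only from printstats(), which labels the columns after pairs of classes via itertools.combinations; the two counts coincide only at n = 3, as a quick check shows:

from itertools import combinations

for n in (2, 3, 4):
    print(n, len(list(combinations(range(n), 2))))  # n vs C(n, 2): equal only at n == 3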
Hi @biohouston, your code was very helpful, but I am getting an error saying 'dep_acute' is not defined. Can you help me understand where this variable comes from? Cheers
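From the code above, dep_acute looks like the author's own dependent-variable Series, referenced as a global inside printstats(). A sketch of one way to drop the external name (assuming y is whatever was passed to fit()):

import itertools
import numpy as np

# Record the class labels once, e.g. in fit(): self.y_classes = np.unique(y),
# then build the pair labels from them instead of the global dep_acute:
y_classes = np.unique(y)
regr = ['{} vs {}'.format(a, b) for a, b in itertools.combinations(y_classes, 2)]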
It boggles my mind that it is so freaking complicated to get a p-value in sklearn. My God, what is the point of doing logistic regression if you can't assess whether your coefficients are significant? Why is this not a basic function argument?
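For what it's worth, if p-values are the main goal, statsmodels reports them out of the box, since it fits by unpenalized maximum likelihood rather than sklearn's regularized default (synthetic data for illustration):

import numpy as np
import statsmodels.api as sm

rng = np.random.default_rng(0)
X = rng.normal(size=(500, 3))
y = (X[:, 0] + rng.normal(size=500) > 0).astype(int)

result = sm.Logit(y, sm.add_constant(X)).fit()  # add_constant prepends the intercept column
print(result.pvalues)                           # two-tailed Wald p-values per coefficient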
I am also having the same problem, with the same dataset, that @Mikeal001 reported. Please can you provide a solution for this error?
Thanks