Skip to content

Instantly share code, notes, and snippets.

@rspeare
Last active February 4, 2024 02:50
Show Gist options
  • Star 28 You must be signed in to star a gist
  • Fork 6 You must be signed in to fork a gist
  • Save rspeare/77061e6e317896be29c6de9a85db301d to your computer and use it in GitHub Desktop.
Save rspeare/77061e6e317896be29c6de9a85db301d to your computer and use it in GitHub Desktop.
P values for sklearn logistic regression
from sklearn import linear_model
import numpy as np
import scipy.stats as stat
class LogisticReg:
"""
Wrapper Class for Logistic Regression which has the usual sklearn instance
in an attribute self.model, and pvalues, z scores and estimated
errors for each coefficient in
self.z_scores
self.p_values
self.sigma_estimates
as well as the negative hessian of the log Likelihood (Fisher information)
self.F_ij
"""
def __init__(self,*args,**kwargs):#,**kwargs):
self.model = linear_model.LogisticRegression(*args,**kwargs)#,**args)
def fit(self,X,y):
self.model.fit(X,y)
#### Get p-values for the fitted model ####
denom = (2.0*(1.0+np.cosh(self.model.decision_function(X))))
denom = np.tile(denom,(X.shape[1],1)).T
F_ij = np.dot((X/denom).T,X) ## Fisher Information Matrix
Cramer_Rao = np.linalg.inv(F_ij) ## Inverse Information Matrix
sigma_estimates = np.sqrt(np.diagonal(Cramer_Rao))
z_scores = self.model.coef_[0]/sigma_estimates # z-score for eaach model coefficient
p_values = [stat.norm.sf(abs(x))*2 for x in z_scores] ### two tailed test for p-values
self.z_scores = z_scores
self.p_values = p_values
self.sigma_estimates = sigma_estimates
self.F_ij = F_ij
@rspeare
Copy link
Author

rspeare commented May 11, 2020

Hey @dncortez! really excited to hear you used this for research in genomics! No need to cite this gist, perhaps an older source / paper on p-values for generalized linear models would be a good reference

-Rob

@dini437
Copy link

dini437 commented Jul 12, 2020

I have tried to use your code, but I do get errors: I have the whole codes and error shown below.

inputs_train.shape
(373028, 104)

loan_data_targets_train.shape
(373028, 1)

Now fitting it with p-values

from sklearn.linear_model import LogisticRegression
from sklearn import metrics

class LogisticRegression_with_p_values: # this is a new class of reg

def init(self,*args,**kwargs):#,**kwargs):
self.model = linear_model.LogisticRegression(*args,**kwargs)#,**args)
def fit(self,X,y):
self.model.fit(X,y)
#### Get p-values for the fitted model ####
denom = (2.0 * (1.0 + np.cosh(self.model.decision_function(X))))
denom = np.tile(denom,(X.shape[1],1)).T
F_ij = np.dot((X / denom).T,X) ## Fisher Information Matrix
Cramer_Rao = np.linalg.inv(F_ij) ## Inverse Information Matrix
sigma_estimates = np.sqrt(np.diagonal(Cramer_Rao))
z_scores = self.model.coef_[0] / sigma_estimates # z-score for eaach model coefficient
p_values = [stat.norm.sf(abs(x)) * 2 for x in z_scores] ### two tailed test for p-values
self.coef_ = self.model.coef_
self.intercept_ = self.model.intercept_
self.p_values = p_values # p values are store in a variable called p value
reg = LogisticRegression_with_p_values()
reg.fit(inputs_train, loan_data_targets_train)

RESULTS

LinAlgError Traceback (most recent call last)
in
----> 1 reg.fit(inputs_train, loan_data_targets_train)
2 # Estimates the coefficients of the object from the 'LogisticRegression' class
3 # with inputs (independent variables) contained in the first dataframe
4 # and targets (dependent variables) contained in the second dataframe.

in fit(self, X, y)
20 denom = np.tile(denom,(X.shape[1],1)).T
21 F_ij = np.dot((X / denom).T,X) ## Fisher Information Matrix
---> 22 Cramer_Rao = np.linalg.inv(F_ij) ## Inverse Information Matrix
23 sigma_estimates = np.sqrt(np.diagonal(Cramer_Rao))
24 z_scores = self.model.coef_[0] / sigma_estimates # z-score for eaach model coefficient

~\Anaconda3\lib\site-packages\numpy\linalg\linalg.py in inv(a)
549 signature = 'D->D' if isComplexType(t) else 'd->d'
550 extobj = get_linalg_error_extobj(_raise_linalgerror_singular)
--> 551 ainv = _umath_linalg.inv(a, signature=signature, extobj=extobj)
552 return wrap(ainv.astype(result_t, copy=False))
553

~\Anaconda3\lib\site-packages\numpy\linalg\linalg.py in _raise_linalgerror_singular(err, flag)
95
96 def _raise_linalgerror_singular(err, flag):
---> 97 raise LinAlgError("Singular matrix")
98
99 def _raise_linalgerror_nonposdef(err, flag):

LinAlgError: Singular matrix

@dini437
Copy link

dini437 commented Jul 12, 2020

Hi @rspeare , I tried the following code as per your guidance.

eps=1e-4
F_ij = np.dot((X / denom).T,X) + np.eye(F_ij.shape[0])*eps ## Fisher Information Matrix

but now I am having unbound local error: local variable 'F_ij' referenced before assignment.
Can you please help me on this. Thanks

@biohouston
Copy link

I edited code a bit so it now can work with multinomial logistic regression. The extended class should calculate statistics for each pair of classes of the dependent variable.
I also added a function to output the results of calculation properly.

class MNLogisticReg(linear_model.LogisticRegression):
    
    def __init__(self, *args,**kwargs):#,**kwargs):
        self.model = linear_model.LogisticRegression(*args,**kwargs)#,**args)
        if 'fit_intercept' in kwargs.keys():           
            self._fit_intercept = kwargs['fit_intercept']

    def fit(self,X,y):
        self.model.fit(X,y)
        #### Get p-values for the fitted model ####
        denom = (2.0*(1.0+np.cosh(self.model.decision_function(X))))
        p_values = []
        z_scores = []
        self.columns = list(X.columns)

        if self._fit_intercept:
            X = np.hstack([np.ones((X.shape[0], 1)), X])
           
        for i in range(denom.shape[1]):
            d = denom[:,i]        
            
            if self._fit_intercept:
                self.coef = np.column_stack((self.model.intercept_, self.model.coef_))
            else:
                self.coef = self.model.coef_
            
            d = np.tile(d,(X.shape[1],1)).T
            F_ij = np.dot((X/d).T,X) ## Fisher Information Matrix
            Cramer_Rao = np.linalg.inv(F_ij) ## Inverse Information Matrix  
            sigma_estimates = np.sqrt(np.diagonal(Cramer_Rao))
            z_score = (self.coef[i]/sigma_estimates) # z-score for each model coefficient
            z_scores.append(z_score)
            p_vals = [stat.norm.sf(abs(i))*2 for i in z_score] ### two tailed test for p-values
            p_values.append(p_vals)
            
        self.z_scores = np.array(z_scores)
        self.p_values = np.array(p_values)
        self.sigma_estimates = sigma_estimates
        self.F_ij = F_ij

    # A function to create an output in form of pandas dataframe, with regressors and intercept in
    # rows and coefficients in columns. Coefficients, p-values and z-scores are calculated for each
    # pair of classes in the dependent variable
    
    def printstats(self):      
        data = None
        for i in range(self.coef.shape[0]):
            if data is None:
                data = np.vstack(( self.coef[i,:], self.p_values[i,:], self.z_scores[i,:])).T
            else:
                d0 = np.vstack(( self.coef[i,:], self.p_values[i,:], self.z_scores[i,:])).T
                data = np.hstack((data,d0))
        # data is reshaped in the correct order
        regr = []
        for item in list(itertools.combinations(list(dep_acute.unique()), 2)):
            regr.append('{} vs {}'.format(item[0], item[1]))
            
        functions = ['coef', 'P-value', 'Z-score']
        column_names = [([i] + [j]) for i in regr for j in functions] 
        index = pd.MultiIndex.from_tuples(column_names)
        predictors = self.columns
        if self._fit_intercept:
            ind = ['intercept'] + predictors
        else:
            ind = predictors
        self.stats = pd.DataFrame(data, columns = index, index = ind)
        return self.stats

Here's an example of output:
image

@Green-Guo
Copy link

@biohouston Thanks a whole lot this was super helpful. Also I think this is calling a special implementation of multinomial? I did linear_model.LogisticRegression(multi_class='multinomial', max_iter=500, solver='lbfgs') and only got n series of coefs/p-values (n = groups of y classes) but your awesome code seems to generate C(n, 2) series? :)

I edited code a bit so it now can work with multinomial logistic regression. The extended class should calculate statistics for each pair of classes of the dependent variable.
I also added a function to output the results of calculation properly.

class MNLogisticReg(linear_model.LogisticRegression):
    
    def __init__(self, *args,**kwargs):#,**kwargs):
        self.model = linear_model.LogisticRegression(*args,**kwargs)#,**args)
        if 'fit_intercept' in kwargs.keys():           
            self._fit_intercept = kwargs['fit_intercept']

    def fit(self,X,y):
        self.model.fit(X,y)
        #### Get p-values for the fitted model ####
        denom = (2.0*(1.0+np.cosh(self.model.decision_function(X))))
        p_values = []
        z_scores = []
        self.columns = list(X.columns)

        if self._fit_intercept:
            X = np.hstack([np.ones((X.shape[0], 1)), X])
           
        for i in range(denom.shape[1]):
            d = denom[:,i]        
            
            if self._fit_intercept:
                self.coef = np.column_stack((self.model.intercept_, self.model.coef_))
            else:
                self.coef = self.model.coef_
            
            d = np.tile(d,(X.shape[1],1)).T
            F_ij = np.dot((X/d).T,X) ## Fisher Information Matrix
            Cramer_Rao = np.linalg.inv(F_ij) ## Inverse Information Matrix  
            sigma_estimates = np.sqrt(np.diagonal(Cramer_Rao))
            z_score = (self.coef[i]/sigma_estimates) # z-score for each model coefficient
            z_scores.append(z_score)
            p_vals = [stat.norm.sf(abs(i))*2 for i in z_score] ### two tailed test for p-values
            p_values.append(p_vals)
            
        self.z_scores = np.array(z_scores)
        self.p_values = np.array(p_values)
        self.sigma_estimates = sigma_estimates
        self.F_ij = F_ij

    # A function to create an output in form of pandas dataframe, with regressors and intercept in
    # rows and coefficients in columns. Coefficients, p-values and z-scores are calculated for each
    # pair of classes in the dependent variable
    
    def printstats(self):      
        data = None
        for i in range(self.coef.shape[0]):
            if data is None:
                data = np.vstack(( self.coef[i,:], self.p_values[i,:], self.z_scores[i,:])).T
            else:
                d0 = np.vstack(( self.coef[i,:], self.p_values[i,:], self.z_scores[i,:])).T
                data = np.hstack((data,d0))
        # data is reshaped in the correct order
        regr = []
        for item in list(itertools.combinations(list(dep_acute.unique()), 2)):
            regr.append('{} vs {}'.format(item[0], item[1]))
            
        functions = ['coef', 'P-value', 'Z-score']
        column_names = [([i] + [j]) for i in regr for j in functions] 
        index = pd.MultiIndex.from_tuples(column_names)
        predictors = self.columns
        if self._fit_intercept:
            ind = ['intercept'] + predictors
        else:
            ind = predictors
        self.stats = pd.DataFrame(data, columns = index, index = ind)
        return self.stats

Here's an example of output:
image

@JackCollins1991
Copy link

Hi @biohousten , your code was very helpful, but I am getting an error claiming 'dep_acute' is not defined, can you help me understand where this variable comes from? Cheers

@DavidBrown-RRT
Copy link

It boggles my mind that it is so freaking complicated to get a p-value in sklearn. My God, what is the freaking point of doing logistic regression if you can't assess whether your p-values are significant or not? Why is this not a basic function argument?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment