Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
P values for sklearn logistic regression
from sklearn import linear_model
import numpy as np
import scipy.stats as stat
class LogisticReg:
    """
    Wrapper class for sklearn logistic regression that also computes
    Wald-test statistics for each coefficient after fitting.

    The underlying sklearn estimator lives in ``self.model``; after calling
    ``fit`` the following attributes are populated:

        self.z_scores        -- z-score for each model coefficient
        self.p_values        -- two-tailed p-value for each coefficient
        self.sigma_estimates -- estimated standard error of each coefficient
        self.F_ij            -- Fisher information matrix (negative Hessian
                                of the log-likelihood at the fitted point)

    NOTE(review): these Wald statistics are only meaningful for an
    (effectively) unpenalized fit -- pass a large ``C`` to neutralize
    sklearn's default L2 penalty.  They also ignore the intercept column;
    when ``fit_intercept=True`` the reported errors are approximate.
    """

    def __init__(self, *args, **kwargs):
        # All constructor arguments are forwarded verbatim to sklearn.
        self.model = linear_model.LogisticRegression(*args, **kwargs)

    def fit(self, X, y):
        """Fit the model on (X, y), then compute per-coefficient statistics.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,)

        Raises
        ------
        numpy.linalg.LinAlgError
            If the Fisher information matrix is singular (e.g. collinear
            or redundant feature columns).
        """
        self.model.fit(X, y)

        # Ensure an ndarray so broadcasting and np.dot behave uniformly
        # (pandas DataFrames were reported to trip the original tiling code).
        X = np.asarray(X)

        #### Get p-values for the fitted model ####
        # For the logistic likelihood the Hessian is X^T D X with
        # D = diag(p * (1 - p)); expressing p * (1 - p) through the decision
        # function f gives 1 / (2 * (1 + cosh(f))).
        denom = 2.0 * (1.0 + np.cosh(self.model.decision_function(X)))
        # Broadcast with denom[:, None] instead of np.tile: same result
        # without materializing an (n_samples, n_features) copy of denom.
        F_ij = np.dot((X / denom[:, None]).T, X)  ## Fisher Information Matrix
        Cramer_Rao = np.linalg.inv(F_ij)  ## Inverse Information Matrix
        sigma_estimates = np.sqrt(np.diagonal(Cramer_Rao))
        z_scores = self.model.coef_[0] / sigma_estimates  # z-score for each model coefficient
        # Two-tailed test; vectorized, but kept as a plain list so the
        # attribute type matches the original implementation.
        p_values = list(stat.norm.sf(np.abs(z_scores)) * 2)

        self.z_scores = z_scores
        self.p_values = p_values
        self.sigma_estimates = sigma_estimates
        self.F_ij = F_ij
@rizzomichaelg

This comment has been minimized.

Copy link

rizzomichaelg commented Nov 24, 2017

Thanks for posting this! I'm wondering if, for the Fisher info matrix calculation, the denom should be tiled first because I'm running into problems of division on arrays of different shape. Was thinking something like this:

denom = (2.0*(1.0+np.cosh(self.model.decision_function(X))))
denom = np.tile(denom,(X.shape[1],1)).T
F_ij = np.dot((X/denom).T,X) ## Fisher Information Matrix

I believe sigma_estimates can be condensed to: sigma_estimates = np.sqrt(np.diagonal(Cramer_Rao)).
I also extended this to include confidence intervals for each of the params (similar to how statsmodels does it):

alpha = 0.05
q = stats.norm.ppf(1 - alpha / 2)
lower = self.model.coef_[0] - q * sigma_estimates
upper = self.model.coef_[0] + q * sigma_estimates
self.conf_int = np.dstack((lower, upper))[0]

Thanks again for this!

@MiloVentimiglia

This comment has been minimized.

Copy link

MiloVentimiglia commented Sep 28, 2018

Hi,

Would you be able to provide any documentation/ book/article that served as base for this code?
I don't quite understand why, when calculating "denom", you apply a hyperbolic cosine to the decision scores.

Thanks in advance!

@rspeare

This comment has been minimized.

Copy link
Owner Author

rspeare commented Jan 13, 2019

Hey @rizzomichaelg, thanks so much for the comments. Put the changes in above. @MiloVentimiglia, you'll see that Cosh just comes from the Hessian of the binomial likelihood for logistic regression. (A little tricky but all Generalized linear models have a fisher information matrix of the form X.D.X^T, where X is the data matrix and D is some intermediary -- normally diagonal and in this case it's our cosh function)

@Mikeal001

This comment has been minimized.

Copy link

Mikeal001 commented Aug 16, 2019

I have tried to use your code, but I do get errors: I have the whole codes and error shown below.

inputs_train.shape
(373028, 104)

loan_data_targets_train.shape
(373028, 1)

Now fitting it with p-values

from sklearn.linear_model import LogisticRegression
from sklearn import metrics

class LogisticRegression_with_p_values: # this is a new class of reg

def __init__(self,*args,**kwargs):#,**kwargs):
    self.model = linear_model.LogisticRegression(*args,**kwargs)#,**args)
 def fit(self,X,y):
    self.model.fit(X,y)        
    #### Get p-values for the fitted model ####
    denom = (2.0 * (1.0 + np.cosh(self.model.decision_function(X))))
    denom = np.tile(denom,(X.shape[1],1)).T
    F_ij = np.dot((X / denom).T,X) ## Fisher Information Matrix
    Cramer_Rao = np.linalg.inv(F_ij) ## Inverse Information Matrix
    sigma_estimates = np.sqrt(np.diagonal(Cramer_Rao))
    z_scores = self.model.coef_[0] / sigma_estimates # z-score for eaach model coefficient
    p_values = [stat.norm.sf(abs(x)) * 2 for x in z_scores] ### two tailed test for p-values    
    self.coef_ = self.model.coef_
    self.intercept_ = self.model.intercept_
    self.p_values = p_values  # p values are store in a variable called p value

reg = LogisticRegression_with_p_values()
reg.fit(inputs_train, loan_data_targets_train)

RESULTS

LinAlgError Traceback (most recent call last)
in
----> 1 reg.fit(inputs_train, loan_data_targets_train)
2 # Estimates the coefficients of the object from the 'LogisticRegression' class
3 # with inputs (independent variables) contained in the first dataframe
4 # and targets (dependent variables) contained in the second dataframe.

in fit(self, X, y)
20 denom = np.tile(denom,(X.shape[1],1)).T
21 F_ij = np.dot((X / denom).T,X) ## Fisher Information Matrix
---> 22 Cramer_Rao = np.linalg.inv(F_ij) ## Inverse Information Matrix
23 sigma_estimates = np.sqrt(np.diagonal(Cramer_Rao))
24 z_scores = self.model.coef_[0] / sigma_estimates # z-score for eaach model coefficient

~\Anaconda3\lib\site-packages\numpy\linalg\linalg.py in inv(a)
549 signature = 'D->D' if isComplexType(t) else 'd->d'
550 extobj = get_linalg_error_extobj(_raise_linalgerror_singular)
--> 551 ainv = _umath_linalg.inv(a, signature=signature, extobj=extobj)
552 return wrap(ainv.astype(result_t, copy=False))
553

~\Anaconda3\lib\site-packages\numpy\linalg\linalg.py in _raise_linalgerror_singular(err, flag)
95
96 def _raise_linalgerror_singular(err, flag):
---> 97 raise LinAlgError("Singular matrix")
98
99 def _raise_linalgerror_nonposdef(err, flag):

LinAlgError: Singular matrix

@anaclaramatos

This comment has been minimized.

Copy link

anaclaramatos commented Aug 26, 2019

When fit_intercept = True, shouldn't a column of ones be added to x?

def fit(self, x, y):
        self.model.fit(x, y)

        denom = (2.0 * (1.0 + np.cosh(self.model.decision_function(x))))

        if self._fit_intercept:
            x = np.hstack([np.ones((x.shape[0], 1)), x])

        denom = np.tile(denom, (x.shape[1], 1)).T

        f_ij = np.dot((x / denom).T, x)  ## Fisher Information Matrix
        cramer_rao = np.linalg.inv(f_ij)  ## Inverse Information Matrix

        if self._fit_intercept:
            self.coef = np.column_stack((self.model.intercept_, self.model.coef_))
        else:
            self.coef = self.model.coef_

        self.sigma = np.sqrt(np.diagonal(cramer_rao))
        self.z = (self.coef / self.sigma)[0]
        self.p = (np.round([stat.norm.sf(abs(x)) * 2 for x in self.z], 3))
@PescheHelfer

This comment has been minimized.

Copy link

PescheHelfer commented Nov 3, 2019

It's quite possible I am doing something wrong, but I can't do predictions using the wrapper method because it does not know the method predict(). I tried to get around this problem by implementing inheritance, but failed miserably. Does anybody have an idea, what I am doing wrong?

If I use "self.model.fit(X,y)", I get the error

"This LogisticRegressionExtended instance is not fitted yet".

If I try to invoke the fit method in the base class by using "super().fit(X, y)", I get

`C:\Program Files (x86)\Microsoft Visual Studio\Shared\Anaconda3_64\envs\py36GPU\lib\site-packages\sklearn\linear_model\logistic.py in fit(self, X, y, sample_weight)
1198 Returns self.
1199 """
-> 1200 if not isinstance(self.C, numbers.Number) or self.C < 0:
1201 raise ValueError("Penalty term must be positive; got (C=%r)"
1202 % self.C)

AttributeError: 'LogisticRegressionExtended' object has no attribute 'C'`

When trying "super().model.fit(X, y)", I get

"'super' object has no attribute 'model'".

...
from sklearn import linear_model
import scipy.stats as stat
...
class LogisticRegressionExtended(linear_model.LogisticRegression):

    def __init__(self,*args,**kwargs):#,**kwargs):
        #self.model = LogisticRegression(*args,**kwargs)#,**args)
        super().__init__(*args, **kwargs)

    def fit(self,X,y):
        #self.model.fit(X,y)
        #super().fit(X, y)
        super()model.fit(X, y)
 ...    

Thanks for any help :)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.