{{ message }}

Instantly share code, notes, and snippets.

brentp/linear_model.py

Created Apr 10, 2013
calculate t statistics and p-values for coefficients in Linear Model in python, using scikit-learn framework.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode characters
 from sklearn import linear_model from scipy import stats import numpy as np class LinearRegression(linear_model.LinearRegression): """ LinearRegression class after sklearn's, but calculate t-statistics and p-values for model coefficients (betas). Additional attributes available after .fit() are `t` and `p` which are of the shape (y.shape, X.shape) which is (n_features, n_coefs) This class sets the intercept to 0 by default, since usually we include it in X. """ def __init__(self, *args, **kwargs): if not "fit_intercept" in kwargs: kwargs['fit_intercept'] = False super(LinearRegression, self)\ .__init__(*args, **kwargs) def fit(self, X, y, n_jobs=1): self = super(LinearRegression, self).fit(X, y, n_jobs) sse = np.sum((self.predict(X) - y) ** 2, axis=0) / float(X.shape - X.shape) se = np.array([ np.sqrt(np.diagonal(sse[i] * np.linalg.inv(np.dot(X.T, X)))) for i in range(sse.shape) ]) self.t = self.coef_ / se self.p = 2 * (1 - stats.t.cdf(np.abs(self.t), y.shape - X.shape)) return self

reed9999 commented Jul 10, 2015

I'm stuck using this because it fails on line 29 for i in range(sse.shape) with
IndexError: tuple index out of range

Problem seems to be that for me, sse has shape (), whereas it seems to be expecting a dimension. Perhaps related, I started with a one-dimension ndarray for my X (when I was using the base class LinearRegression) and had to do
X=district_data["my field name"][:,numpy.newaxis]
to get LinearRegression.fit() to work.

wildthingz commented Aug 15, 2016

or do this : se = np.array([np.sqrt(np.diagonal(sse * np.linalg.inv(np.dot(X.T, X))))])

nvergos commented May 8, 2017

Would this manipulation also work for extending scikit's Lasso?

derektsoimc commented Jun 6, 2017 • edited

Sorry I know it's really stupid, but I try to run the following code in my notebook and it didn't work

from sklearn import linear_model
from scipy import stats
import numpy as np
def __init__(self, *args, **kwargs):
if not "fit_intercept" in kwargs:
kwargs['fit_intercept'] = False
super(LinearRegression, self)\
.__init__(*args, **kwargs)

def fit(self, X, y, n_jobs=1):
self = super(LinearRegression, self).fit(X, y, n_jobs)

sse = np.sum((self.predict(X) - y) ** 2, axis=0) / float(X.shape - X.shape)
se = np.array([
np.sqrt(np.diagonal(sse[i] * np.linalg.inv(np.dot(X.T, X))))
for i in range(sse.shape)
])

self.t = self.coef_ / se
self.p = 2 * (1 - stats.t.cdf(np.abs(self.t), y.shape - X.shape))
return self
regr = LinearRegression()
regr.fit(X_train, y_train.to_frame())

It returns an error

RuntimeError: scikit-learn estimators should always specify their parameters in the signature of their __init__ (no varargs). <class '__main__.LinearRegression'> with constructor (self, *args, **kwargs) doesn't  follow this convention.
>

I've tried to google the baseEstimator of sklearn but I couldn't understand at all. Can anyone point out what mistake I have made?

catethos commented Jun 16, 2017

maybe you can change the init method into

def __init__(self, fit_intercept=True, normalize=False, copy_X=True,
n_jobs=1):
self.fit_intercept = fit_intercept
self.normalize = normalize
self.copy_X = copy_X
self.n_jobs = n_jobs

avbelyaev commented Mar 13, 2019

it seems this code doesn't work, neighter original, nor with __init__method from above

gguntovitch commented Jan 4, 2021

In the code below I am running time-series regression for the last 36 periods. I can find coefficients through 'model.coef_' but how do I find p-value for each of these coefficients?

for c in range(36,167):
Port = df2.iloc[range(c-36,c),3]
Var_x = df2.iloc[range(c-36,c),[0,4,5]]
y = Port.to_numpy()
x = Var_x.to_numpy().reshape(-1,3)
model = LinearRegression()
model.fit(x, y)
model = LinearRegression().fit(x, y)
r_sq = model.score(x, y)
print(model.coef_)

codeteme commented Sep 30, 2021

In the code below I am running time-series regression for the last 36 periods. I can find coefficients through 'model.coef_' but how do I find p-value for each of these coefficients?

for c in range(36,167): Port = df2.iloc[range(c-36,c),3] Var_x = df2.iloc[range(c-36,c),[0,4,5]] y = Port.to_numpy() x = Var_x.to_numpy().reshape(-1,3) model = LinearRegression() model.fit(x, y) model = LinearRegression().fit(x, y) r_sq = model.score(x, y) print(model.coef_)

You can try the following:

from sklearn.feature_selection import f_regression

f_statistic, p_value = f_regression(X, y)