@bthirion
Last active December 15, 2015 08:49
Experiment on GCV illustrating a problem with sklearn's GCV-based RidgeCV. The issue arises when p > n: because of centering, the design matrix becomes rank deficient in the scikit-learn implementation, which degrades the numerical behaviour and makes sklearn's alpha selection very poor.
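A quick way to see the rank-deficiency mechanism mentioned above (a minimal sketch, assuming nothing beyond numpy): when p > n the raw design X has full row rank n, but centering its columns removes one degree of freedom, so the centered matrix drops to rank n - 1.

import numpy as np
rng = np.random.RandomState(0)
X = rng.randn(100, 200)            # n = 100 samples, p = 200 features
Xc = X - X.mean(axis=0)            # column centering, as done when fitting an intercept
print(np.linalg.matrix_rank(X))    # 100: full row rank
print(np.linalg.matrix_rank(Xc))   # 99: centering removes one dimension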
import numpy as np
from sklearn import linear_model


def _gcv(X, y, alphas):
    """Local GCV: return the best alpha and the corresponding score."""
    n1, p = X.shape
    # singular values of the design matrix
    _, s, _ = np.linalg.svd(X, 0)
    clf = linear_model.Ridge()
    best_gcv = np.inf
    for alpha in alphas:
        dof = np.sum(s ** 2 / (s ** 2 + alpha))  # ridge degrees of freedom
        clf.alpha = alpha
        clf.fit(X, y)
        # classical GCV formula: SSE / n / (1 - dof / n) ** 2
        # (superseded below by the explicit leave-one-out form)
        sse = (1 - clf.score(X, y)) * np.var(y) * n1
        gcv_ = sse / n1 / (1 - dof / n1) ** 2
        # hat matrix M = X (X'X + alpha I)^-1 X', computed in the primal or
        # dual form depending on whether n1 > p
        if n1 > p:
            M = np.dot(np.dot(
                X, np.linalg.inv(np.dot(X.T, X) + alpha * np.eye(p))), X.T)
        else:
            K = np.dot(X, X.T)
            M = (K - np.dot(np.dot(K, np.linalg.inv(K + np.eye(n1) * alpha)),
                            K)) / alpha
        # leave-one-out residuals from the diagonal of the hat matrix
        gcv_ = np.sum(((y - clf.predict(X)) / (1 - np.diag(M))) ** 2)
        if gcv_ < best_gcv:
            best_gcv = gcv_
            best_alpha = alpha
    return best_alpha, best_gcv
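
# Sanity-check sketch (not part of the original gist): for ridge with a fixed
# penalty and no intercept, the leave-one-out residual of sample i equals
# (y_i - yhat_i) / (1 - M_ii), where M = X (X'X + alpha I)^-1 X' is the same
# hat-matrix formula as in _gcv above. The helper below verifies this identity
# by brute force on a small problem; _check_loo_identity() should return True.
def _check_loo_identity(n=20, p=5, alpha=1.0, seed=0):
    rng = np.random.RandomState(seed)
    X = rng.randn(n, p)
    y = rng.randn(n)
    M = np.dot(np.dot(X, np.linalg.inv(np.dot(X.T, X) + alpha * np.eye(p))),
               X.T)
    shortcut = (y - np.dot(M, y)) / (1 - np.diag(M))
    brute = np.empty(n)
    for i in range(n):
        mask = np.arange(n) != i
        clf = linear_model.Ridge(alpha=alpha, fit_intercept=False)
        clf.fit(X[mask], y[mask])
        brute[i] = y[i] - clf.predict(X[i:i + 1])[0]
    return np.allclose(shortcut, brute)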
# set the parameters
alphas = np.logspace(-2, 6, 9)
n1, p = 100, 200
# n1: number of training samples
# p: number of features
np.random.seed([2])
# generate the data
y, X, beta = (np.random.randn(n1), np.random.randn(n1, p), np.random.randn(p))
beta *= .1 # control the SNR
y += np.dot(X, beta)
alpha, _ = _gcv(X, y, alphas)
# try with sklearn's GCV
gcv1 = linear_model.RidgeCV(alphas=alphas).fit(X, y)
print('sklearn alpha: ', gcv1.alpha_, 'local implementation: ', alpha)
gcv2 = linear_model.RidgeCV(alphas=[alpha]).fit(X, y)
# our estimate is actually much better than scikit-learn's one
print('error on beta: %f %f' % (np.sum((gcv1.coef_ - beta) ** 2),
                                np.sum((gcv2.coef_ - beta) ** 2)))
# this can easily be fixed by not applying the centering of the matrix X
# sklearn.linear_model.ridge, line 564
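
# Hedged workaround sketch (not in the original gist): fitting RidgeCV without
# an intercept should skip the internal centering of X, so the p > n design
# keeps its full row rank; this assumes no intercept is needed, as in this
# example where the data are generated without one.
gcv3 = linear_model.RidgeCV(alphas=alphas, fit_intercept=False).fit(X, y)
print('alpha without centering: ', gcv3.alpha_,
      'error on beta: %f' % np.sum((gcv3.coef_ - beta) ** 2))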