Created
October 14, 2015 14:32
-
-
Save GaelVaroquaux/2465c9f3421a393b785f to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import matplotlib.pyplot as plt | |
from sklearn.linear_model import Ridge, Lasso | |
from sklearn.cross_validation import ShuffleSplit | |
from sklearn.grid_search import GridSearchCV | |
from sklearn.utils import check_random_state | |
from sklearn import datasets | |
# Seeded random state so the synthetic "l2" data below is reproducible.
rnd = check_random_state(1)

# set up dataset
n_samples = 200
n_features = 300

# l1 data (only 5 informative features)
X_1, y_1 = datasets.make_classification(n_samples=n_samples,
                                        n_features=n_features,
                                        n_informative=5,
                                        random_state=1)

# l2 data: non sparse, but less features
# Floor division keeps the column count an int: with true division
# (Python 3) ``n_features / 5`` is a float and ``randn`` rejects it.
n_dense_features = n_features // 5
# Labels are -1/+1 (sign of a uniform draw centred on zero).
y_2 = np.sign(.5 - rnd.rand(n_samples))
# Every feature is shifted by the label, then drowned in stronger noise.
X_2 = rnd.randn(n_samples, n_dense_features) + y_2[:, np.newaxis]
X_2 += 5 * rnd.randn(n_samples, n_dense_features)

# Each entry: (estimator, grid of ``alpha`` values to search, X, y).
# NOTE(review): the data labelled "l1" above is paired with Ridge and the
# data labelled "l2" with Lasso -- confirm this cross-pairing is intended.
clf_sets = [(Ridge(),
             np.logspace(0, 4, 20), X_1, y_1),
            (Lasso(),
             np.logspace(-2, 1, 20), X_2, y_2)]
# For every (estimator, alpha grid, dataset) triple, draw one figure with
# two stacked panels: CV score against the raw alpha, and CV score against
# alpha multiplied by the number of training samples.
#
# NOTE(review): this uses the legacy scikit-learn API that matches the
# file's imports (``ShuffleSplit(n=..., n_iter=...)`` and
# ``GridSearchCV.grid_scores_``). Porting to ``sklearn.model_selection``
# requires changing the imports and these calls together.
for fignum, (clf, cs, X, y) in enumerate(clf_sets):
    # set up the plot for each regressor
    plt.figure(fignum, figsize=(9, 10))
    estimator_name = clf.__class__.__name__
    param_grid = dict(alpha=cs)

    # Training fractions from 0.7 down to 0.15, largest first.
    for train_size in np.linspace(0.15, 0.7, 4)[::-1]:
        # To get nice curve, we need a large number of iterations to
        # reduce the variance
        cv = ShuffleSplit(n=n_samples, train_size=train_size,
                          n_iter=250, random_state=1)
        grid = GridSearchCV(clf, refit=False, param_grid=param_grid,
                            cv=cv, n_jobs=-1)
        grid.fit(X, y)
        # Second field of each ``grid_scores_`` entry, one per alpha
        # (the mean validation score in the legacy API).
        scores = [entry[1] for entry in grid.grid_scores_]

        # (multiplier applied to alpha on the x axis, label for the title)
        scales = [(1, 'No scaling'),
                  (n_samples * train_size, 'n_samples')]

        for subplotnum, (scaler, name) in enumerate(scales):
            plt.subplot(2, 1, subplotnum + 1)
            plt.xlabel('alpha')
            plt.ylabel('CV Score')
            grid_alphas = cs * float(scaler)  # rescale the alpha grid
            plt.semilogx(grid_alphas, scores,
                         label="fraction %.2f" % train_size)
            plt.title('scaling=%s, %s' % (name, estimator_name))

    # Called once per figure, so it applies to the panel drawn last.
    plt.legend(loc="best")

plt.show()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment