Skip to content

Instantly share code, notes, and snippets.

@ogrisel
Created December 30, 2011 16:04
Show Gist options
  • Star 7 You must be signed in to star a gist
  • Fork 5 You must be signed in to fork a gist
  • Save ogrisel/1540431 to your computer and use it in GitHub Desktop.
Save ogrisel/1540431 to your computer and use it in GitHub Desktop.
Learning Curves for under/overfitting evaluation
"""Plot error curves for various dataset sizes
This plot emphasizes the fact that adding more data might help: by
following the trend, one can expect the test score to increase further and
the overfitting to decrease a bit, as the slope of the test score curve is
still non-zero on the right-hand side of the plot, especially when the prior /
regularizer parameter alpha is low.
Furthermore, as the train score seems to remain pretty close to the perfect
score when the dataset size grows, one can estimate that the model does not
underfit too much (is not too simplistic / biased).
See the following blog post for more details on how to interpret them:
http://digitheadslabnotebook.blogspot.com/2011/12/practical-advice-for-applying-machine.html
"""
import numpy as np
from time import time
from sklearn.cross_validation import ShuffleSplit
from sklearn.datasets import fetch_20newsgroups_vectorized
from sklearn.naive_bayes import MultinomialNB
from sklearn.base import clone
def score_curves(clf_orig, X, y, n_runs=5, test_fraction=0.1,
                 train_fraction_range=np.linspace(0.1, 0.9, 10)):
    """Measure train/test scores and fit times for growing training sets.

    For every value in ``train_fraction_range`` a ``ShuffleSplit`` producing
    ``n_runs`` random splits is built. For each split a fresh clone of
    ``clf_orig`` is fitted on the training part, the fit is timed, and the
    fitted model is scored on both the training part and the test part.

    Parameters
    ----------
    clf_orig : estimator
        Template estimator exposing ``fit`` and ``score``. It is cloned
        before every fit; the clones are the objects that get fitted.
    X : object with a ``shape`` attribute, indexable by the split selectors
        Samples, one per row.
    y : object indexable by the split selectors
        Targets aligned with the rows of ``X``.
    n_runs : int
        Number of random splits evaluated per training fraction.
    test_fraction : float
        Fraction of the dataset held out for testing in every split.
    train_fraction_range : sequence of float
        Training fractions to evaluate, one learning-curve point each.

    Returns
    -------
    training_score, test_score, training_time : ndarray
        Three arrays of shape ``(len(train_fraction_range), n_runs)`` holding
        the training score, the test score and the fit duration in seconds.
    """
    # Derive the sample count from the data itself rather than relying on a
    # module-level ``n_samples`` that only exists when run as a script.
    n_samples = X.shape[0]
    n_datasets = len(train_fraction_range)

    training_score = np.zeros((n_datasets, n_runs))
    test_score = np.zeros((n_datasets, n_runs))
    training_time = np.zeros((n_datasets, n_runs))

    for i, train_fraction in enumerate(train_fraction_range):
        # Parenthesised single-argument print behaves the same on Python 2
        # and Python 3.
        print("Train fraction: %0.2f" % train_fraction)
        cv = ShuffleSplit(n_samples, n_iterations=n_runs,
                          test_fraction=test_fraction,
                          train_fraction=train_fraction)

        for j, (train, test) in enumerate(cv):
            clf = clone(clf_orig)

            # Time only the fit, not the scoring.
            t0 = time()
            clf.fit(X[train], y[train])
            training_time[i, j] = time() - t0

            training_score[i, j] = clf.score(X[train], y[train])
            test_score[i, j] = clf.score(X[test], y[test])

    return training_score, test_score, training_time
if __name__ == "__main__":
    # NOTE: everything below intentionally stays at module level (no main()
    # wrapper) so that the names assigned here remain module globals.

    # Load the pre-vectorized 20 newsgroups data (subset='all').
    data = fetch_20newsgroups_vectorized(subset='all')
    categories = data.target_names
    X = data.data
    y = data.target
    n_samples = y.shape[0]
    n_features = X.shape[1]
    # Parenthesised single-argument print works on both Python 2 and 3.
    print("n_samples: %d, n_features: %d" % (n_samples, n_features))

    # Model and evaluation grid: 5 training fractions, fixed test fraction.
    clf = MultinomialNB(alpha=.01)
    train_fraction_range = np.linspace(0.1, 0.9, 5)
    test_fraction = 0.1

    train_score, test_score, training_time = score_curves(
        clf, X, y, train_fraction_range=train_fraction_range,
        test_fraction=test_fraction)

    print(clf)

    # Average over the runs (axis 1) to get one point per training fraction.
    mean_test_score = test_score.mean(axis=1)
    mean_train_score = train_score.mean(axis=1)
    # Absolute train/test difference at each training fraction.
    gap = np.abs(mean_test_score - mean_train_score)

    print("Best test score: %0.2f" % mean_test_score.max())
    print("Gap at train_fraction=%0.2f: %0.2f" % (
        train_fraction_range[-1], gap[-1]))

    # Plotting backend is imported only once the scores have been computed.
    import pylab as pl

    # One error-bar curve per score type; the bars show the standard
    # deviation across runs.
    plots = []
    plots.append(pl.errorbar(train_fraction_range,
                             mean_train_score,
                             train_score.std(axis=1)))
    plots.append(pl.errorbar(train_fraction_range,
                             mean_test_score,
                             test_score.std(axis=1)))
    pl.legend(plots, ('train', 'test'), loc='lower right')

    pl.title("Learning curves for %r\n"
             "Best test score: %0.2f - Gap: %0.2f" %
             (clf, mean_test_score.max(), gap[-1]))
    pl.ylim(0.0, 1.0)
    pl.ylabel('Classification score')
    pl.xlabel('Fraction of the dataset used for training\n'
              'The test fraction is fixed to %0.2f' % test_fraction)
    pl.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment