Last active
December 20, 2015 10:19
-
-
Save GaelVaroquaux/6114603 to your computer and use it in GitHub Desktop.
Benchmarking the scikit-learn 0.14.X release
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import time | |
from sklearn import cluster | |
from sklearn import datasets | |
# Benchmark DBSCAN fit time on two datasets: synthetic blobs and a
# low-dimensional slice of the LFW faces data.

# LFW faces, keeping only the first 5 feature columns.
lfw = datasets.fetch_lfw_people()
X_lfw = lfw.data[:, :5]

# Using only 30000 samples, as above the 0.13 release runs out of memory
X_blobs, y = datasets.make_blobs(n_samples=30000, centers=12, n_features=10,
                                 random_state=42)

# Each entry pairs a dataset with the eps used for it.  The previous
# module-level ``eps`` assignments were dead code: the loop below rebinds
# ``eps`` before it is ever read.
# eps = 8. on the LFW slice gives 44 clusters.
for X, eps in [(X_blobs, 2.), (X_lfw, 8.)]:
    times = []
    for _ in range(10):
        t0 = time.time()
        clf = cluster.DBSCAN(eps=eps)
        clf.fit(X)
        times.append(time.time() - t0)

    # remove extremal values: drop the single slowest and single fastest
    # run so one outlier does not dominate the mean/std
    times.pop(np.argmax(times))
    times.pop(np.argmin(times))
    times = np.array(times)

    # Single-argument parenthesised print behaves the same on Python 2 and 3.
    print('DBSCAN time mean %.2f, std %.2f' % (np.mean(times), np.std(times)))
    print('%s samples with %s features' % X.shape)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import time | |
from sklearn import neighbors | |
from sklearn import datasets | |
from sklearn import decomposition | |
from sklearn import cross_validation | |
from scipy import spatial | |
# Benchmark build ("train") and query ("test") times of several
# nearest-neighbor tree implementations across datasets and dimensionalities.

lfw = datasets.fetch_lfw_people()

# (display name, tree class) pairs; each class is built as ``Tree(X_train)``
# and queried as ``tree.query(X_test)``.
trees = [('scipy cKDTree', spatial.cKDTree),
         ('sklearn KDTree', neighbors.KDTree),
         ('sklearn BallTree', neighbors.BallTree),
         ]

from sklearn.datasets import fetch_mldata
mnist = fetch_mldata('MNIST original')


def _drop_extremes(timings):
    """Return `timings` as an array without its single max and single min."""
    timings = list(timings)
    timings.pop(np.argmax(timings))
    timings.pop(np.argmin(timings))
    return np.array(timings)


for n_features in [3, 10, 30, 100, 200]:
    for dataset in ('mnist', 'little make_blobs', 'make_blobs', 'lfw'):
        if dataset == 'lfw':
            X = decomposition.RandomizedPCA(
                n_components=n_features).fit_transform(lfw.data)
        elif dataset == 'mnist':
            X = decomposition.RandomizedPCA(
                n_components=n_features).fit_transform(mnist.data)
        else:
            # Bug fix: this used to compare the ``datasets`` *module* to the
            # string, which is always False, so both blob datasets ended up
            # with the same number of samples.
            # NOTE(review): the two sizes were also attached to the opposite
            # labels (the "little" branch had 1e5); "little" now gets the
            # smaller count -- confirm this matches the intended benchmark.
            if dataset == 'little make_blobs':
                n_samples = int(1e3)
            else:
                n_samples = int(1e5)
            X, _ = datasets.make_blobs(n_samples=n_samples, centers=12,
                                       n_features=n_features, random_state=42)

        # Single-argument parenthesised print works on Python 2 and 3.
        print('\n%s: %s samples with %s features' % (dataset, X.shape[0],
                                                     X.shape[1]))

        for tree_name, Tree in trees:
            train_times = []
            test_times = []
            for train, test in cross_validation.ShuffleSplit(len(X), n_iter=10,
                                                             random_state=0):
                X_train = X[train]
                X_test = X[test]

                # Time tree construction on the training split.
                t0 = time.time()
                tree = Tree(X_train)
                train_times.append(time.time() - t0)

                # Time querying the tree with the held-out split.
                t0 = time.time()
                tree.query(X_test)
                test_times.append(time.time() - t0)

            # remove extremal values
            train_times = _drop_extremes(train_times)
            test_times = _drop_extremes(test_times)

            print('%s train time %.4fs (%.4fs) / test time %.4fs (%.4fs)' % (
                tree_name, np.mean(train_times),
                np.std(train_times), np.mean(test_times),
                np.std(test_times)))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment