Skip to content

Instantly share code, notes, and snippets.

@GaelVaroquaux
Last active December 20, 2015 10:19
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save GaelVaroquaux/6114603 to your computer and use it in GitHub Desktop.
Save GaelVaroquaux/6114603 to your computer and use it in GitHub Desktop.
Benchmarking the scikit-learn 0.14.X release
import numpy as np
import time
from sklearn import cluster
from sklearn import datasets
# Benchmark the wall-clock time of DBSCAN.fit on two datasets: synthetic
# blobs and the first five feature columns of the LFW people data.
lfw = datasets.fetch_lfw_people()
X_lfw = lfw.data[:, :5]

# Using only 30000 samples, as above the 0.13 release runs out of memory
X_blobs, _ = datasets.make_blobs(n_samples=30000, centers=12, n_features=10,
                                 random_state=42)

# Each dataset is paired with the eps used to cluster it. This list is the
# only place eps is set: the loop below rebinds it on every iteration.
benchmarks = [
    (X_blobs, 2.),
    (X_lfw, 8.),  # This choice of EPS gives 44 clusters
]

for X, eps in benchmarks:
    times = []
    for _ in range(10):
        t0 = time.time()
        clf = cluster.DBSCAN(eps=eps)
        clf.fit(X)
        times.append(time.time() - t0)

    # remove extremal values: after sorting, dropping the first and last
    # entries discards one minimum and one maximum timing
    times = np.sort(times)[1:-1]

    # Single-argument print(...) gives the same output on Python 2 and 3.
    print('DBSCAN time mean %.2f, std %.2f' % (np.mean(times), np.std(times)))
    print('%s samples with %s features' % X.shape)
import numpy as np
import time
from sklearn import neighbors
from sklearn import datasets
from sklearn import decomposition
from sklearn import cross_validation
from scipy import spatial
from sklearn.datasets import fetch_mldata


def _trimmed(times):
    """Return *times* as an array without one minimum and one maximum.

    Sorting and slicing off both ends removes the extremal measurements so
    that a single outlier does not dominate the reported mean and std.
    """
    return np.sort(times)[1:-1]


# Benchmark build time and query time of several nearest-neighbor trees on
# PCA-reduced LFW / MNIST data and on synthetic blobs.
lfw = datasets.fetch_lfw_people()
mnist = fetch_mldata('MNIST original')

# (label, constructor) pairs; every constructor is called as Tree(X_train)
# and the resulting object must provide a .query(X_test) method.
trees = [
    ('scipy cKDTree', spatial.cKDTree),
    ('sklearn KDTree', neighbors.KDTree),
    ('sklearn BallTree', neighbors.BallTree),
]

for n_features in [3, 10, 30, 100, 200]:
    for dataset in ('mnist', 'little make_blobs', 'make_blobs', 'lfw'):
        if dataset == 'lfw':
            X = decomposition.RandomizedPCA(
                n_components=n_features).fit_transform(lfw.data)
        elif dataset == 'mnist':
            X = decomposition.RandomizedPCA(
                n_components=n_features).fit_transform(mnist.data)
        else:
            # Compare the loop variable `dataset`, not the `datasets` module:
            # the latter never equals a string, which made both blob
            # datasets the same size.
            # NOTE(review): the sizes are assigned so that the "little"
            # variant is the smaller one, as its label says -- confirm that
            # this matches the intended benchmark.
            if dataset == 'little make_blobs':
                n_samples = int(1e3)
            else:
                n_samples = int(1e5)
            X, _ = datasets.make_blobs(n_samples=n_samples, centers=12,
                                       n_features=n_features,
                                       random_state=42)

        # Single-argument print(...) gives the same output on Python 2 and 3.
        print('\n%s: %s samples with %s features' % (dataset, X.shape[0],
                                                     X.shape[1]))

        for tree_name, Tree in trees:
            train_times = []
            test_times = []
            # Fixed random_state: every tree is timed on the same 10 splits.
            for train, test in cross_validation.ShuffleSplit(
                    len(X), n_iter=10, random_state=0):
                X_train = X[train]
                X_test = X[test]

                # Time the tree construction
                t0 = time.time()
                tree = Tree(X_train)
                train_times.append(time.time() - t0)

                # Time the query on the held-out samples
                t0 = time.time()
                tree.query(X_test)
                test_times.append(time.time() - t0)

            # remove extremal values
            train_times = _trimmed(train_times)
            test_times = _trimmed(test_times)

            print('%s train time %.4fs (%.4fs) / test time %.4fs (%.4fs)' % (
                tree_name, np.mean(train_times),
                np.std(train_times), np.mean(test_times),
                np.std(test_times)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment