Skip to content

Instantly share code, notes, and snippets.

What would you like to do?
"""
Bench feature agglomeration.

Benchmarks np.bincount method vs np.mean for feature agglomeration in
../sklearn/cluster/_feature_agglomeration. Use of np.bincount provides
a significant speed up if the pooling function is np.mean.

np.bincount performs better especially as the size of X and n_clusters
increase.
"""
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import FeatureAgglomeration
import time
def fit_agglomeration(n_features, n_clusters):
    """Fit a ``FeatureAgglomeration`` model on random data.

    Parameters
    ----------
    n_features : int
        Number of columns of the random training matrix.
    n_clusters : int
        Number of feature clusters passed to ``FeatureAgglomeration``.

    Returns
    -------
    X : ndarray
        The random training data, 200 rows by ``n_features`` columns.
    agglo : FeatureAgglomeration
        The estimator after ``fit(X)`` has been called on it.

    Notes
    -----
    Draws from the module-level ``rng``, which is created in the
    ``__main__`` section of this script.
    """
    X = rng.randn(200, n_features)
    agglo = FeatureAgglomeration(n_clusters=n_clusters)
    # Fitting is what populates ``agglo.labels_``; the benchmark reads that
    # attribute, so returning an unfitted estimator would fail later.
    agglo.fit(X)
    return X, agglo
def get_transformed_array(X, agglo, method):
    """Pool the columns of ``X`` by cluster mean using the chosen method.

    Parameters
    ----------
    X : ndarray
        Two-dimensional data; its number of columns must equal
        ``len(agglo.labels_)``.
    agglo : object
        Any object exposing a ``labels_`` array that assigns each column
        of ``X`` to a cluster (non-negative integers, as required by
        ``np.bincount``).
    method : {"bincount", "np_mean"}
        ``"bincount"`` computes per-row weighted bin counts divided by the
        cluster sizes; ``"np_mean"`` averages the masked columns of each
        cluster with ``np.mean``.

    Returns
    -------
    ndarray
        One row per row of ``X`` and one column per cluster, holding the
        mean of the features belonging to that cluster.

    Raises
    ------
    ValueError
        If ``X`` has a different number of columns than ``agglo.labels_``
        has entries, or if ``method`` is not one of the accepted values.
    """
    labels = agglo.labels_
    if len(labels) != X.shape[1]:
        raise ValueError("X has a different number of features than "
                         "during fitting.")

    if method == "bincount":
        # a fast way to compute the mean of grouped features:
        # weighted bincount gives the per-cluster sum, size the count
        size = np.bincount(labels)
        return np.array([np.bincount(labels, row) / size for row in X])

    if method == "np_mean":
        pooled = [np.mean(X[:, labels == label], axis=1)
                  for label in np.unique(labels)]
        # pooled is (n_clusters, n_samples); transpose to match bincount
        return np.array(pooled).T

    # Only reached for an unrecognised method name.
    raise ValueError("Method can have a value of 'bincount' or 'np_mean'")
if __name__ == "__main__":
    # Benchmark driver: for every (n_clusters, n_features) pair, time both
    # pooling methods on a fresh random matrix and plot time vs n_features.

    # Module-level generator; fit_agglomeration() also draws from it.
    rng = np.random.RandomState(0)

    for n_clusters in [1, 5, 10, 50, 100]:
        # Per-curve accumulators, reset for every n_clusters value.
        times_mean = []
        times_bincount = []
        n_features_this = []

        for n_features in [2, 10, 100, 200, 500, 1000]:
            if n_clusters >= n_features:
                # Skip configurations that do not have more features
                # than requested clusters.
                continue
            print(n_features, n_clusters)

            _, agglo = fit_agglomeration(n_features, n_clusters)
            # Time the transform on a much larger matrix than the one
            # used for fitting.
            X = rng.randn(100000, n_features)

            # perf_counter is monotonic and high-resolution, unlike time.time
            tick = time.perf_counter()
            get_transformed_array(X, agglo, "bincount")
            time_bincount = time.perf_counter() - tick

            tick = time.perf_counter()
            get_transformed_array(X, agglo, "np_mean")
            time_np_mean = time.perf_counter() - tick

            print('Took %s seconds using np.bincount' % (time_bincount))
            print('Took %s seconds using np.mean' % (time_np_mean))
            print("np.bincount is %s times faster"
                  % (time_np_mean / time_bincount))

            # Record the measurements so the curves below have data.
            n_features_this.append(n_features)
            times_mean.append(time_np_mean)
            times_bincount.append(time_bincount)

        plt.plot(n_features_this, times_mean, '--',
                 label="mean n_clusters={}".format(n_clusters))
        plt.plot(n_features_this, times_bincount,
                 label="bincount n_clusters={}".format(n_clusters))

    plt.xlabel("n_features")
    plt.ylabel("time (s)")
    plt.legend()
    plt.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.