# amueller/bench_feat_agg.py

## bench_feat_agg.py
"""
Benchmark the np.bincount method against np.mean for feature agglomeration
in ../sklearn/cluster/_feature_agglomeration. Using np.bincount provides
a significant speedup when the pooling function is np.mean.

The advantage of np.bincount grows as the size of X and n_clusters
increase.
"""
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import FeatureAgglomeration
import time


def fit_agglomeration(n_features, n_clusters):
    """Fit a FeatureAgglomeration model on freshly drawn random data.

    A ``(200, n_features)`` sample is drawn from the module-level ``rng``
    and a ``FeatureAgglomeration`` with ``n_clusters`` clusters is fitted
    on it.

    Returns
    -------
    tuple
        ``(training_data, model)``: the sample that was drawn and the
        fitted estimator.
    """
    training_data = rng.randn(200, n_features)
    model = FeatureAgglomeration(n_clusters=n_clusters)
    model.fit(training_data)
    return training_data, model


def get_transformed_array(X, agglo, method):
    """Pool the feature columns of X into per-cluster means.

    Parameters
    ----------
    X : 2-D ndarray
        Data to transform; one row per sample, one column per feature.
    agglo : object
        Object exposing ``labels_``, an integer cluster label for each
        feature column of X.
    method : {"bincount", "np_mean"}
        Implementation to use. ``"bincount"`` sums each row per cluster
        with a weighted ``np.bincount``; ``"np_mean"`` calls ``np.mean``
        on the columns of each cluster in turn.

    Returns
    -------
    ndarray
        One row per sample and one column per cluster, holding the mean
        of the features assigned to that cluster.

    Raises
    ------
    ValueError
        If the number of labels differs from the number of columns of X,
        or if ``method`` is not one of the supported names.
    """
    labels = agglo.labels_
    if len(labels) != X.shape[1]:
        raise ValueError("X has a different number of features than "
                         "during fitting.")
    if method == "bincount":
        # A weighted bincount yields the per-cluster sum of one row;
        # dividing by the cluster sizes turns those sums into means.
        size = np.bincount(labels)
        return np.array([np.bincount(labels, row) / size for row in X])
    if method == "np_mean":
        # One mean per cluster gives (n_clusters, n_samples); transpose so
        # the layout matches the bincount branch.
        return np.array([np.mean(X[:, labels == label], axis=1)
                         for label in np.unique(labels)]).T
    raise ValueError("Method can have a value of 'bincount' or 'np_mean'")


if __name__ == "__main__":
    # Module-level generator; fit_agglomeration reads it as a global.
    rng = np.random.RandomState(0)
    for n_clusters in [1, 5, 10, 50, 100]:
        times_mean = []
        times_bincount = []
        n_features_this = []
        for n_features in [2, 10, 100, 200, 500, 1000]:
            # Skip combinations where n_clusters is not smaller than the
            # number of features.
            if n_clusters >= n_features:
                continue
            n_features_this.append(n_features)
            print(n_features, n_clusters)
            # Only the fitted model is needed; timing uses a larger sample.
            _, agglo = fit_agglomeration(n_features, n_clusters)
            X = rng.randn(100000, n_features)

            # perf_counter is monotonic and high-resolution, unlike
            # time.time, so it is the right clock for measuring intervals.
            tick = time.perf_counter()
            get_transformed_array(X, agglo, "bincount")
            time_bincount = time.perf_counter() - tick

            tick = time.perf_counter()
            get_transformed_array(X, agglo, "np_mean")
            time_np_mean = time.perf_counter() - tick

            print('==================')
            print('Took %s seconds using np.bincount' % (time_bincount))
            print('Took %s seconds using np.mean' % (time_np_mean))
            print('==================')
            print("np.bincount is %s times faster"
                  % (time_np_mean / time_bincount))
            times_mean.append(time_np_mean)
            times_bincount.append(time_bincount)
        # One dashed (np.mean) and one solid (bincount) curve per n_clusters.
        plt.plot(n_features_this, times_mean, '--',
                 label="mean n_clusters={}".format(n_clusters))
        plt.plot(n_features_this, times_bincount,
                 label="bincount n_clusters={}".format(n_clusters))
    # Kept inside the guard so that importing this module never opens a
    # plot window.
    plt.xlabel("n_features")
    plt.ylabel("time")
    plt.legend()
    plt.show()
	"""
	Benchmarks np.bincount method vs np.mean for feature agglomeration in
	../sklearn/cluster/_feature_agglomeration. Use of np.bincount provides
	a significant speed up if the pooling function is np.mean.

	np.bincount performs better especially as the size of X and n_clusters
	increase.
	"""
	import matplotlib.pyplot as plt
	import numpy as np
	from sklearn.cluster import FeatureAgglomeration
	import time


	def fit_agglomeration(n_features, n_clusters):
		"""Fit a FeatureAgglomeration model on freshly drawn random data.

		A ``(200, n_features)`` sample is drawn from the module-level ``rng``
		and a ``FeatureAgglomeration`` with ``n_clusters`` clusters is fitted
		on it.

		Returns
		-------
		tuple
			``(training_data, model)``: the sample that was drawn and the
			fitted estimator.
		"""
		training_data = rng.randn(200, n_features)
		model = FeatureAgglomeration(n_clusters=n_clusters)
		model.fit(training_data)
		return training_data, model


	def get_transformed_array(X, agglo, method):
		"""Pool the feature columns of X into per-cluster means.

		Parameters
		----------
		X : 2-D ndarray
			Data to transform; one row per sample, one column per feature.
		agglo : object
			Object exposing ``labels_``, an integer cluster label for each
			feature column of X.
		method : {"bincount", "np_mean"}
			Implementation to use. ``"bincount"`` sums each row per cluster
			with a weighted ``np.bincount``; ``"np_mean"`` calls ``np.mean``
			on the columns of each cluster in turn.

		Returns
		-------
		ndarray
			One row per sample and one column per cluster, holding the mean
			of the features assigned to that cluster.

		Raises
		------
		ValueError
			If the number of labels differs from the number of columns of X,
			or if ``method`` is not one of the supported names.
		"""
		labels = agglo.labels_
		if len(labels) != X.shape[1]:
			raise ValueError("X has a different number of features than "
				"during fitting.")
		if method == "bincount":
			# A weighted bincount yields the per-cluster sum of one row;
			# dividing by the cluster sizes turns those sums into means.
			size = np.bincount(labels)
			return np.array([np.bincount(labels, row) / size for row in X])
		if method == "np_mean":
			# One mean per cluster gives (n_clusters, n_samples); transpose so
			# the layout matches the bincount branch.
			return np.array([np.mean(X[:, labels == label], axis=1)
				for label in np.unique(labels)]).T
		raise ValueError("Method can have a value of 'bincount' or 'np_mean'")


	if __name__ == "__main__":
		# Module-level generator; fit_agglomeration reads it as a global.
		rng = np.random.RandomState(0)
		for n_clusters in [1, 5, 10, 50, 100]:
			times_mean = []
			times_bincount = []
			n_features_this = []
			for n_features in [2, 10, 100, 200, 500, 1000]:
				# Skip combinations where n_clusters is not smaller than the
				# number of features.
				if n_clusters >= n_features:
					continue
				n_features_this.append(n_features)
				print(n_features, n_clusters)
				# Only the fitted model is needed; timing uses a larger sample.
				_, agglo = fit_agglomeration(n_features, n_clusters)
				X = rng.randn(100000, n_features)

				# perf_counter is monotonic and high-resolution, unlike
				# time.time, so it is the right clock for measuring intervals.
				tick = time.perf_counter()
				get_transformed_array(X, agglo, "bincount")
				time_bincount = time.perf_counter() - tick

				tick = time.perf_counter()
				get_transformed_array(X, agglo, "np_mean")
				time_np_mean = time.perf_counter() - tick

				print('==================')
				print('Took %s seconds using np.bincount' % (time_bincount))
				print('Took %s seconds using np.mean' % (time_np_mean))
				print('==================')
				print("np.bincount is %s times faster"
					% (time_np_mean / time_bincount))
				times_mean.append(time_np_mean)
				times_bincount.append(time_bincount)
			# One dashed (np.mean) and one solid (bincount) curve per n_clusters.
			plt.plot(n_features_this, times_mean, '--',
				label="mean n_clusters={}".format(n_clusters))
			plt.plot(n_features_this, times_bincount,
				label="bincount n_clusters={}".format(n_clusters))
		# Kept inside the guard so that importing this module never opens a
		# plot window.
		plt.xlabel("n_features")
		plt.ylabel("time")
		plt.legend()
		plt.show()