Skip to content

Instantly share code, notes, and snippets.

@Erotemic
Created October 24, 2016 17:23
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
Save Erotemic/e1c8f3d8ada70194e9b4b51f23999ff3 to your computer and use it in GitHub Desktop.
Benchmark of _labels_inertia_precompute_dense
# -*- coding: utf-8 -*-
from __future__ import print_function, division, absolute_import, unicode_literals
import time
import itertools as it
from sklearn.utils.extmath import row_norms
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics.pairwise import pairwise_distances_argmin_min
import numpy as np
class Timer(object):
    """Context-manager stopwatch for timing code sections.

    Attributes:
        tstart (float): timestamp recorded by the last ``tic`` call
            (-1 until ``tic`` is called).
        ellapsed (float): seconds measured by the last ``with`` block
            (-1 until a block completes). Spelling is kept as-is because
            callers read ``t.ellapsed``.
        default_timer (callable): clock used for measurements.
    """

    def __init__(self):
        self.tstart = -1
        self.ellapsed = -1
        self.default_timer = time.time

    def tic(self):
        """Record the start timestamp."""
        self.tstart = self.default_timer()

    def toc(self):
        """Return seconds elapsed since the last ``tic``."""
        return self.default_timer() - self.tstart

    def __enter__(self):
        self.tic()
        return self

    def __exit__(self, type_, value, trace):
        self.ellapsed = self.toc()
        # Returning False lets any exception raised in the block propagate.
        if trace is not None:
            return False
def time_func(func_tup, iters=10):
    """Average wall-clock runtime of a callable over repeated runs.

    Args:
        func_tup (tuple): ``(callable, *positional_args)``.
        iters (int): number of timed repetitions.

    Returns:
        float: mean elapsed seconds per call.
    """
    func, args = func_tup[0], func_tup[1:]
    total = 0.0
    for _ in range(iters):
        with Timer() as timer:
            func(*args)
        total += timer.ellapsed
    return total / iters
def all_dict_combinations(varied_dict):
    """Expand a dict of candidate-value lists into all combinations.

    Args:
        varied_dict (dict): maps each key to either a list/tuple of candidate
            values or a single scalar value (treated as a one-element choice).

    Returns:
        list[dict]: one dict per point of the Cartesian product. Keys are
        processed in sorted order, so the output order is deterministic.
    """
    tups_list = []
    for key, val_list in sorted(varied_dict.items()):
        # Generalized: tuples of candidates are accepted as well as lists.
        if isinstance(val_list, (list, tuple)):
            tups_list.append([(key, val) for val in val_list])
        else:
            tups_list.append([(key, val_list)])
    return [dict(tups) for tups in it.product(*tups_list)]
def new_labels_inertia_precompute_dense(X, x_squared_norms, centers):
    """Assign each sample to its nearest center via batched argmin-min.

    Benchmark body only: labels/inertia are computed but not returned, so
    the timing measures just the assignment work. `x_squared_norms` is
    accepted to match the old implementation's signature but is unused here.
    """
    n_samples = X.shape[0]  # NOQA
    # pairwise_distances_argmin_min batches the distance computation
    # internally, which keeps peak memory bounded for large X.
    labels, mindist = pairwise_distances_argmin_min(
        X=X, Y=centers, metric='euclidean',
        metric_kwargs={'squared': True})
    # The cython k-means code expects int32 label arrays.
    labels = labels.astype(np.int32)
def old_labels_inertia_precompute_dense(X, x_squared_norms, centers):
    """Assign each sample to its nearest center via one dense distance matrix.

    Benchmark body only: materializes the full (k, n_samples) distance matrix
    at once (memory-heavy), then scans it one center at a time. Labels and
    inertia are computed but not returned, so the timing measures just the
    assignment work.

    Args:
        X (ndarray): (n_samples, n_features) data.
        x_squared_norms (ndarray): precomputed squared row norms of X,
            forwarded to euclidean_distances to avoid recomputation.
        centers (ndarray): (k, n_features) cluster centers.
    """
    n_samples = X.shape[0]
    k = centers.shape[0]
    all_distances = euclidean_distances(centers, X, x_squared_norms,
                                        squared=True)
    labels = np.full(n_samples, -1, dtype=np.int32)
    # FIX: np.infty was an alias removed in NumPy 2.0; use np.inf.
    mindist = np.full(n_samples, np.inf)
    for center_id in range(k):
        dist = all_distances[center_id]
        labels[dist < mindist] = center_id
        mindist = np.minimum(dist, mindist)
def make_X(n_clusters=2000, n_features=128, n_samples=10, dtype=np.float32):
    """Create a reproducible random benchmark problem.

    Args:
        n_clusters (int): number of cluster centers (rows of ``centers``).
        n_features (int): dimensionality of samples and centers.
        n_samples (int): number of data points (rows of ``X``).
        dtype: dtype that ``X`` is cast to (centers stay float64).

    Returns:
        tuple: ``(X, x_squared_norms, centers)`` where ``x_squared_norms``
        holds the *squared* L2 norm of each row of X, as the k-means
        helpers expect.
    """
    rng = np.random.RandomState(42)
    # BUG FIX: centers previously used ``n_samples`` rows, so the
    # benchmark's n_clusters axis was silently ignored.
    centers = rng.rand(n_clusters, n_features)
    X = rng.rand(n_samples, n_features).astype(dtype)
    # BUG FIX: row_norms defaults to squared=False, but the
    # "x_squared_norms" argument must be squared norms. Computed directly
    # with numpy; equivalent to row_norms(X, squared=True).
    x_squared_norms = np.einsum('ij,ij->i', X, X)
    return X, x_squared_norms, centers
def single_benchmark(n_clusters, n_features, n_samples, dtype=np.float32):
    """Time the old vs new label-assignment implementations on one problem.

    Returns:
        dict: estimated distance-matrix memory in MB for each approach
        (``MB_old`` / ``MB_new``) plus average runtimes in seconds
        (``old_speed`` / ``new_speed``).
    """
    X, x_squared_norms, centers = make_X(n_clusters, n_features, n_samples,
                                         dtype)
    batch_size = 500
    bytes_per_item = dtype(0).nbytes
    n_centers = centers.shape[0]
    # The old path builds the full (k, n) matrix; the new path only ever
    # holds one batch of rows at a time.
    rows_old = X.shape[0]
    rows_new = min(batch_size, X.shape[0])
    measures = {
        'MB_old': (rows_old * n_centers * bytes_per_item) / 2 ** 20,
        'MB_new': (rows_new * n_centers * bytes_per_item) / 2 ** 20,
    }
    print('measures = %r' % (measures,))
    measures['old_speed'] = time_func(
        (old_labels_inertia_precompute_dense, X, x_squared_norms, centers))
    measures['new_speed'] = time_func(
        (new_labels_inertia_precompute_dense, X, x_squared_norms, centers))
    return measures
def run_benchmark():
    """Run the benchmark grid and print a comparison table.

    Sweeps the Cartesian product of problem sizes in ``basis``, times both
    implementations on each configuration, and prints a pandas table sorted
    by the absolute speedup of the new implementation.
    """
    import pandas as pd
    pd.options.display.max_rows = 1000
    pd.options.display.width = 1000
    vals = []
    basis = {
        'n_clusters': [10, 100, 1000, 10000][::-1],
        'n_features': [16, 32, 128][::-1],
        'n_samples': [10, 100, 100000][::-1],
    }
    for kw in all_dict_combinations(basis):
        print('kw = %r' % (kw,))
        measures = single_benchmark(**kw)
        kw.update(measures)
        vals.append(kw)
        print('---------')
    df = pd.DataFrame.from_dict(vals)
    # Positive percent_change means the new implementation is faster.
    df['percent_change'] = 100 * (df['old_speed'] - df['new_speed']) / df['old_speed']
    new_keys = ['MB_new', 'MB_old', 'new_speed', 'old_speed', 'percent_change']
    old_keys = sorted(set(df.columns) - set(new_keys))
    # BUG FIX: DataFrame.reindex_axis was deprecated in pandas 0.21 and
    # removed in 1.0; reindex(columns=...) is the supported equivalent.
    df = df.reindex(columns=old_keys + new_keys)
    df['absolute_change'] = (df['old_speed'] - df['new_speed'])
    print(df.sort_values('absolute_change', ascending=False))
    # print(df['percent_change'][df['absolute_change'] > .1].mean())
if __name__ == '__main__':
    # Entry point: run the full benchmark sweep when executed as a script.
    run_benchmark()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment