Skip to content

Instantly share code, notes, and snippets.

@ogrisel
Created February 4, 2021 15:34
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ogrisel/39b7156a2938dd2cd28448a58631bd78 to your computer and use it in GitHub Desktop.
Save ogrisel/39b7156a2938dd2cd28448a58631bd78 to your computer and use it in GitHub Desktop.
# %%
from time import perf_counter
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.cluster import MiniBatchKMeans
from sklearn.cluster import kmeans_plusplus
from sklearn.model_selection import train_test_split
from scipy.spatial.distance import cdist
from subprocess import run
from pprint import pprint
import pandas as pd
import numpy as np
import joblib
# Problem dimensions shared by every benchmark run below.
test_size = 50_000
n_samples = 1_000_000  # training set
n_features = 50
n_clusters = 300
n_blobs = 10

# Wrap make_blobs with a joblib cache stored under /tmp/joblib.
m = joblib.Memory(location="/tmp/joblib")
make_blobs = m.cache(make_blobs)

# Output of `git branch --show-current` for the working directory; it is
# stored alongside every result record.
git_branch = (
    run("git branch --show-current".split(), capture_output=True)
    .stdout
    .decode("utf-8")
    .strip()
)
# @m.cache
def time_kmpp(data, n_clusters, random_state=0):
    """Run ``kmeans_plusplus`` on ``data`` and measure how long it takes.

    Parameters
    ----------
    data
        Samples passed as-is to ``kmeans_plusplus``.
    n_clusters
        Number of centers requested from ``kmeans_plusplus``.
    random_state : default=0
        Forwarded to ``kmeans_plusplus``.

    Returns
    -------
    tuple
        ``(centers_init, kmpp_duration)``: the first element returned by
        ``kmeans_plusplus`` and the elapsed time of that call in seconds,
        as measured with ``perf_counter``.
    """
    started = perf_counter()
    # kmeans_plusplus returns a pair; its second element is not used here.
    centers, _ = kmeans_plusplus(
        data, n_clusters, random_state=random_state)
    elapsed = perf_counter() - started
    return centers, elapsed
def inertia_per_sample(data, labels, n_clusters):
    """Return the within-cluster sum of squares divided by the row count.

    For every label ``i`` in ``range(n_clusters)`` the centroid is recomputed
    as the mean of the rows of ``data`` carrying that label (it is not taken
    from a fitted estimator). The squared deviations of those rows from the
    centroid are summed over all clusters and the total is divided by the
    number of rows in ``data``.

    The centroid and the sum of squares are accumulated in float64 whatever
    the dtype of ``data`` is: reducing float32 input in its own precision
    makes the result less accurate.

    Parameters
    ----------
    data : ndarray
        Samples, one per entry along the first axis.
    labels : ndarray
        Cluster index of each sample. Samples whose label is outside
        ``range(n_clusters)`` add nothing to the sum but still count in the
        denominator.
    n_clusters : int
        Number of cluster labels to iterate over.

    Returns
    -------
    float
        Sum of squared deviations divided by ``data.shape[0]``.

    Raises
    ------
    ZeroDivisionError
        If ``data`` has no rows.
    """
    ss = 0.0
    for i in range(n_clusters):
        data_i = data[labels == i]
        if data_i.shape[0] == 0:
            # No sample carries this label: nothing to add.
            continue
        # A float64 centroid also promotes the difference below to float64.
        centroid_i = data_i.mean(axis=0, dtype=np.float64)
        # float() keeps the accumulator a plain Python float on every NumPy
        # version instead of letting it take the dtype of the partial sum.
        ss += float(((data_i - centroid_i) ** 2).sum(dtype=np.float64))
    return ss / data.shape[0]
def _fit_and_score(model, data_train, data_test, n_clusters):
    """Fit ``model`` on ``data_train`` and return its timing and scores.

    Only the call to ``fit`` is timed. The predictions used to compute the
    per-sample inertia on both splits are made after the timer has stopped.

    Returns a dict with the keys ``duration``, ``inertia_per_sample_train``,
    ``inertia_per_sample_test`` and ``n_iter``, in that order.
    """
    tic = perf_counter()
    model.fit(data_train)
    duration = perf_counter() - tic
    return dict(
        duration=duration,
        inertia_per_sample_train=inertia_per_sample(
            data_train, model.predict(data_train), n_clusters),
        inertia_per_sample_test=inertia_per_sample(
            data_test, model.predict(data_test), n_clusters),
        # Cast for both estimators so every record holds a plain int.
        n_iter=int(model.n_iter_),
    )


results = []
for seed in range(1):
    # Draw n_samples + test_size points so that the split below leaves
    # n_samples rows for training.
    data, _ = make_blobs(n_samples=n_samples + test_size,
                         n_features=n_features,
                         centers=n_blobs,
                         random_state=seed)
    data = data.astype(np.float32)
    # Identifier derived from the dataset content and the cluster count.
    task_id = joblib.hash((data, n_clusters))
    print(f"\n# Benchmarking on task {task_id}")
    data_train, data_test = train_test_split(data, test_size=test_size,
                                             random_state=0)

    # The k-means++ seeding only sees the first 10,000 training rows.
    centers_init, kmpp_duration = time_kmpp(data_train[:int(1e4)], n_clusters)
    print(f"kmeans_plus_plus duration: {kmpp_duration:.1f} s")

    # Baseline: inertia obtained by assigning each row to its nearest
    # initial center, before any k-means iteration.
    kmpp_labels_train = cdist(data_train, centers_init).argmin(axis=1)
    inertia_kmpp_train = inertia_per_sample(data_train, kmpp_labels_train,
                                            n_clusters)
    print(f"Initial train inertia per samples after km++ {inertia_kmpp_train}")
    kmpp_labels_test = cdist(data_test, centers_init).argmin(axis=1)
    inertia_kmpp_test = inertia_per_sample(data_test, kmpp_labels_test,
                                           n_clusters)
    print(f"Initial test inertia per samples after km++ {inertia_kmpp_test}")

    # Fields shared by the records of both estimators for this task.
    task_info = dict(
        task_id=task_id,
        git_branch=git_branch,
        n_samples=n_samples,
        n_features=n_features,
        n_clusters=n_clusters,
        n_blobs=n_blobs,
    )

    # Each estimator receives its own copy of the same initial centers.
    kmeans = MiniBatchKMeans(n_clusters=n_clusters,
                             init=centers_init.copy(),
                             n_init=1,
                             max_no_improvement=10,
                             batch_size=4096, random_state=0)
    results.append(dict(
        task_info,
        model_type="MinibatchKMeans",
        seed=seed,
        **_fit_and_score(kmeans, data_train, data_test, n_clusters),
    ))
    pprint(results[-1])

    # NOTE(review): newer scikit-learn releases renamed algorithm="full" to
    # "lloyd"; confirm against the scikit-learn version being benchmarked.
    kmeans = KMeans(n_clusters=n_clusters,
                    init=centers_init.copy(),
                    n_init=1,
                    max_iter=10000,
                    algorithm="full", random_state=0)
    results.append(dict(
        task_info,
        model_type="KMeans",
        algorithm="full",
        seed=seed,
        **_fit_and_score(kmeans, data_train, data_test, n_clusters),
    ))
    pprint(results[-1])

# %%
# One row per (task, estimator) record collected above.
pd.DataFrame(results).to_json("kmeans_eval.json")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment