Navigation Menu

Skip to content

Instantly share code, notes, and snippets.

@beckernick
Last active October 12, 2022 13:46
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save beckernick/330fbed460061c7f64cfa9929111cdd1 to your computer and use it in GitHub Desktop.
Save beckernick/330fbed460061c7f64cfa9929111cdd1 to your computer and use it in GitHub Desktop.
hdbscan all points membership vectors benchmark
import time
import json
import os
from datetime import datetime
import numpy as np
import cuml
import hdbscan
class Timer:
    """Context manager that measures how long its ``with`` body takes.

    After the block exits, three attributes are available:

    * ``tick``    -- clock reading taken on entry
    * ``tock``    -- clock reading taken on exit
    * ``elapsed`` -- ``tock - tick``, the duration in seconds

    The readings come from ``time.perf_counter()``, a monotonic,
    high-resolution clock intended for measuring intervals. ``tick`` and
    ``tock`` are therefore only meaningful relative to each other; they are
    not wall-clock timestamps.
    """

    def __enter__(self):
        """Record the start reading and return the timer itself."""
        self.tick = time.perf_counter()
        return self

    def __exit__(self, *args, **kwargs):
        """Record the end reading and compute ``elapsed``.

        Returns ``None``, so an exception raised inside the ``with`` body
        still propagates to the caller.
        """
        self.tock = time.perf_counter()
        self.elapsed = self.tock - self.tick
# Warmup: fit a throwaway cuML HDBSCAN model on a tiny 50x20 integer array
# before any timed run.
# NOTE(review): presumably this absorbs one-time GPU/library start-up cost so
# it does not pollute the first measurement -- confirm.
clusterer = cuml.cluster.hdbscan.HDBSCAN(prediction_data=True)
clusterer.fit(np.arange(1000).reshape((50, 20)))
# Params
# HDBSCAN hyperparameters passed unchanged to every backend.
MIN_SAMPLES = 50
MIN_CLUSTER_SIZE = 5

# Feature counts and row counts to benchmark; each (size, feature count) pair
# selects one pre-computed embeddings file by name.
NFEATURES = [5]
SIZES = [25000, 50000, 100000, 200000, 400000]

# Libraries to compare: "cuml" selects cuml.cluster.hdbscan, anything else
# selects the hdbscan package.
BACKENDS = ["cuml", "hdbscan"]

# Results go to one JSON-lines file per calendar day.
DATE_TAG = datetime.now().strftime("%Y-%m-%d")
outpath = f"hdbscan-apmv-benchmark-results-{DATE_TAG}.jsonl"

# Start from a clean file because results are appended later. Attempting the
# removal and ignoring "not found" avoids the check-then-delete race.
try:
    os.remove(outpath)
except FileNotFoundError:
    pass
# Benchmark every (row count, feature count, backend) combination and append
# one JSON line per run to `outpath`.
for n in SIZES:
    for k in NFEATURES:
        # The input depends only on (n, k), so read it once and hand the same
        # array to every backend instead of re-loading it per backend.
        reduced_path = f"million_news_articles_embeddings_reduced_{n}_{k}.npy"
        reduced_data = np.load(reduced_path)

        for library in BACKENDS:
            backend = cuml.cluster.hdbscan if library == "cuml" else hdbscan

            # Timed section 1: model construction plus fit.
            with Timer() as fit_timer:
                clusterer = backend.HDBSCAN(
                    min_samples=MIN_SAMPLES,
                    min_cluster_size=MIN_CLUSTER_SIZE,
                    metric='euclidean',
                    prediction_data=True,
                )
                clusterer.fit(reduced_data)

            # HDBSCAN labels noise points -1; count only real clusters so
            # "num_clusters" is not inflated by the noise label.
            labels = clusterer.labels_
            nclusters = len(np.unique(labels[labels >= 0]))

            # Timed section 2: soft-clustering membership vectors for all
            # points. The result is not inspected; only the time matters.
            with Timer() as membership_timer:
                soft_clusters = backend.all_points_membership_vectors(clusterer)

            benchmark_payload = {
                "backend": library,
                "nrows": n,
                "min_samples": MIN_SAMPLES,
                "min_cluster_size": MIN_CLUSTER_SIZE,
                "num_clusters": nclusters,
                "fit_time": fit_timer.elapsed,
                "membership_time": membership_timer.elapsed,
            }
            print(benchmark_payload)

            # Append immediately so finished runs survive a later failure.
            with open(outpath, "a") as fh:
                fh.write(json.dumps(benchmark_payload))
                fh.write("\n")

            # Short pause between runs.
            time.sleep(1)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment