beckernick/benchmark-membership-vectors.py Secret

## benchmark-membership-vectors.py
import time
import json
import os
from datetime import datetime

import numpy as np
import cuml
import hdbscan

class Timer:
    """Context manager that measures how long its ``with`` body takes.

    Usage::

        with Timer() as t:
            do_work()
        print(t.elapsed)

    Attributes (set while the manager is in use):
        tick: clock reading taken when the block is entered.
        tock: clock reading taken when the block is left.
        elapsed: ``tock - tick`` in seconds. Only available after the
            block has exited.
    """

    def __enter__(self):
        # perf_counter() is monotonic and high resolution, so the measured
        # interval cannot be skewed (or go negative) when the system wall
        # clock is adjusted mid-run, which time.time() does not guarantee.
        self.tick = time.perf_counter()
        return self

    def __exit__(self, *args, **kwargs):
        # The exception details passed by the ``with`` statement are ignored;
        # returning None lets any exception from the body propagate.
        self.tock = time.perf_counter()
        self.elapsed = self.tock - self.tick


# Warmup
# Fit cuML HDBSCAN once on a tiny 50x20 array before any timed run, so that
# one-time start-up cost is not attributed to the first benchmark
# (presumably GPU / library initialisation -- not visible from this file).
clusterer = cuml.cluster.hdbscan.HDBSCAN(
    prediction_data=True
)
clusterer.fit(np.arange(1000).reshape(50, 20))

# Params
MIN_SAMPLES = 50
MIN_CLUSTER_SIZE = 5
NFEATURES = [
    5,
]
BACKENDS = [
    "cuml",
    "hdbscan",
]
SIZES = [
    25000,
    50000,
    100000,
    200000,
    400000,
]

DATE_TAG = datetime.now().strftime("%Y-%m-%d")

# Results are written as JSON Lines, one record per (size, features, backend).
outpath = f"hdbscan-apmv-benchmark-results-{DATE_TAG}.jsonl"
# Start from a clean file: records are appended below, so a leftover file
# from an earlier run on the same day would otherwise be mixed in.
try:
    os.remove(outpath)
except FileNotFoundError:
    pass


for n in SIZES:
    for k in NFEATURES:
        # The input depends only on (n, k), so load it once and reuse it for
        # every backend instead of re-reading the file per backend.
        reduced_path = f"million_news_articles_embeddings_reduced_{n}_{k}.npy"
        reduced_data = np.load(reduced_path)

        for library in BACKENDS:
            if library == "cuml":
                backend = cuml.cluster.hdbscan
            else:
                backend = hdbscan

            # Time estimator construction + fit only.
            with Timer() as fit_timer:
                clusterer = backend.HDBSCAN(
                    min_samples=MIN_SAMPLES,
                    min_cluster_size=MIN_CLUSTER_SIZE,
                    metric='euclidean',
                    prediction_data=True
                )
                clusterer.fit(reduced_data)

            # Counted outside the timed region so it does not inflate
            # fit_time. HDBSCAN labels noise points as -1; that label is not
            # a cluster and is excluded from the count.
            distinct_labels = np.unique(clusterer.labels_)
            nclusters = int((distinct_labels != -1).sum())

            # Time the soft-clustering (membership vector) computation.
            with Timer() as membership_timer:
                soft_clusters = backend.all_points_membership_vectors(clusterer)

            benchmark_payload = {
                "backend": library,
                "nrows": n,
                # Recorded so rows stay distinguishable if NFEATURES grows.
                "nfeatures": k,
                "min_samples": MIN_SAMPLES,
                "min_cluster_size": MIN_CLUSTER_SIZE,
                "num_clusters": nclusters,
                "fit_time": fit_timer.elapsed,
                "membership_time": membership_timer.elapsed,
            }
            print(benchmark_payload)

            # Append after every run so finished results survive a later
            # failure.
            with open(outpath, "a") as fh:
                fh.write(json.dumps(benchmark_payload))
                fh.write("\n")

            time.sleep(1)
	import time
	import json
	import os
	from datetime import datetime

	import numpy as np
	import cuml
	import hdbscan

	class Timer:
	    """Context manager that measures how long its ``with`` body takes.

	    Usage::

	        with Timer() as t:
	            do_work()
	        print(t.elapsed)

	    Attributes (set while the manager is in use):
	        tick: clock reading taken when the block is entered.
	        tock: clock reading taken when the block is left.
	        elapsed: ``tock - tick`` in seconds. Only available after the
	            block has exited.
	    """

	    def __enter__(self):
	        # perf_counter() is monotonic and high resolution, so the measured
	        # interval cannot be skewed when the system wall clock is adjusted.
	        self.tick = time.perf_counter()
	        return self

	    def __exit__(self, *args, **kwargs):
	        # The exception details passed by the ``with`` statement are
	        # ignored; returning None lets any exception propagate.
	        self.tock = time.perf_counter()
	        self.elapsed = self.tock - self.tick


	# Warmup
	# Fit cuML HDBSCAN once on a tiny 50x20 array before any timed run, so that
	# one-time start-up cost is not attributed to the first benchmark
	# (presumably GPU / library initialisation -- not visible from this file).
	clusterer = cuml.cluster.hdbscan.HDBSCAN(
	    prediction_data=True
	)
	clusterer.fit(np.arange(1000).reshape(50, 20))

	# Params
	MIN_SAMPLES = 50
	MIN_CLUSTER_SIZE = 5
	NFEATURES = [
	    5,
	]
	BACKENDS = [
	    "cuml",
	    "hdbscan",
	]
	SIZES = [
	    25000,
	    50000,
	    100000,
	    200000,
	    400000,
	]

	DATE_TAG = datetime.now().strftime("%Y-%m-%d")

	# Results are written as JSON Lines, one record per (size, features, backend).
	outpath = f"hdbscan-apmv-benchmark-results-{DATE_TAG}.jsonl"
	# Start from a clean file: records are appended below, so a leftover file
	# from an earlier run on the same day would otherwise be mixed in.
	try:
	    os.remove(outpath)
	except FileNotFoundError:
	    pass


	for n in SIZES:
	    for k in NFEATURES:
	        # The input depends only on (n, k), so load it once and reuse it
	        # for every backend instead of re-reading the file per backend.
	        reduced_path = f"million_news_articles_embeddings_reduced_{n}_{k}.npy"
	        reduced_data = np.load(reduced_path)

	        for library in BACKENDS:
	            if library == "cuml":
	                backend = cuml.cluster.hdbscan
	            else:
	                backend = hdbscan

	            # Time estimator construction + fit only.
	            with Timer() as fit_timer:
	                clusterer = backend.HDBSCAN(
	                    min_samples=MIN_SAMPLES,
	                    min_cluster_size=MIN_CLUSTER_SIZE,
	                    metric='euclidean',
	                    prediction_data=True
	                )
	                clusterer.fit(reduced_data)

	            # Counted outside the timed region so it does not inflate
	            # fit_time. HDBSCAN labels noise points as -1; that label is
	            # not a cluster and is excluded from the count.
	            distinct_labels = np.unique(clusterer.labels_)
	            nclusters = int((distinct_labels != -1).sum())

	            # Time the soft-clustering (membership vector) computation.
	            with Timer() as membership_timer:
	                soft_clusters = backend.all_points_membership_vectors(clusterer)

	            benchmark_payload = {
	                "backend": library,
	                "nrows": n,
	                # Recorded so rows stay distinguishable if NFEATURES grows.
	                "nfeatures": k,
	                "min_samples": MIN_SAMPLES,
	                "min_cluster_size": MIN_CLUSTER_SIZE,
	                "num_clusters": nclusters,
	                "fit_time": fit_timer.elapsed,
	                "membership_time": membership_timer.elapsed,
	            }
	            print(benchmark_payload)

	            # Append after every run so finished results survive a later
	            # failure.
	            with open(outpath, "a") as fh:
	                fh.write(json.dumps(benchmark_payload))
	                fh.write("\n")

	            time.sleep(1)