@Micky774
Created June 2, 2023 02:58
Benchmark `slsdm` distance metrics against scikit-learn's built-in metrics using `KNeighborsRegressor`
# %%
from pathlib import Path
results_path = 'local_artifacts/benchmarks/KNR/'
results_path += '/' if results_path[-1] != '/' else ''
Path(results_path).mkdir(parents=True, exist_ok=True)
results_path += "data.csv"
# %%
from slsdm import get_distance_metric
from sklearn.metrics._dist_metrics import DistanceMetric, DistanceMetric32
from sklearn.neighbors import KNeighborsRegressor
from statistics import mean, stdev
from time import perf_counter
from functools import partial
from itertools import product
import numpy as np
import csv
SKLEARN = "sklearn"
SLSDM = "slsdm"
METRIC = 'manhattan'
def _generate_PWD_data(n_samples_X, n_samples_Y, n_features, n_classes, n_outs=1, random_state=0):
    rng = np.random.RandomState(random_state)
    X = rng.randn(n_samples_X, n_features)
    Y = rng.randn(n_samples_Y, n_features)
    y_shape = (n_samples_X,) if n_outs == 1 else (n_samples_X, n_outs)
    y = rng.randint(n_classes, size=y_shape)
    return X, Y, y
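# Illustrative usage, not part of the benchmark itself: X is (n_samples_X, n_features),
# Y is (n_samples_Y, n_features), and y is (n_samples_X,) when n_outs == 1, e.g.
#   _X, _Y, _y = _generate_PWD_data(n_samples_X=4, n_samples_Y=3, n_features=2, n_classes=2)
#   assert _X.shape == (4, 2) and _Y.shape == (3, 2) and _y.shape == (4,)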
N_FEATURES = 500
benchmark_config = [
    (
        partial(_generate_PWD_data, n_features=N_FEATURES, n_classes=2),
        product(
            [5_000, 20_000],
            [5_000, 20_000],
            [np.float32, np.float64],
            [SKLEARN, SLSDM],
        ),
    ),
]
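# Each entry pairs a data generator with the cross-product of benchmark parameters;
# `product(...)` above yields (n_samples, n_samples_test, dtype, package) tuples such as
# (5_000, 5_000, np.float32, "sklearn"), i.e. 2 * 2 * 2 * 2 = 16 configurations in total.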
N_REPEATS = 20
with open(results_path, 'w', newline='') as csvfile:
    writer = csv.DictWriter(
        csvfile,
        fieldnames=[
            "n_samples",
            "n_samples_test",
            "dtype",
            "n_repeat",
            "duration",
            "package",
        ],
    )
    writer.writeheader()
    for make_data, items in benchmark_config:
        for n_samples, n_samples_test, dtype, package in items:
            time_results = []
            # Select the distance metric implementation under test for this dtype
            dist = {
                SLSDM: get_distance_metric(np.array([0], dtype=dtype), METRIC),
                SKLEARN: {
                    "float32": DistanceMetric32.get_metric(METRIC),
                    "float64": DistanceMetric.get_metric(METRIC),
                }[dtype.__name__],
            }[package]
            for n_repeat in range(N_REPEATS):
                X, Y, y = make_data(
                    n_samples_X=n_samples, n_samples_Y=n_samples_test, random_state=n_repeat
                )
                X = X.astype(dtype)
                Y = Y.astype(dtype)
                neigh = KNeighborsRegressor(n_neighbors=100, algorithm='brute', metric=dist)
                neigh.fit(X, y)
                # Time only the prediction step, which drives the pairwise distance computation
                start = perf_counter()
                neigh.predict(Y)
                duration = perf_counter() - start
                time_results.append(duration)
                writer.writerow(
                    {
                        "n_samples": n_samples,
                        "n_samples_test": n_samples_test,
                        "dtype": dtype.__name__,
                        "n_repeat": n_repeat,
                        "duration": duration,
                        "package": package,
                    }
                )
            results_mean, results_stdev = mean(time_results), stdev(time_results)
            print(
                f" {n_samples=}, {n_samples_test=}, dtype={dtype.__name__}, {package=} |"
                f" {results_mean:.3f} +/- {results_stdev:.3f}"
            )
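# Optional sanity check (illustrative sketch; relies on the pandas import from the
# plotting cell below): every (n_samples, n_samples_test, dtype, package) combination
# should have N_REPEATS rows in the CSV.
#   import pandas as pd
#   _df = pd.read_csv(results_path)
#   print(_df.groupby(["n_samples", "n_samples_test", "dtype", "package"]).size())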
# %%
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
plt.rc('font', size=12)
GRID_LAYOUT = (2, 4)
FIGURE_SIZE = (14, 9)
def _violin_perf(subset, ax, **kwargs):
    sns.violinplot(data=subset, y="duration", x="package", ax=ax)

def _rel_perf(subset, ax, default, **kwargs):
    # Speedup of each package relative to the mean duration of the `default` package
    base = subset.groupby("package")["duration"].mean()[default]
    subset["duration"] = base / subset["duration"]
    y_title = "speedup vs main"
    subset = subset.rename(columns={"duration": y_title})
    graph = sns.barplot(subset, x="package", y=y_title, errorbar='sd', ax=ax)
    graph.axhline(1, color="black")

def _abs_perf(subset, ax, **kwargs):
    # Absolute durations, with a reference line at the fastest package's mean
    base = subset.groupby("package")["duration"].mean().min()
    subset = subset.rename(columns={"duration": "time (sec)"})
    graph = sns.barplot(subset, x="package", y="time (sec)", errorbar='sd', ax=ax)
    graph.axhline(base, color="black")

def generic_chart(func, grouped, percentile_trim, packages, group_by_attrs, **kwargs):
    grouped_list = list(grouped)
    fig, axis = plt.subplots(*GRID_LAYOUT, figsize=FIGURE_SIZE, constrained_layout=True)
    fig.patch.set_facecolor('white')
    for (grouped_attrs, subset), ax in zip(grouped_list, axis.reshape(-1)):
        # Optionally trim outlier data
        if percentile_trim < 1:
            for package in packages:
                _subset = subset[subset["package"] == package]
                cut = _subset.duration < _subset.duration.quantile(percentile_trim)
                subset[subset["package"] == package] = _subset[cut]
        func(subset, ax, **kwargs)
        ax.set_title("\n".join([f"{k}={v}" for k, v in zip(group_by_attrs, grouped_attrs)]))
        ax.set_xlabel("")
    for ax in axis[:, 1:].ravel():
        ax.set_ylabel("")
    fig.suptitle(
        f"metric = '{METRIC}', n_features={N_FEATURES}\nsklearn (6aaf2aa) | slsdm (dd7566d)",
        fontsize=18,
    )
    plt.show()
# %%
percentile_trim = .9
df = pd.read_csv(results_path)
group_by_attrs = ["dtype", "n_samples", "n_samples_test"]
grouped = list(df.groupby(group_by_attrs))
grouped_cp = list(df.groupby(group_by_attrs))
default_args = dict(
    percentile_trim=percentile_trim,
    packages=(SKLEARN, SLSDM),
    group_by_attrs=group_by_attrs,
    default=SKLEARN,
)
# generic_chart(_violin_perf, df.groupby(group_by_attrs), **default_args)
generic_chart(_rel_perf, df.groupby(group_by_attrs), **default_args)
generic_chart(_abs_perf, df.groupby(group_by_attrs), **default_args)
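# Note on layout (an observation, not from the original gist): GRID_LAYOUT = (2, 4) gives
# 8 panels, matching the 2 dtypes x 2 n_samples x 2 n_samples_test = 8 groups produced by
# group_by_attrs; the zip() inside generic_chart would silently drop groups beyond the grid:
#   assert len(grouped) <= GRID_LAYOUT[0] * GRID_LAYOUT[1]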