Skip to content

Instantly share code, notes, and snippets.

@Micky774
Created March 24, 2023 23:45
Show Gist options
  • Save Micky774/68798085b8fca7452d3975e484657330 to your computer and use it in GitHub Desktop.
Save Micky774/68798085b8fca7452d3975e484657330 to your computer and use it in GitHub Desktop.
csr_polynomial benchmark
import numpy as np
from scipy import sparse as sp
def generate_data(n_samples, n_features, n_classes=2, X_density=1, y_sparse=False, dtype=np.float64, random_state=None):
    """Generate a random feature matrix ``X`` and target vector ``y``.

    Parameters
    ----------
    n_samples : int
        Number of rows in ``X`` and number of entries in ``y``.
    n_features : int
        Number of columns in ``X``.
    n_classes : int, default=2
        Targets are integers drawn from ``[0, n_classes)``.
    X_density : float, default=1
        When strictly below 1, ``X`` is a CSR matrix built by
        ``scipy.sparse.random`` with this density; otherwise ``X`` is a dense
        array of uniform draws scaled by 50 and rounded.
    y_sparse : bool, default=False
        When true, ``y`` is wrapped in a sparse matrix and returned as a
        single column.
    dtype : data-type, default=np.float64
        Element type of both ``X`` and ``y``.
    random_state : int or None, default=None
        Seed handed to ``np.random.RandomState``.

    Returns
    -------
    X : ndarray or sparse CSR matrix of shape (n_samples, n_features)
    y : ndarray of shape (n_samples,), or sparse matrix when ``y_sparse``
    """
    rng = np.random.RandomState(random_state)
    if X_density < 1:
        # Forward dtype so the sparse branch honours it like the dense one;
        # previously the argument was silently ignored here.
        X = sp.random(
            n_samples,
            n_features,
            format="csr",
            density=X_density,
            dtype=dtype,
            random_state=rng,
        )
    else:
        X = np.round(rng.rand(n_samples, n_features) * 50).astype(dtype)
    # X is drawn before y so the random stream (and therefore the data for a
    # given seed) is consumed in a fixed order.
    y = rng.randint(n_classes, size=(n_samples,)).astype(dtype)
    if y_sparse:
        y = sp.csr_matrix(y)
        # A 1-D input comes back as a single row; flip it into a column.
        if y.shape[0] == 1:
            y = y.T
    return X, y
from functools import partial
from time import perf_counter
from statistics import mean, stdev
from itertools import product
import csv
from pathlib import Path
from sklearn.preprocessing import PolynomialFeatures
# Directory that receives one CSV of raw timings per benchmarked branch.
results_path = 'local_artifacts/benchmarks/csr_polynomial/'
Path(results_path).mkdir(parents=True, exist_ok=True)
# Label of the code version being timed; it becomes the CSV file name.
branch = "main"

# Each entry: (estimator class, data factory, iterable of
# (n_features, density) combinations to benchmark).
benchmark_config = [
    (
        PolynomialFeatures,
        partial(generate_data, n_samples=2_000),
        product(
            [50, 100, 175],
            [.1, .25, .5],
        ),
    ),
]
# Number of timed fit_transform calls per (n_features, density) combination.
N_REPEATS = 30

# Join with pathlib so the file lands in the right place even if the trailing
# slash is ever dropped from results_path.
with open(Path(results_path) / f'{branch}.csv', 'w', newline='') as csvfile:
    writer = csv.DictWriter(
        csvfile,
        fieldnames=[
            "n_features",
            "density",
            "n_repeat",
            "duration",
        ],
    )
    writer.writeheader()
    for Est, make_data, items in benchmark_config:
        for n_features, density in items:
            time_results = []
            for n_repeat in range(N_REPEATS):
                # The repeat index doubles as the seed, so every repeat sees
                # different data. Data generation stays outside the timed
                # region.
                X, _ = make_data(X_density=density, n_features=n_features, random_state=n_repeat)
                est = Est(degree=3)
                start = perf_counter()
                est.fit_transform(X)
                duration = perf_counter() - start
                time_results.append(duration)
                writer.writerow(
                    {
                        "n_features": n_features,
                        "density": density,
                        "n_repeat": n_repeat,
                        "duration": duration,
                    }
                )
            results_mean = mean(time_results)
            # statistics.stdev needs at least two samples; report zero spread
            # rather than crash if N_REPEATS is lowered to 1.
            results_stdev = stdev(time_results) if len(time_results) > 1 else 0.0
            print(
                f" {n_features=} {density=}|"
                f" {results_mean:.3f} +/- {results_stdev:.3f}"
            )
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
plt.rc('font', size=12)
results_path = 'local_artifacts/benchmarks/csr_polynomial/'
# Branch labels to compare; one '<label>.csv' per label is read from
# results_path.
_branches = ("main", "PR")
# Per-branch quantile above which timings are discarded as outliers; a value
# of 1 or more disables trimming.
percentile_trim = .95


def _trim_slowest(frame, branch_names, quantile):
    """Return the rows of ``frame`` whose duration is strictly below the
    given quantile of their own branch's durations.

    Builds a new frame instead of assigning into ``frame``, so no NaN-padded
    rows are produced and no chained-assignment warning is triggered.
    """
    kept = []
    for name in branch_names:
        rows = frame[frame["branch"] == name]
        kept.append(rows[rows.duration < rows.duration.quantile(quantile)])
    return pd.concat(kept)


branches = {br: pd.read_csv(f'{results_path}{br}.csv') for br in _branches}
df = pd.concat([branches[br].assign(branch=br) for br in _branches])
group_by_attrs = ["n_features", "density"]
grouped = list(df.groupby(group_by_attrs))
# One panel per (n_features, density) group. The 3x3 grid is fixed, and zip
# below stops at whichever of groups or axes runs out first.
fig, axis = plt.subplots(3, 3, figsize=(14, 9), constrained_layout=True)
fig.patch.set_facecolor('white')
for (grouped_attrs, subset), ax in zip(grouped, axis.reshape(-1)):
    # Optionally trim outlier data
    if percentile_trim < 1:
        subset = _trim_slowest(subset, _branches, percentile_trim)
    sns.violinplot(data=subset, y="duration", x="branch", ax=ax)
    ax.set_title("|".join([f"{k}={v}" for k, v in zip(group_by_attrs, grouped_attrs)]))
    ax.set_xlabel("")
# Only the left-most column keeps its y-axis label.
for ax in axis[:, 1:].ravel():
    ax.set_ylabel("")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment