Skip to content

Instantly share code, notes, and snippets.

@Micky774
Last active May 26, 2022 19:20
Show Gist options
  • Save Micky774/0d0903d411efe88ad72ecb68305d2cd1 to your computer and use it in GitHub Desktop.
Benchmark file for `_assert_all_finite`
# %%
import numpy as np
import scipy.sparse as sp
def generate_data(
    n_samples,
    n_features,
    X_density=1,
    y_sparse=False,
    dtype=np.float64,
    random_state=None,
):
    """Generate a random feature matrix ``X`` and target vector ``y``.

    Parameters
    ----------
    n_samples : int
        Number of rows of ``X`` and number of entries of ``y``.
    n_features : int
        Number of columns of ``X``.
    X_density : float, default=1
        When strictly below 1, ``X`` is a random CSR matrix with this
        density; otherwise ``X`` is a dense array.
    y_sparse : bool, default=False
        When true, ``y`` is returned as a sparse column of shape
        ``(n_samples, 1)`` instead of a dense 1-D array.
    dtype : dtype, default=np.float64
        Data type of both ``X`` and ``y``.
    random_state : int or None, default=None
        Seed handed to ``np.random.RandomState``.

    Returns
    -------
    X : ndarray or sparse matrix of shape (n_samples, n_features)
    y : ndarray of shape (n_samples,) or sparse matrix of shape (n_samples, 1)
    """
    rng = np.random.RandomState(random_state)

    if X_density < 1:
        # Forward ``dtype`` so the sparse path honours it like the dense path
        # does (it used to be ignored, always yielding float64).
        X = sp.random(
            n_samples,
            n_features,
            density=X_density,
            format="csr",
            dtype=dtype,
            random_state=rng,
        )
    else:
        # Dense integer-valued floats in [0, 50].
        X = np.round(rng.rand(n_samples, n_features) * 50).astype(dtype)

    # rand() + 1 lies in [1, 2), so after rounding y only holds 1 or 2.
    y = np.round(rng.rand(n_samples) + 1).astype(dtype)

    if y_sparse:
        y = sp.csr_matrix(y)
        # A 1-D input becomes a single row; flip it into a column.
        if y.shape[0] == 1:
            y = y.T

    return X, y
def make_non_finite(X, p_inf=0, p_nan=0, random_state=None):
    """Overwrite a random fraction of ``X`` with ``inf`` and/or ``nan``.

    Parameters
    ----------
    X : ndarray
        Floating-point array to corrupt.
    p_inf : float, default=0
        Fraction of the entries to set to ``np.inf``.
    p_nan : float, default=0
        Fraction of the entries to set to ``np.nan``.
    random_state : int or None, default=None
        Seed handed to ``np.random.RandomState`` to pick the positions.

    Returns
    -------
    ndarray
        ``X`` itself, untouched, when ``p_inf + p_nan == 0``; otherwise the
        flattened (1-D) array with the selected entries replaced.

    Notes
    -----
    ``X.ravel()`` returns a view whenever it can, so the caller's array is
    normally modified in place as well.
    """
    rng = np.random.RandomState(random_state)
    p = p_inf + p_nan
    if p == 0:
        return X

    X = X.ravel()
    # Sample distinct flat positions; an int population is equivalent to
    # ``np.arange(X.size)``.
    indices = rng.choice(X.size, replace=False, size=int(X.size * p))
    # The first ``partition`` positions receive inf, the remainder nan.
    partition = int(indices.size * (p_inf / p))
    if p_inf > 0:
        X[indices[:partition]] = np.inf
    if p_nan > 0:
        X[indices[partition:]] = np.nan
    return X
def generate_non_finite_data(p_inf=0, p_nan=0, *args, **kwargs):
    """Generate a feature matrix and inject non-finite values into it.

    Parameters
    ----------
    p_inf : float, default=0
        Fraction of entries to set to ``np.inf``.
    p_nan : float, default=0
        Fraction of entries to set to ``np.nan``.
    *args, **kwargs
        Forwarded unchanged to ``generate_data``. A ``random_state`` keyword,
        when given, also seeds the choice of corrupted positions.

    Returns
    -------
    The value returned by ``make_non_finite`` for the generated ``X``.
    """
    X = generate_data(*args, **kwargs)[0]
    # ``random_state`` is optional for ``generate_data``; use ``.get`` so that
    # omitting it means "unseeded" here too instead of raising KeyError.
    return make_non_finite(X, p_inf, p_nan, kwargs.get("random_state"))
# %%
from functools import partial
from time import perf_counter
from statistics import mean, stdev
from itertools import product
import csv
from sklearn.utils.validation import _assert_all_finite
# Path prefix under which the per-branch benchmark CSV files are written and
# later read back for plotting.
results_path = 'local_artifacts/benchmarks/assert_all_finite/'
# Label of the code version being benchmarked; used as the CSV file stem.
branch = "main"
def __assert_all_finite(*args, **kwargs):
    """Run ``_assert_all_finite`` and discard any ``ValueError`` it raises.

    This lets the benchmark time the check on deliberately corrupted input
    without the resulting error aborting the run. Always returns ``None``.
    """
    try:
        _assert_all_finite(*args, **kwargs)
    except ValueError:
        # Expected for the non-finite benchmark inputs; nothing to do.
        pass
# Each entry is ``(function to time, data factory, parameter grid)``.
# The grid yields ``(p_inf, p_nan, dtype)`` tuples.
benchmark_config = [
    (
        __assert_all_finite,
        partial(generate_non_finite_data, n_samples=10_000, n_features=1_000),
        # Materialise the grid: a bare ``product`` iterator is exhausted after
        # one pass, so iterating the config a second time would silently
        # benchmark nothing.
        list(
            product(
                [0, 0.01],
                [0, 0.01],
                [np.dtype("float32"), np.dtype("float64")],
            )
        ),
    ),
]
from pathlib import Path

# Number of timed runs per parameter combination (``stdev`` needs at least 2).
N_REPEATS = 10

# Make sure the output directory exists; otherwise ``open(..., 'w')`` fails
# with FileNotFoundError on a fresh checkout.
Path(f'{results_path}{branch}.csv').parent.mkdir(parents=True, exist_ok=True)

with open(f'{results_path}{branch}.csv', 'w', newline='') as csvfile:
    writer = csv.DictWriter(
        csvfile,
        fieldnames=[
            "p_inf",
            "p_nan",
            "dtype",
            "n_repeat",
            "duration",
        ],
    )
    writer.writeheader()
    for func, make_data, items in benchmark_config:
        for p_inf, p_nan, dtype in items:
            time_results = []
            for n_repeat in range(N_REPEATS):
                # Data generation stays outside the timed region; the repeat
                # index doubles as the seed so each run is reproducible.
                X = make_data(
                    random_state=n_repeat, p_inf=p_inf, p_nan=p_nan, dtype=dtype
                )
                start = perf_counter()
                func(X)
                duration = perf_counter() - start
                time_results.append(duration)
                # One CSV row per individual run.
                writer.writerow(
                    {
                        "p_inf": p_inf,
                        "p_nan": p_nan,
                        "dtype": dtype.name,
                        "n_repeat": n_repeat,
                        "duration": duration,
                    }
                )
            # Console summary for this parameter combination.
            results_mean, results_stdev = mean(time_results), stdev(time_results)
            print(
                f" {p_inf=} {p_nan=} {dtype.name=}|"
                f" {results_mean:.3f} +/- {results_stdev:.3f}"
            )
# %%
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
# Font size used for all text in the figure.
plt.rc('font', size=12)

# Result files to compare; each name maps to ``{results_path}{name}.csv``.
# Edit this tuple to compare other result files (e.g. "PR_cdef").
_branches = ("main", "PR")
branches = {
    br: pd.read_csv(f'{results_path}{br}.csv')
    for br in _branches
}

# Stack all runs into one frame, tagging each row with its branch name.
df = pd.concat(
    [frame.assign(branch=name) for name, frame in branches.items()]
)

# One subplot per (p_inf, p_nan, dtype) combination.
group_by_attrs = ["p_inf", "p_nan", "dtype"]
grouped = list(df.groupby(group_by_attrs))
fig, axis = plt.subplots(2, 4, figsize=(14, 6), constrained_layout=True)

for (grouped_attrs, subset), ax in zip(grouped, axis.flat):
    sns.violinplot(x="branch", y="duration", data=subset, ax=ax)
    ax.set_title("|".join(map(str, grouped_attrs)))
    ax.set_xlabel("")

# Keep the y-axis label on the first column only.
for ax in axis[:, 1:].flat:
    ax.set_ylabel("")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment