Last active
May 26, 2022 19:20
-
-
Save Micky774/0d0903d411efe88ad72ecb68305d2cd1 to your computer and use it in GitHub Desktop.
Benchmark file for `_assert_all_finite`
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# %% | |
import numpy as np | |
import scipy.sparse as sp | |
def generate_data(
    n_samples,
    n_features,
    X_density=1,
    y_sparse=False,
    dtype=np.float64,
    random_state=None,
):
    """Generate a random feature matrix ``X`` and target vector ``y``.

    Parameters
    ----------
    n_samples : int
        Number of rows of ``X`` and number of entries of ``y``.
    n_features : int
        Number of columns of ``X``.
    X_density : float, default=1
        When strictly below 1, ``X`` is a random CSR matrix with this
        density; otherwise ``X`` is a dense array of whole numbers obtained
        by rounding uniform draws scaled by 50.
    y_sparse : bool, default=False
        When true, ``y`` is wrapped in a sparse matrix and returned as a
        column of shape ``(n_samples, 1)``.
    dtype : data-type, default=np.float64
        Element type of both ``X`` and ``y``.
    random_state : int, array-like or None, default=None
        Seed handed to ``np.random.RandomState``.

    Returns
    -------
    X, y : tuple
        The generated feature matrix and target vector.
    """
    rng = np.random.RandomState(random_state)
    if X_density < 1:
        # Forward ``dtype`` so the sparse branch honours it just like the
        # dense branch does (it used to always produce float64 data).
        X = sp.random(
            n_samples,
            n_features,
            format="csr",
            density=X_density,
            dtype=dtype,
            random_state=rng,
        )
    else:
        X = np.round(rng.rand(n_samples, n_features) * 50).astype(dtype)

    # Rounding ``U[0, 1) + 1`` yields targets that are either 1 or 2.
    y = np.round(rng.rand(n_samples) + 1).astype(dtype)
    if y_sparse:
        y = sp.csr_matrix(y)
        # A 1-D array becomes a single sparse row; flip it into a column.
        if y.shape[0] == 1:
            y = y.T
    return X, y
def make_non_finite(X, p_inf=0, p_nan=0, random_state=None):
    """Overwrite a random share of the entries of ``X`` with inf and NaN.

    Parameters
    ----------
    X : ndarray
        Array to corrupt. It is flattened with ``ravel``; whenever that
        yields a view (rather than a copy) the caller's array is modified
        in place.
    p_inf : float, default=0
        Fraction of entries to set to ``np.inf``.
    p_nan : float, default=0
        Fraction of entries to set to ``np.nan``.
    random_state : int, array-like or None, default=None
        Seed handed to ``np.random.RandomState``.

    Returns
    -------
    ndarray
        ``X`` itself, untouched, when both fractions are zero; otherwise
        the flattened 1-D array holding the injected values.
    """
    rng = np.random.RandomState(random_state)
    corrupt_fraction = p_inf + p_nan
    if corrupt_fraction == 0:
        return X

    flat = X.ravel()
    n_corrupt = int(flat.size * corrupt_fraction)
    chosen = rng.choice(np.arange(flat.size), replace=False, size=n_corrupt)

    # The leading part of the sampled positions receives inf and the
    # remainder receives NaN, in proportion to the requested fractions.
    n_inf = int(chosen.size * (p_inf / corrupt_fraction))
    inf_positions, nan_positions = chosen[:n_inf], chosen[n_inf:]
    if p_inf > 0:
        flat[inf_positions] = np.inf
    if p_nan > 0:
        flat[nan_positions] = np.nan
    return flat
def generate_non_finite_data(p_inf=0, p_nan=0, *args, **kwargs):
    """Generate a feature matrix and inject inf/NaN values into it.

    Parameters
    ----------
    p_inf : float, default=0
        Fraction of entries to set to ``np.inf``.
    p_nan : float, default=0
        Fraction of entries to set to ``np.nan``.
    *args, **kwargs
        Forwarded unchanged to ``generate_data``; only the ``X`` it returns
        is kept. A ``random_state`` keyword, when given, also seeds the
        choice of corrupted positions.

    Returns
    -------
    ndarray
        Whatever ``make_non_finite`` returns for the generated ``X``.
    """
    X = generate_data(*args, **kwargs)[0]
    # ``random_state`` is optional for ``generate_data``, so look it up with
    # ``.get`` instead of indexing, which raised KeyError when it was omitted.
    return make_non_finite(X, p_inf, p_nan, kwargs.get("random_state"))
# %% | |
import csv
import os
from functools import partial
from itertools import product
from statistics import mean, stdev
from time import perf_counter

from sklearn.utils.validation import _assert_all_finite
# Label of the code variant being timed; it becomes the stem of the CSV file
# written by the benchmark below.
branch = "main"
# Prefix of every results file. It is combined with the file name by plain
# string formatting, so it has to end with a path separator.
results_path = "local_artifacts/benchmarks/assert_all_finite/"
def __assert_all_finite(*args, **kwargs):
    """Run ``_assert_all_finite`` and discard any ``ValueError`` it raises.

    All arguments are forwarded unchanged. The function always returns
    ``None``, so the benchmark can time the call without being interrupted
    by the exception.
    """
    try:
        _assert_all_finite(*args, **kwargs)
    except ValueError:
        # Deliberately ignored: only the time spent in the call matters here.
        pass
# One entry per benchmark: (function to time, data factory, parameter grid).
# Each grid item is a (p_inf, p_nan, dtype) triple. Note that ``product``
# returns a one-shot iterator, so the grid can only be walked once.
benchmark_config = [
    (
        __assert_all_finite,
        partial(generate_non_finite_data, n_samples=10_000, n_features=1_000),
        product(
            (0, 0.01),  # p_inf
            (0, 0.01),  # p_nan
            (np.dtype("float32"), np.dtype("float64")),  # dtype
        ),
    ),
]
# Number of timed runs per parameter combination.
N_REPEATS = 10

results_file = f'{results_path}{branch}.csv'
# Opening a file for writing does not create missing parent directories, so
# make sure the results directory exists before the benchmark starts.
results_dir = os.path.dirname(results_file)
if results_dir:
    os.makedirs(results_dir, exist_ok=True)

with open(results_file, 'w', newline='') as csvfile:
    writer = csv.DictWriter(
        csvfile,
        fieldnames=[
            "p_inf",
            "p_nan",
            "dtype",
            "n_repeat",
            "duration",
        ],
    )
    writer.writeheader()
    for func, make_data, items in benchmark_config:
        for p_inf, p_nan, dtype in items:
            time_results = []
            for n_repeat in range(N_REPEATS):
                # The repeat index doubles as the seed, so every repeat
                # uses different but reproducible data.
                X = make_data(
                    random_state=n_repeat, p_inf=p_inf, p_nan=p_nan, dtype=dtype
                )
                # Only the call under test is timed; data generation is not.
                start = perf_counter()
                func(X)
                duration = perf_counter() - start
                time_results.append(duration)
                # One CSV row per individual run.
                writer.writerow(
                    {
                        "p_inf": p_inf,
                        "p_nan": p_nan,
                        "dtype": dtype.name,
                        "n_repeat": n_repeat,
                        "duration": duration,
                    }
                )
            # Summary across the repeats of this parameter combination.
            results_mean, results_stdev = mean(time_results), stdev(time_results)
            print(
                f" {p_inf=} {p_nan=} {dtype.name=}|"
                f" {results_mean:.3f} +/- {results_stdev:.3f}"
            )
# %% | |
import matplotlib.pyplot as plt | |
import pandas as pd | |
import seaborn as sns | |
plt.rc("font", size=12)

# Names of the result files to compare; one CSV per name is expected under
# ``results_path``. Edit this tuple to compare other runs (e.g. "PR_cdef").
_branches = ("main", "PR")

# Load every run and stack them into one frame tagged with its branch name.
branches = {br: pd.read_csv(f"{results_path}{br}.csv") for br in _branches}
df = pd.concat([frame.assign(branch=br) for br, frame in branches.items()])

# One subplot per (p_inf, p_nan, dtype) combination.
group_by_attrs = ["p_inf", "p_nan", "dtype"]
grouped = list(df.groupby(group_by_attrs))

fig, axis = plt.subplots(2, 4, figsize=(14, 6), constrained_layout=True)
for (grouped_attrs, subset), ax in zip(grouped, axis.flat):
    sns.violinplot(data=subset, x="branch", y="duration", ax=ax)
    ax.set_title("|".join(map(str, grouped_attrs)))
    ax.set_xlabel("")

# Keep the y-axis label on the first column only.
for ax in axis[:, 1:].flat:
    ax.set_ylabel("")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment