Created
February 15, 2021 16:47
-
-
Save jjerphan/0cd9adf82fe795e66e5d492da0be85bb to your computer and use it in GitHub Desktop.
Benchmark for scikit-learn/scikit-learn#17169
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import gc | |
import itertools | |
import time | |
import numpy as np | |
import pandas as pd | |
import seaborn as sns | |
from matplotlib import pyplot as plt | |
from sklearn.datasets import make_classification | |
from sklearn.feature_selection import SelectKBest, abs_r_regression, f_regression | |
class Benchmark:
    """
    Utility to benchmark scorer functions on a synthetic classification dataset.

    Subclasses must define ``param_names`` (list of argument names) and
    ``params`` (list of value lists, one per name), and implement
    :meth:`to_benchmark`.

    :param n_samples: number of samples of the dataset to use for the benchmark
    :param n_features: number of features of the dataset
    :param n_trials: number of trials of each test for the benchmark
    """

    def __init__(self, n_samples=100000, n_features=10, n_trials=20):
        self.n_samples = n_samples
        self.n_features = n_features
        self.n_trials = n_trials
        # Fixed random_state so every run benchmarks the exact same data.
        self.X, self.y = make_classification(
            n_samples=self.n_samples,
            n_features=self.n_features,
            n_redundant=0,
            random_state=42,
        )

    def to_benchmark(self, *args, **kwargs):
        """Run the operation being timed; implemented by subclasses."""
        raise NotImplementedError

    def benchmark(self, func):
        """
        Return a proxy function to run benchmark
        on the original one.

        :param func: function to time
        :return: proxy callable returning ``(args, key_args, times)`` where
            ``times`` holds the per-trial durations in seconds
        """
        def proxy(*args, **key_args):
            times = []
            # Collect garbage up front so a pending collection does not
            # pollute the first trials.
            gc.collect()
            for _ in range(self.n_trials):
                # perf_counter is monotonic with the highest available
                # resolution; time.time can jump (e.g. NTP adjustments)
                # and is too coarse for short benchmark runs.
                t1 = time.perf_counter()
                _ = func(*args, **key_args)
                t2 = time.perf_counter()
                times.append(t2 - t1)
            mean = np.mean(times)
            std = np.std(times)
            med = np.median(times)
            print("{} trials: {:.4f} ± {:.4f} s (median = {:.4f} s)"
                  .format(self.n_trials, mean, std, med))
            return args, key_args, times
        return proxy

    def run(self, plot=True):
        """
        Benchmark :meth:`to_benchmark` over the cartesian product of params.

        :param plot: if True, save and show a box plot of the timings
        :return: list of ``(args, key_args, times)`` tuples, one per
            parameter combination
        """
        res = []
        for values in itertools.product(*self.params):
            args = dict(zip(self.param_names, values))
            print(f"Dataset-shape: ({self.n_samples}, {self.n_features})",
                  args, "\t", end=" ")
            res.append(self.benchmark(self.to_benchmark)(**args))
        if plot:
            self._plot_res(res)
        print()
        return res

    def _plot_res(self, res):
        """Draw, save (as PNG) and show a box plot of benchmark timings."""
        plt.figure(figsize=(21, 13))
        df = pd.DataFrame()
        for args, key_args, times in res:
            d = pd.DataFrame(dict(times=times))
            # One categorical x-label per parameter combination,
            # e.g. "scorer: <function ...>".
            key_args_string = "\n".join(list(
                map(lambda x: f"{x[0]}: {x[1]}", key_args.items())))
            d['args'] = key_args_string
            df = pd.concat([df, d])
        ax = sns.boxplot(x="args", y="times", data=df, whis=0.4)
        ax.set_ylabel("Execution time (in sec.)")
        ax.xaxis.set_tick_params(rotation=45)
        # The subclass docstring doubles as the plot title.
        plt.title(f"{self.__class__.__doc__ }"
                  f" — dataset-shape: ({self.n_samples}, "
                  f"{self.n_features}) — {self.n_trials} trials")
        ax.yaxis.grid(True)
        plt.savefig(f"{self.__class__.__name__.lower()}_{self.n_samples}"
                    f"_{self.n_features}_{self.n_trials}.png")
        plt.show()
        plt.close()
class BenchmarkSelectKBest(Benchmark):
    """ f_regression vs r_regression for features selection"""

    # One benchmarked argument: the univariate scorer passed to SelectKBest.
    param_names = ["scorer"]
    params = [[f_regression, abs_r_regression]]

    def to_benchmark(self, scorer):
        # Time a complete fit + transform of the univariate selector
        # on the synthetic dataset built by the base class.
        selector = SelectKBest(scorer)
        selector.fit_transform(self.X, self.y)
if __name__ == "__main__":
    # Sweep every (n_samples, n_features) combination; plotting is skipped,
    # so timings are only reported on stdout.
    n_trials = 100
    for n_samples, n_features in itertools.product([10_000, 100_000],
                                                   [10, 100, 1000]):
        bench = BenchmarkSelectKBest(
            n_samples=n_samples,
            n_features=n_features,
            n_trials=n_trials,
        )
        bench.run(plot=False)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment