Skip to content

Instantly share code, notes, and snippets.

@jjerphan
Created February 15, 2021 16:47
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jjerphan/0cd9adf82fe795e66e5d492da0be85bb to your computer and use it in GitHub Desktop.
Save jjerphan/0cd9adf82fe795e66e5d492da0be85bb to your computer and use it in GitHub Desktop.
import gc
import itertools
import time
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectKBest, abs_r_regression, f_regression
class Benchmark:
    """
    Utility to benchmark a callable over a grid of parameters.

    Subclasses are expected to define two class attributes and one method:

    * ``param_names``: list of keyword names passed to ``to_benchmark``;
    * ``params``: list of lists, the candidate values for each name (the
      cartesian product of those lists is benchmarked);
    * ``to_benchmark(**kwargs)``: the code to time.

    :param n_samples: number of samples of the dataset to use for the benchmark
    :param n_features: number of features of the dataset to use for the
        benchmark
    :param n_trials: number of trials of each test for the benchmark
    """

    def __init__(self, n_samples=100000, n_features=10, n_trials=20):
        self.n_samples = n_samples
        self.n_features = n_features
        self.n_trials = n_trials
        # The dataset is generated once here so that its creation is never
        # part of the timed section.
        self.X, self.y = make_classification(
            n_samples=self.n_samples,
            n_features=self.n_features,
            n_redundant=0,
            random_state=42,
        )

    def to_benchmark(self, *args, **kwargs):
        """Code to time; must be overridden by subclasses."""
        raise NotImplementedError

    def benchmark(self, func):
        """
        Return a proxy function to run benchmark
        on the original one.

        :param func: function to time
        :return: a callable accepting the same arguments as ``func``; it
            calls ``func`` ``self.n_trials`` times, prints summary
            statistics and returns ``(args, key_args, times)`` where
            ``times`` is the list of per-trial durations in seconds
        """
        def proxy(*args, **key_args):
            # Collect garbage once up front, before the trials start.
            gc.collect()
            times = []
            for _ in range(self.n_trials):
                # perf_counter is monotonic and high-resolution, unlike
                # time.time which follows the (adjustable) system clock.
                start = time.perf_counter()
                func(*args, **key_args)
                times.append(time.perf_counter() - start)

            mean = np.mean(times)
            std = np.std(times)
            med = np.median(times)
            print(f"{self.n_trials} trials: {mean:.4f} ± {std:.4f} s "
                  f"(median = {med:.4f} s)")
            return args, key_args, times

        return proxy

    def run(self, plot=True):
        """
        Benchmark ``to_benchmark`` on every combination of ``self.params``.

        :param plot: when true, draw, save and show a box-plot of the
            timings
        :return: list of ``(args, key_args, times)`` tuples, one per
            parameter combination, in ``itertools.product`` order
        """
        # The proxy does not depend on the parameters: build it once.
        timed = self.benchmark(self.to_benchmark)
        res = []
        for values in itertools.product(*self.params):
            args = dict(zip(self.param_names, values))
            # No newline here: the proxy prints the statistics on the
            # same line.
            print(f"Dataset-shape: ({self.n_samples}, {self.n_features})",
                  args, "\t", end=" ")
            res.append(timed(**args))

        if plot:
            self._plot_res(res)
        print()
        return res

    def _plot_res(self, res):
        """
        Draw one box per parameter combination, save the figure as a PNG
        in the current directory and show it.

        :param res: list of ``(args, key_args, times)`` tuples as returned
            by ``run``
        """
        if not res:
            # Nothing was benchmarked, hence nothing to plot.
            return

        plt.figure(figsize=(21, 13))

        frames = []
        for _, key_args, times in res:
            frame = pd.DataFrame({"times": times})
            # One "name: value" line per keyword argument; used as the
            # category label on the x axis.
            frame["args"] = "\n".join(
                f"{name}: {value}" for name, value in key_args.items())
            frames.append(frame)
        # Concatenate once (instead of once per iteration) and give the
        # result a unique row index.
        df = pd.concat(frames, ignore_index=True)

        ax = sns.boxplot(x="args", y="times", data=df, whis=0.4)
        ax.set_ylabel("Execution time (in sec.)")
        ax.xaxis.set_tick_params(rotation=45)
        # The docstring of the concrete benchmark class is the title.
        plt.title(f"{self.__class__.__doc__}"
                  f" — dataset-shape: ({self.n_samples}, "
                  f"{self.n_features}) — {self.n_trials} trials")
        ax.yaxis.grid(True)
        plt.savefig(f"{self.__class__.__name__.lower()}_{self.n_samples}"
                    f"_{self.n_features}_{self.n_trials}.png")
        plt.show()
        plt.close()
class BenchmarkSelectKBest(Benchmark):
    """ f_regression vs r_regression for features selection"""
    # NOTE: the docstring above is also used verbatim as the plot title
    # (the base class reads ``__class__.__doc__``), so edit it with care.

    # A single benchmarked parameter: the scoring function handed to
    # SelectKBest. Each listed scorer gets its own series of trials.
    params = [[f_regression, abs_r_regression]]
    param_names = ["scorer"]

    def to_benchmark(self, scorer):
        # Timed section: build the selector with the given scorer and run
        # fit_transform on the dataset created in the constructor. The
        # transformed array is intentionally discarded.
        selector = SelectKBest(scorer)
        selector.fit_transform(self.X, self.y)
if __name__ == "__main__":
    # Run the SelectKBest benchmark on every (n_samples, n_features)
    # dataset shape, printing the timings only (no plots).
    n_trials = 100
    n_samples_set = [10_000, 100_000]
    n_features_set = [10, 100, 1000]

    # product() varies n_features fastest, i.e. the same order as a
    # features loop nested inside a samples loop.
    for n_samples, n_features in itertools.product(n_samples_set,
                                                   n_features_set):
        BenchmarkSelectKBest(
            n_samples=n_samples,
            n_features=n_features,
            n_trials=n_trials,
        ).run(plot=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment