@jjerphan
Created June 6, 2020 11:45
Benchmark for scikit-learn #17107 (https://github.com/scikit-learn/scikit-learn/pull/17107)
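
The pull request under benchmark adds an n_jobs parameter to
CalibratedClassifierCV so that the per-fold calibration fits can be dispatched
to parallel joblib workers. As a rough, minimal sketch of the call pattern
being timed (dataset size and n_jobs value are illustrative, and a
scikit-learn build that includes the pull request is assumed):

    from sklearn.calibration import CalibratedClassifierCV
    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression

    # Toy data; the benchmark below uses 100_000 samples and 10 or 100 features.
    X, y = make_classification(n_samples=1_000, n_features=10, random_state=0)

    # With n_jobs > 1 the cross-validated calibration fits run in parallel
    # instead of sequentially; n_jobs=4 is an arbitrary choice here.
    CalibratedClassifierCV(LogisticRegression(), n_jobs=4).fit(X, y)

The full benchmark script follows.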
import numpy as np
import itertools
import time
import seaborn as sns
import gc
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.calibration import CalibratedClassifierCV
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier


class CalibratedClassifierCVBenchmark(object):
    """
    Utilities to benchmark CalibratedClassifierCV.

    :param n_samples: number of samples of the dataset used for the benchmark
    :param n_features: number of features of the dataset used for the benchmark
    :param n_trials: number of trials of each test for the benchmark
    """

    def __init__(self, n_samples=100000, n_features=10, n_trials=20):
        self.n_samples = n_samples
        self.n_features = n_features
        self.n_trials = n_trials
        self.X, self.y = make_classification(
            n_samples=self.n_samples,
            n_features=self.n_features,
            n_redundant=0,
            random_state=42,
        )

    def to_benchmark(self, *args, **kwargs):
        # To be overridden by subclasses with the code to time.
        raise NotImplementedError

    def benchmark(self, func):
        """
        Return a proxy function that runs the benchmark
        on the original function.

        :param func: function to benchmark
        :return: proxy function returning the call arguments and the
            recorded execution times
        """
        def proxy(*args, **key_args):
            times = []
            gc.collect()
            for _ in range(self.n_trials):
                t1 = time.time()
                _ = func(*args, **key_args)
                t2 = time.time()
                times.append(t2 - t1)
            mean = np.mean(times)
            std = np.std(times)
            print("{} trials: {:.4f} ± {:.4f} s"
                  .format(self.n_trials, mean, std))
            print(times)
            return args, key_args, times

        return proxy

    def _run_benchmark(self):
        res = []
        for values in itertools.product(*self.params):
            args = dict(zip(self.param_names, values))
            print(self.__class__.__name__, args, end=" ")
            res.append(self.benchmark(self.to_benchmark)(**args))
            print()
        return res

    def _plot_res(self, benchmark_class, res):
        plt.figure(figsize=(21, 13))
        df = pd.DataFrame()
        for args, key_args, times in res:
            d = pd.DataFrame(dict(times=times))
            key_args_string = "\n".join(list(
                map(lambda x: f"{x[0]}: {x[1]}", key_args.items())))
            d['args'] = key_args_string
            df = pd.concat([df, d])
        ax = sns.boxplot(x="args", y="times", data=df, whis=0.4)
        ax.set_ylabel("Execution time (in sec.)")
        ax.xaxis.set_tick_params(rotation=45)
        plt.title(f"{benchmark_class.__doc__}"
                  f" — dataset-shape: ({self.n_samples}, "
                  f"{self.n_features}) — {self.n_trials} trials")
        ax.yaxis.grid(True)
        plt.savefig(f"{benchmark_class.__name__.lower()}_{self.n_samples}"
                    f"_{self.n_features}_{self.n_trials}.png")
        # plt.show()
        plt.close()

    def run_all(self):
        """
        Run all the benchmarks defined in classes
        extending CalibratedClassifierCVBenchmark.

        :return: None
        """
        for benchmark_class in self.__class__.__subclasses__():
            # Instantiate each benchmark with the same dataset sizes as this
            # instance so that the plot title and file name match the data
            # actually used (a default-constructed instance would always fall
            # back to 100000 samples and 10 features).
            res = benchmark_class(n_samples=self.n_samples,
                                  n_features=self.n_features,
                                  n_trials=self.n_trials)._run_benchmark()
            self._plot_res(benchmark_class, res)
            print()


class BenchmarkNJobsSingleThreadAlgo(CalibratedClassifierCVBenchmark):
    """ Time vs. single-threaded algo. and number of jobs."""

    ESTIMATORS = {
        "LogisticReg.": LogisticRegression(),
        "CART": DecisionTreeClassifier(),
    }

    params = [list(ESTIMATORS), [1, 2, 4, 8]]
    param_names = ["estimator_name", "n_jobs"]

    def to_benchmark(self, estimator_name, n_jobs):
        clf = self.ESTIMATORS[estimator_name]
        clf_c = CalibratedClassifierCV(clf, n_jobs=n_jobs)
        clf_c.fit(self.X, self.y)
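
# Sketch (not part of the original benchmark): run_all() discovers every
# subclass of CalibratedClassifierCVBenchmark, so adding an experiment only
# requires defining params, param_names and to_benchmark. A hypothetical
# variant timing the number of cross-validation folds could look like the
# class below; it is left commented out so it does not change what the
# script measures.
#
# class BenchmarkNCvFolds(CalibratedClassifierCVBenchmark):
#     """ Time vs. number of cross-validation folds."""
#
#     params = [[2, 3, 5, 10]]
#     param_names = ["cv"]
#
#     def to_benchmark(self, cv):
#         clf_c = CalibratedClassifierCV(LogisticRegression(), cv=cv)
#         clf_c.fit(self.X, self.y)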
if __name__ == "__main__":
n_trials = 20
n_features_set = [10, 100]
n_samples_set = [100_000]
for n_features in n_features_set:
for n_samples in n_samples_set:
CalibratedClassifierCVBenchmark(n_samples=n_samples,
n_features=n_features,
n_trials=n_trials).run_all()