Created
February 15, 2021 16:47
-
-
Save jjerphan/0cd9adf82fe795e66e5d492da0be85bb to your computer and use it in GitHub Desktop.
Benchmark for scikit-learn/scikit-learn#17169
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import gc | |
import itertools | |
import time | |
import numpy as np | |
import pandas as pd | |
import seaborn as sns | |
from matplotlib import pyplot as plt | |
from sklearn.datasets import make_classification | |
from sklearn.feature_selection import SelectKBest, abs_r_regression, f_regression | |
class Benchmark:
    """
    Utility to benchmark scorer functions on a synthetic classification dataset.

    Subclasses must define ``param_names`` (list of argument names) and
    ``params`` (list of value lists, one per name), and implement
    :meth:`to_benchmark`.

    :param n_samples: number of samples of the dataset to use for the benchmark
    :param n_features: number of features of the dataset
    :param n_trials: number of trials of each test for the benchmark
    """

    def __init__(self, n_samples=100000, n_features=10, n_trials=20):
        self.n_samples = n_samples
        self.n_features = n_features
        self.n_trials = n_trials
        # Fixed random_state so every run benchmarks the exact same data.
        self.X, self.y = make_classification(
            n_samples=self.n_samples,
            n_features=self.n_features,
            n_redundant=0,
            random_state=42,
        )

    def to_benchmark(self, *args, **kwargs):
        """Run the operation being timed; implemented by subclasses."""
        raise NotImplementedError

    def benchmark(self, func):
        """
        Return a proxy function to run benchmark
        on the original one.

        :param func: function to time
        :return: proxy callable returning ``(args, key_args, times)`` where
            ``times`` holds the per-trial durations in seconds
        """
        def proxy(*args, **key_args):
            times = []
            # Collect garbage up front so a pending collection does not
            # pollute the first trials.
            gc.collect()
            for _ in range(self.n_trials):
                # perf_counter is monotonic with the highest available
                # resolution; time.time can jump (e.g. NTP adjustments)
                # and is too coarse for short benchmark runs.
                t1 = time.perf_counter()
                _ = func(*args, **key_args)
                t2 = time.perf_counter()
                times.append(t2 - t1)
            mean = np.mean(times)
            std = np.std(times)
            med = np.median(times)
            print("{} trials: {:.4f} ± {:.4f} s (median = {:.4f} s)"
                  .format(self.n_trials, mean, std, med))
            return args, key_args, times
        return proxy

    def run(self, plot=True):
        """
        Benchmark :meth:`to_benchmark` over the cartesian product of params.

        :param plot: if True, save and show a box plot of the timings
        :return: list of ``(args, key_args, times)`` tuples, one per
            parameter combination
        """
        res = []
        for values in itertools.product(*self.params):
            args = dict(zip(self.param_names, values))
            print(f"Dataset-shape: ({self.n_samples}, {self.n_features})",
                  args, "\t", end=" ")
            res.append(self.benchmark(self.to_benchmark)(**args))
        if plot:
            self._plot_res(res)
        print()
        return res

    def _plot_res(self, res):
        """Draw, save (as PNG) and show a box plot of benchmark timings."""
        plt.figure(figsize=(21, 13))
        df = pd.DataFrame()
        for args, key_args, times in res:
            d = pd.DataFrame(dict(times=times))
            # One categorical x-label per parameter combination,
            # e.g. "scorer: <function ...>".
            key_args_string = "\n".join(list(
                map(lambda x: f"{x[0]}: {x[1]}", key_args.items())))
            d['args'] = key_args_string
            df = pd.concat([df, d])
        ax = sns.boxplot(x="args", y="times", data=df, whis=0.4)
        ax.set_ylabel("Execution time (in sec.)")
        ax.xaxis.set_tick_params(rotation=45)
        # The subclass docstring doubles as the plot title.
        plt.title(f"{self.__class__.__doc__ }"
                  f" — dataset-shape: ({self.n_samples}, "
                  f"{self.n_features}) — {self.n_trials} trials")
        ax.yaxis.grid(True)
        plt.savefig(f"{self.__class__.__name__.lower()}_{self.n_samples}"
                    f"_{self.n_features}_{self.n_trials}.png")
        plt.show()
        plt.close()
class BenchmarkSelectKBest(Benchmark):
    """ f_regression vs r_regression for features selection"""

    # One benchmarked argument: the univariate scorer passed to SelectKBest.
    param_names = ["scorer"]
    params = [[f_regression, abs_r_regression]]

    def to_benchmark(self, scorer):
        # Time a complete fit + transform of the univariate selector
        # on the synthetic dataset built by the base class.
        selector = SelectKBest(scorer)
        selector.fit_transform(self.X, self.y)
if __name__ == "__main__":
    # Sweep every (n_samples, n_features) combination; plotting is skipped,
    # so timings are only reported on stdout.
    n_trials = 100
    for n_samples, n_features in itertools.product([10_000, 100_000],
                                                   [10, 100, 1000]):
        bench = BenchmarkSelectKBest(
            n_samples=n_samples,
            n_features=n_features,
            n_trials=n_trials,
        )
        bench.run(plot=False)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment