Skip to content

Instantly share code, notes, and snippets.

@Micky774
Created May 24, 2022 13:07
Show Gist options
  • Save Micky774/d1197421bd6564c9e156c6e37587d781 to your computer and use it in GitHub Desktop.
Save Micky774/d1197421bd6564c9e156c6e37587d781 to your computer and use it in GitHub Desktop.
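Benchmark script for scikit-learn's `dump_svmlight_file`: it times the function on dense and sparse inputs and plots the PR branch against main.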
# %%
from time import time
import pandas as pd
def loop(func, params=None, num_trials=1):
    """Yield the elapsed time of each of ``num_trials`` calls to ``func``.

    Parameters
    ----------
    func : callable
        Function to time.
    params : dict, optional
        Keyword arguments forwarded to ``func`` on every call. ``None``
        (the default) means no keyword arguments.
    num_trials : int, default=1
        Number of timed calls.

    Yields
    ------
    float
        Duration, in seconds, of one call to ``func``.
    """
    # Imported locally because this cell only imports ``time.time``;
    # ``perf_counter`` is monotonic and better suited to measuring intervals.
    from time import perf_counter

    # A ``None`` default avoids a mutable dict shared between calls.
    kwargs = {} if params is None else params
    for _ in range(num_trials):
        start_time = perf_counter()
        func(**kwargs)
        yield perf_counter() - start_time
def populate_array(func, params=None, num_trials=1):
    """Time ``func`` repeatedly and return the durations as a NumPy array.

    Parameters
    ----------
    func : callable
        Function to time.
    params : dict, optional
        Keyword arguments forwarded to ``func``. ``None`` (the default)
        means no keyword arguments.
    num_trials : int, default=1
        Number of timed calls.

    Returns
    -------
    numpy.ndarray
        One duration per trial, as yielded by ``loop``.
    """
    # Hand ``loop`` a real dict: it unpacks ``params`` with ``**``.
    kwargs = {} if params is None else params
    return np.array(list(loop(func, kwargs, num_trials)))
def get_stats(data, data_func=None):
    """Return the mean, standard deviation and an optional extra statistic.

    Parameters
    ----------
    data : array-like
        Values to summarise. Any input accepted by ``np.mean`` and
        ``np.std`` works, not only objects exposing a ``.mean`` method.
    data_func : callable, optional
        When given, it is called with ``data`` and its result is returned
        as the third element.

    Returns
    -------
    tuple
        ``(mean, std, extra_stats)``, where ``std`` is ``np.std(data)`` with
        its default arguments and ``extra_stats`` is ``None`` when no
        ``data_func`` was supplied.
    """
    # Compare against ``None`` explicitly so a callable that happens to be
    # falsy is still invoked.
    extra_stats = data_func(data) if data_func is not None else None
    return np.mean(data), np.std(data), extra_stats
# %%
import numpy as np
import scipy.sparse as sp
def generate_data(n_samples, n_features, X_density=1, y_sparse=False, dtype=np.float64, random_state=None):
    """Generate a random ``(X, y)`` pair for benchmarking.

    Parameters
    ----------
    n_samples : int
        Number of rows of ``X`` and number of entries of ``y``.
    n_features : int
        Number of columns of ``X``.
    X_density : float, default=1
        When strictly below 1, ``X`` is a random CSR matrix of that density
        built with ``scipy.sparse.random``. Otherwise ``X`` is a dense array
        of uniform draws scaled by 50 and rounded.
    y_sparse : bool, default=False
        When true, ``y`` is returned as a sparse column instead of a 1-D
        array.
    dtype : data-type, default=np.float64
        Element type of ``X`` (dense or sparse) and of ``y``.
    random_state : int or None, default=None
        Seed passed to ``np.random.RandomState``.

    Returns
    -------
    X : numpy.ndarray or scipy sparse matrix of shape (n_samples, n_features)
    y : numpy.ndarray of shape (n_samples,) or sparse matrix
    """
    rng = np.random.RandomState(random_state)
    if X_density < 1:
        # Forward ``dtype`` so the sparse branch honours it like the dense one.
        X = sp.random(
            n_samples,
            n_features,
            format="csr",
            density=X_density,
            dtype=dtype,
            random_state=rng,
        )
    else:
        X = np.round(rng.rand(n_samples, n_features) * 50).astype(dtype)
    # Uniform draws in [0, 1) shifted by one and rounded give labels 1 or 2.
    y = np.round(rng.rand(n_samples) + 1).astype(dtype)
    if y_sparse:
        y = sp.csr_matrix(y)
        # A 1-D input becomes a single row; transpose it into a column.
        if y.shape[0] == 1:
            y = y.T
    return X, y
# %%
import csv
import os
from functools import partial
from itertools import product
from statistics import mean, stdev
from time import perf_counter

import numpy as np
from sklearn.datasets import dump_svmlight_file
# Directory receiving one CSV of raw timings per benchmarked branch.
results_path = 'local_artifacts/benchmarks/dump_svmlight/'
# Label of the branch being timed; it names the CSV written below. The
# plotting cell reads both `PR.csv` and `main.csv` from `results_path`.
branch = "main"
# One entry per benchmark: (function under test, data factory,
# iterable of (n_samples, X_sparse) cases, path the function dumps to).
benchmark_config = [
    (
        dump_svmlight_file,
        partial(generate_data, n_features=100),
        product(
            [100, 1000, 10000],
            [False, True],
        ),
        "local_artifacts/svmd",
    ),
]
# Number of timed calls per (n_samples, X_sparse) case.
N_REPEATS = 20

# Create the output directory up front so `open` does not fail on a fresh
# checkout where `local_artifacts/` does not exist yet.
os.makedirs(results_path, exist_ok=True)
with open(f'{results_path}{branch}.csv', 'w', newline='') as csvfile:
    writer = csv.DictWriter(
        csvfile,
        fieldnames=[
            "X_shape",
            "X_sparse",
            "n_repeat",
            "duration",
        ],
    )
    writer.writeheader()
    for func, make_data, items, dump_path in benchmark_config:
        # The dump target's parent directory must exist as well.
        os.makedirs(os.path.dirname(dump_path) or ".", exist_ok=True)
        for n_samples, X_sparse in items:
            time_results = []
            for n_repeat in range(N_REPEATS):
                # Fresh data per repeat, seeded by the repeat index so runs
                # on different branches see identical inputs.
                X, y = make_data(
                    n_samples=n_samples,
                    random_state=n_repeat,
                    X_density=.01 if X_sparse else 1,
                )
                # Record the shape of the generated data instead of
                # duplicating the factory's hard-coded feature count.
                X_shape = X.shape
                # Only the call under test is timed, not data generation.
                start = perf_counter()
                func(X, y, f=dump_path)
                duration = perf_counter() - start
                time_results.append(duration)
                writer.writerow(
                    {
                        "X_shape": str(X_shape),
                        "X_sparse": X_sparse,
                        "n_repeat": n_repeat,
                        "duration": duration,
                    }
                )
            results_mean, results_stdev = mean(time_results), stdev(time_results)
            print(
                f"{X_shape=} {X_sparse=} {n_samples=} |"
                f" {results_mean:.3f} +/- {results_stdev:.3f}"
            )
# %%
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
# Draw one violin plot of call durations per (X_sparse, X_shape) case,
# with the PR timings next to the main timings.
plt.rc('font', size=12)

# Load the raw timings written by the benchmark cell for each branch.
pr = pd.read_csv(f'{results_path}PR.csv')
main = pd.read_csv(f'{results_path}main.csv')

# Tag every row with its branch, then stack both tables.
labelled = [pr.assign(branch="pr"), main.assign(branch="main")]
df = pd.concat(labelled)
grouped = list(df.groupby(["X_sparse", "X_shape"]))

fig, axis = plt.subplots(2, 3, figsize=(14, 6), constrained_layout=True)

# Pair each group with one axes object, walking the grid row by row.
for ax, ((X_sparse, X_shape), subset) in zip(axis.ravel(), grouped):
    sns.violinplot(x="branch", y="duration", data=subset, ax=ax)
    ax.set_title(f"{X_shape} | {X_sparse=}")
    ax.set_xlabel("")

# Keep the y-axis label on the first column only.
for row in axis:
    for ax in row[1:]:
        ax.set_ylabel("")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment