Skip to content

Instantly share code, notes, and snippets.

@Micky774
Created May 1, 2022 22:35
Show Gist options
  • Save Micky774/086d4e6f03c5fc9647781db06078366a to your computer and use it in GitHub Desktop.
Save Micky774/086d4e6f03c5fc9647781db06078366a to your computer and use it in GitHub Desktop.
Benchmark file for the cythonized `dump_svmlight_file`
import os
from time import perf_counter
from time import time

import numpy as np
import pandas as pd
import scipy.sparse as sp
from sklearn.datasets import dump_svmlight_file
def loop(func, params=None, num_trials=1):
    """Yield the elapsed time, in seconds, of repeated calls to ``func``.

    Parameters
    ----------
    func : callable
        The function to benchmark.
    params : dict or None, default=None
        Keyword arguments forwarded to ``func`` on every call. ``None`` is
        treated as "no keyword arguments".
    num_trials : int, default=1
        Number of times ``func`` is called (and timed).

    Yields
    ------
    float
        Duration of one call to ``func``, one value per trial.
    """
    # Avoid a shared mutable default; None simply means "no kwargs".
    call_kwargs = {} if params is None else params
    for _ in range(num_trials):
        # perf_counter is monotonic and high-resolution, unlike the
        # wall-clock time(), which can jump when the system clock changes.
        start_time = perf_counter()
        func(**call_kwargs)
        yield perf_counter() - start_time
def populate_array(func, params={}, num_trials=1):
    """Time ``func`` over ``num_trials`` runs and return the durations.

    Parameters
    ----------
    func : callable
        The function to benchmark.
    params : dict
        Keyword arguments forwarded to ``func`` on each run.
    num_trials : int, default=1
        Number of timed runs.

    Returns
    -------
    numpy.ndarray
        One elapsed-time entry per trial, as produced by ``loop``.
    """
    # Exhaust the timing generator first, then convert in one step.
    durations = [*loop(func, params, num_trials)]
    return np.array(durations)
def get_stats(data, data_func=None):
    """Summarise an array of measurements.

    Parameters
    ----------
    data : numpy.ndarray
        The measurements to summarise.
    data_func : callable or None, default=None
        Optional extra statistic; when given (and truthy) it is called with
        ``data`` and its result is returned as the third tuple element.

    Returns
    -------
    tuple
        ``(mean, standard deviation, extra)`` where ``extra`` is the result
        of ``data_func(data)`` or ``None`` when no ``data_func`` was given.
    """
    # The optional statistic is evaluated before the built-in ones.
    extra = data_func(data) if data_func else None
    average = data.mean()
    spread = np.std(data)
    return average, spread, extra
def generate_data(
    n_samples, n_features, X_sparse=False, y_sparse=False, dtype=np.float64
):
    """Build a reproducible ``(X, y)`` pair for benchmarking.

    Parameters
    ----------
    n_samples : int
        Number of rows in ``X`` and entries in ``y``.
    n_features : int
        Number of columns in ``X``.
    X_sparse : bool, default=False
        When true, ``X`` is converted to a SciPy CSR matrix.
    y_sparse : bool, default=False
        When true, ``y`` is converted to a SciPy sparse matrix and transposed
        into a column if the conversion produced a single row.
    dtype : numpy dtype, default=np.float64
        dtype that both ``X`` and ``y`` are cast to.

    Returns
    -------
    tuple
        ``(X, y)``; ``X`` has shape ``(n_samples, n_features)``.
    """
    # A fixed seed keeps every benchmark run on identical data.
    random_state = np.random.RandomState(42)

    # Features: uniform draws scaled by 50 and rounded to whole numbers.
    features = np.round(random_state.rand(n_samples, n_features) * 50)
    features = features.astype(dtype)
    if X_sparse:
        features = sp.csr_matrix(features)

    # Targets: uniform draws shifted by 1 and rounded (values 1 or 2).
    targets = np.round(random_state.rand(n_samples,) + 1).astype(dtype)
    if y_sparse:
        targets = sp.csr_matrix(targets)
        # A 1-D input becomes a single row; flip it into a column.
        if targets.shape[0] == 1:
            targets = targets.T

    # NOTE: query ids (e.g. ``np.arange(n_samples) // 2``) are currently not
    # generated.
    return features, targets
# Benchmark driver: time ``dump_svmlight_file`` on dense data of increasing
# size and store the per-shape mean/std timings in a CSV file.
path = "local_artifacts/svmd_"

# Both the dumped svmlight file and the CSV report live in this directory;
# create it up front so a fresh checkout does not fail on the first write.
os.makedirs(os.path.dirname(path), exist_ok=True)

rows = []
for i in range(3):
    for j in range(2):
        # Shapes range over 10^2..10^4 samples by 10^2..10^3 features.
        X, y = generate_data(int(10 ** (i + 2)), int(10 ** (j + 2)))
        kwargs = {"X": X, "y": y, "f": path + "1"}
        # Seven timed runs per shape; the output file is overwritten each run.
        data = populate_array(dump_svmlight_file, kwargs, 7)
        mean, std, _ = get_stats(data)
        row = {
            "shape": str(X.shape),
            "branch": "main",
            "mean": mean,
            "std": std,
        }
        rows.append(row)

df = pd.DataFrame(rows)
df.to_csv("local_artifacts/svmlight_bench.csv", index_label=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment