|
import numpy as np |
|
import pandas as pd |
|
import seaborn as sns |
|
import threadpoolctl |
|
import subprocess |
|
|
|
from io import StringIO |
|
from time import perf_counter |
|
from matplotlib import pyplot as plt |
|
from sklearn.neighbors import NearestNeighbors |
|
|
|
from joblib import Memory |
|
# On-disk joblib cache rooted at ".tmp"; used below to memoize benchmark runs.
memory = Memory(location=".tmp", verbose=0)
|
|
|
# Short hash of the checked-out commit, embedded in the plot title so a figure
# can be traced back to the code that produced it.  Fall back to a placeholder
# instead of crashing at import time when git is not installed (OSError) or
# the script is run outside a git work tree (CalledProcessError).
try:
    commit = (
        subprocess
        .check_output(['git', 'rev-parse', '--short', 'HEAD'])
        .decode('ascii')
        .strip()
    )
except (OSError, subprocess.CalledProcessError):
    commit = "unknown"
|
|
|
@memory.cache
def execute_bench(
    n_train=100,
    n_test=100,
    n_features_list=None,
    n_threads_list=None,
):
    """Time brute-force ``radius_neighbors`` queries for several thread counts.

    For each feature count, a random training matrix and a random query
    matrix are drawn from a ``RandomState`` seeded with 0.  For each thread
    count, the thread pools are capped through threadpoolctl, an estimator is
    fitted with ``n_jobs`` set to the same value, and a single
    ``radius_neighbors`` call is timed with ``perf_counter``.

    Parameters
    ----------
    n_train : int
        Number of rows of the random training matrix.
    n_test : int
        Number of rows of the random query matrix.
    n_features_list : list of int, optional
        Feature counts to benchmark; a falsy value selects ``[50, 100, 500]``.
    n_threads_list : list of int, optional
        Thread counts to benchmark; a falsy value selects the powers of two
        from 1 to 128.

    Returns
    -------
    pandas.DataFrame
        One row per (n_features, n_threads) pair with the columns
        ``n_threads``, ``n_train``, ``n_test``, ``n_features``,
        ``mean_runtime`` and ``stderr_runtime``.  ``mean_runtime`` is the
        duration of the single timed query and ``stderr_runtime`` is
        always 0.
    """
    if not n_features_list:
        n_features_list = [50, 100, 500]
    if not n_threads_list:
        n_threads_list = [1, 2, 4, 8, 16, 32, 64, 128]

    rng = np.random.RandomState(0)
    controller = threadpoolctl.ThreadpoolController()
    records = []

    for n_features in n_features_list:
        # The same data is reused for every thread count of this feature count.
        X_train = rng.rand(n_train, n_features)
        X_test = rng.rand(n_test, n_features)

        for n_threads in n_threads_list:
            with controller.limit(limits=n_threads, user_api=None):
                estimator = NearestNeighbors(
                    radius=np.log(n_features),
                    algorithm='brute',
                    n_jobs=n_threads,
                )
                estimator.fit(X_train)

                # Only the query is timed; fitting is excluded.
                tic = perf_counter()
                estimator.radius_neighbors(X_test, return_distance=True)
                elapsed = perf_counter() - tic

            records.append(
                [n_threads, n_train, n_test, n_features, elapsed, 0]
            )

    return pd.DataFrame(
        records,
        columns=[
            "n_threads",
            "n_train",
            "n_test",
            "n_features",
            "mean_runtime",
            "stderr_runtime",
        ],
    )
|
|
|
|
|
def plot_results(df, save=False, n_train=None, n_test=None):
    """Plot the speed-up over the single-threaded run on log-log axes.

    One curve is drawn per distinct value of ``df["n_features"]``, next to a
    dashed "linear" reference line (speed-up equal to the thread count).

    Parameters
    ----------
    df : pandas.DataFrame
        Benchmark results with at least the columns ``n_threads``,
        ``n_features`` and ``mean_runtime``.
    save : bool
        If true, write the figure to
        ``speed_up_{n_train}_{n_test}_log.png`` and close it; otherwise
        display it with ``plt.show()``.
    n_train, n_test :
        Only used to label the title and the output file name.

    Raises
    ------
    ValueError
        If a feature count has no measurement for ``n_threads == 1``, since
        the speed-up is defined relative to that run.
    """
    # Colours used for the default feature counts; any other feature count
    # falls back to matplotlib's colour cycle.
    known_colors = {50: "yellow", 100: "blue", 500: "red"}

    fig, ax = plt.subplots(figsize=(30, 15))

    # `df` repeats every thread count once per feature count; deduplicate so
    # the reference line and the ticks are built from distinct values only.
    thread_counts = sorted(df["n_threads"].unique().tolist())

    ax.loglog(
        thread_counts,
        thread_counts,
        linestyle="--",
        color="black",
        label="linear",
        alpha=.5,
    )

    for n_features, df_p in df.groupby("n_features", sort=True):
        baseline = df_p.loc[df_p["n_threads"] == 1, "mean_runtime"]
        if baseline.empty:
            raise ValueError(
                f"no single-threaded measurement for n_features={n_features}"
            )
        # `.iloc[0]` rather than `float(series)`: calling float() on a
        # one-element Series is deprecated in recent pandas releases.
        speed_up = baseline.iloc[0] / df_p["mean_runtime"]

        curve_style = {"label": f"{n_features} features", "alpha": .5}
        if n_features in known_colors:
            curve_style["color"] = known_colors[n_features]
        ax.loglog(df_p["n_threads"], speed_up, **curve_style)

    ax.set(
        xlabel="Number of threads",
        ylabel="Speed-up",
        xticks=thread_counts,
        xticklabels=[str(i) for i in thread_counts],
        yticks=thread_counts,
        yticklabels=[f"×{i}" for i in thread_counts],
    )
    ax.legend()

    title = (
        "Scalability of sklearn.NearestNeighbors.radius_neighbors "
        f" - (method, n_train, n_test) = ('brute', {n_train}, {n_test})"
        f" - commit: {commit}"
    )
    ax.set_title(title)

    if save:
        fig.savefig(f'speed_up_{n_train}_{n_test}_log.png')
        # Release the figure so repeated calls do not accumulate open figures.
        plt.close(fig)
    else:
        plt.show()
|
|
|
|
|
if __name__ == "__main__":
    # Number of rows in the random training and query sets.
    n_train = 100_000
    n_test = 100_000

    # Run the benchmark (wrapped by the joblib cache), print the raw timings,
    # then write the speed-up figure to disk.
    df = execute_bench(n_train, n_test)

    print(df)
    plot_results(df, save=True, n_train=n_train, n_test=n_test)
With mimalloc:
Raw results
Without mimalloc:
Raw results