Skip to content

Instantly share code, notes, and snippets.

@ogrisel
Last active December 4, 2020 16:48
Show Gist options
  • Save ogrisel/955234a13d9d6bb9bf35e2b2fe65e296 to your computer and use it in GitHub Desktop.
Save ogrisel/955234a13d9d6bb9bf35e2b2fe65e296 to your computer and use it in GitHub Desktop.
import os
os.environ["OMP_NUM_THREADS"] = "1" # avoid oversubscription
import pandas as pd
from distributed.client import performance_report
from time import perf_counter
from joblib import Memory, parallel_backend
from distributed import Client, LocalCluster
from sklearn.datasets import make_regression
from sklearn.experimental import enable_hist_gradient_boosting # noqa
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV
# Wrap the dataset generator with a joblib on-disk cache rooted at /tmp.
# The name is deliberately rebound so the rest of the script transparently
# calls the cached version.
_disk_cache = Memory(location="/tmp")
make_regression = _disk_cache.cache(make_regression)
# Candidate hyper-parameter values handed to the randomized search below.
# Each key is an estimator parameter name; each list holds the values that
# may be drawn for it.
param_grid = {
    "max_iter": [10, 30, 50, 100, 300, 500],
    "max_leaf_nodes": [3, 5, 7, 11, 31, 77, 151],
    "learning_rate": [0.01, 0.05, 0.1, 0.5, 1],
    "n_iter_no_change": [5, 10, 50, 100],
}
def run_grid_search():
    """Fit a randomized hyper-parameter search under joblib's "dask" backend.

    Generates a synthetic regression dataset (100_000 samples, 100 features)
    through the module-level ``make_regression`` and fits a
    ``RandomizedSearchCV`` over ``param_grid`` with 100 candidates and
    5-fold cross-validation.

    Returns:
        The value returned by ``search.fit``; the caller reads
        ``cv_results_`` from it.
    """
    features, target = make_regression(
        n_samples=int(1e5), n_features=100, random_state=0
    )

    # Building the search object does no fitting, so it can be prepared
    # before entering the backend context.
    search = RandomizedSearchCV(
        HistGradientBoostingRegressor(),
        param_grid,
        n_iter=100,
        cv=5,
        verbose=100,
        random_state=0,
    )

    # The fit itself must run inside the context so that joblib dispatches
    # its work through the "dask" backend.
    with parallel_backend("dask"):
        fitted = search.fit(features, target)
    return fitted
if __name__ == "__main__":
    # Manage the cluster and the client with context managers so the worker
    # processes and the scheduler are shut down even if the search raises.
    with LocalCluster(
        n_workers=8, processes=True, threads_per_worker=1
    ) as cluster, Client(cluster) as client:
        # Sanity check: read the variable inside a worker process rather than
        # in this driver process.
        print("OMP_NUM_THREADS on dask workers:",
              client.submit(lambda: os.environ["OMP_NUM_THREADS"]).result())

        # The report context must be nested inside the client context: it
        # needs a live client while it is active.
        with performance_report("dask_perf_report.html"):
            t0 = perf_counter()
            # Submit the whole search as a single task and block on its result.
            search_cv = client.submit(run_grid_search).result()
            print(f"duration={perf_counter() - t0:.3f}s")

    # The result has already been fetched into this process, so the summary
    # below does not need the cluster any more.
    cols = ["params", "mean_test_score"]
    df = pd.DataFrame(search_cv.cv_results_)[cols]
    print(df.sort_values("mean_test_score", ascending=False).head(10))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment