# Gist by @ogrisel (created 2019-08-28): comparing RandomizedSearchCV with the
# (then-experimental) HalvingRandomSearchCV on the Adult Census dataset.
from time import time
from pprint import pprint
import numpy as np
import pandas as pd
from scipy.stats import expon, randint, uniform
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
# HalvingRandomSearchCV is experimental: the enabling import must come first.
from sklearn.experimental import enable_halving_search_cv  # noqa
from sklearn.model_selection import HalvingRandomSearchCV
# Needed on scikit-learn < 1.0, where HistGradientBoostingClassifier was
# still experimental.
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingClassifier
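# Load the Adult Census dataset from OpenML. The "fnlwgt" column is a census
# sampling weight rather than a predictive feature, so it is dropped.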
df = pd.read_csv("https://www.openml.org/data/get_csv/1595261/adult-census.csv")
target_name = "class"
target = df[target_name].to_numpy()
data = df.drop(columns=target_name)
data = data.drop(columns="fnlwgt")
data_train, data_test, target_train, target_test = train_test_split(
    data, target, test_size=0.1, random_state=0)
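# Ordinal-encode the categorical columns (tree-based models do not need
# one-hot encoding). Passing category lists computed on the full dataset
# avoids unseen-category errors in the CV splits; the remaining numeric
# columns are passed through unchanged.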
categorical_columns = [
    'workclass', 'education', 'marital-status', 'occupation', 'relationship',
    'race', 'sex', 'native-country'
]
categories = [data[column].unique() for column in categorical_columns]
preprocessor = ColumnTransformer([
    ('categorical', OrdinalEncoder(categories=categories), categorical_columns),
], remainder="passthrough")
max_iter = 100
n_candidates = 100
n_jobs = 1
random_state = 0
cv = 5
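# Shared settings: max_iter caps the number of boosting iterations (and also
# serves as the resource budget for the first halving variant below);
# n_candidates hyperparameter combinations are sampled and evaluated with
# cv-fold cross-validation. The model chains the ordinal encoder with a
# histogram gradient boosting classifier.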
model = Pipeline(
    [('preprocessor', preprocessor),
     ('gbrt', HistGradientBoostingClassifier(max_iter=max_iter))]
)
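# Hyperparameter distributions to sample from; with plain lists, each
# candidate value is drawn uniformly at random.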
param_distributions = {
    'gbrt__learning_rate': [0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1., 2., 5.],
    'gbrt__l2_regularization': [0, 0.001, 0.01, 0.1, 1., 10., 100., 1e3],
    'gbrt__max_bins': [32, 64, 128, 255],
    'gbrt__max_leaf_nodes': [3, 5, 10, 30, 50, 100, 300, 500],
    'gbrt__min_samples_leaf': [1, 3, 5, 10, 30, 50, 100, 300]
}
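# The scipy.stats distributions imported above could be used instead of the
# discrete lists; a sketch (not the grid used in this gist):
#
# param_distributions = {
#     'gbrt__learning_rate': expon(scale=0.1),
#     'gbrt__l2_regularization': expon(scale=1.0),
#     'gbrt__max_bins': randint(32, 256),
#     'gbrt__max_leaf_nodes': randint(3, 501),
#     'gbrt__min_samples_leaf': randint(1, 301),
# }

# Three alternative search strategies follow. Each assignment overrides the
# previous one, so only the last search (halving over 'n_samples') is fitted;
# comment out the ones you do not want to run.
# Variant 1: plain randomized search, every candidate trained on the full
# budget.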
param_search = RandomizedSearchCV(
    model, param_distributions=param_distributions, n_iter=n_candidates, cv=cv,
    random_state=random_state, n_jobs=n_jobs, verbose=1
)
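# Variant 2: successive halving over the number of boosting iterations.
# Candidates start with 10 iterations and only the best 1/factor of them
# survive each round, up to the full budget of max_iter=100.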
param_search = HalvingRandomSearchCV(
    estimator=model,
    param_distributions=param_distributions,
    n_candidates=n_candidates,
    resource='gbrt__max_iter',
    min_resources=10,
    max_resources=max_iter,
    factor=10,  # released scikit-learn names the halving ratio "factor"
    cv=cv,
    random_state=random_state, n_jobs=n_jobs, verbose=1
)
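# Variant 3 (the one actually fitted, since it overrides the previous two):
# successive halving over the number of training samples instead of the
# number of boosting iterations.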
param_search = HalvingRandomSearchCV(
    estimator=model,
    param_distributions=param_distributions,
    n_candidates=n_candidates,
    resource='n_samples',
    min_resources=data_train.shape[0] // 10,
    max_resources=data_train.shape[0],
    factor=10,
    cv=cv,
    random_state=random_state, n_jobs=n_jobs, verbose=1
)
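# Fit the last-assigned search, time it, and report the held-out accuracy of
# the refitted best model.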
t0 = time()
param_search.fit(data_train, target_train)
search_duration = time() - t0
print(
    f"The accuracy score using a {param_search.__class__.__name__} is "
    f"{param_search.score(data_test, target_test):.4f} in "
    f"{search_duration:.3f}s"
)
print(f"The best set of parameters is: {param_search.best_params_} "
f"(CV score: {param_search.best_score_:.4f})")
search_results_df = pd.DataFrame(param_search.cv_results_)
if isinstance(param_search, HalvingRandomSearchCV):
    # Halving searches store one entry per (halving iteration, candidate)
    # pair, so sort by halving iteration first.
    search_results_df = search_results_df.sort_values(
        ["iter", "mean_test_score"], ascending=False)
else:
    search_results_df = search_results_df.sort_values(
        "mean_test_score", ascending=False)
for rank, (i, entry) in enumerate(search_results_df.iterrows()):
    if rank >= 5:
        break
    print(f"rank: {rank + 1}, score: {entry['mean_test_score']:.4f}")
    pprint(entry["params"])