Grid Search Examples
import datetime
import pickle
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV
# meta params
UNSEEN_TEST_SIZE = .2
RANDOM_STATE = None
# -----------------------------------------------------------------------------
# set up test set
x_train, y_train, x_test, y_test = None, None, None, None
train_idx, test_idx = None, None
results = None
def split_train_test(x, y):
    global x_train, y_train, x_test, y_test, train_idx, test_idx
    splitter = StratifiedShuffleSplit(
        n_splits=1, test_size=UNSEEN_TEST_SIZE,
        random_state=RANDOM_STATE
    )
    for train_idx, test_idx in splitter.split(x, y):
        x_train, y_train = x.iloc[train_idx], y[train_idx]
        x_test, y_test = x.iloc[test_idx], y[test_idx]
    return x_train, y_train, x_test, y_test, train_idx, test_idx
# -----------------------------------------------------------------------------
def make_cvgen():
    return StratifiedShuffleSplit(
        n_splits=30, train_size=.84, random_state=RANDOM_STATE
    )  # can fix a random state

def grid_search(estimator, param_grid):
    cross_generator = make_cvgen()
    return GridSearchCV(
        estimator=estimator, param_grid=param_grid, cv=cross_generator,
        n_jobs=3, scoring=['roc_auc', 'accuracy', 'f1', 'precision', 'recall'],
        refit='roc_auc', return_train_score=True,
        verbose=9
    )
# -----------------------------------------------------------------------------
pkl_save_name = None
def make_results_d(x):
    global results, pkl_save_name, nau
    now = datetime.datetime.now()
    nau = str(now)
    results = {
        'meta': {
            'datetime': nau,
            'test_split': UNSEEN_TEST_SIZE,
            'train_idx': train_idx,
            'test_idx': test_idx,
            'cv_gen': make_cvgen(),
            'input_features': [col for col in x.columns],
            'random_state': RANDOM_STATE
        },
        'refit': {}
    }
    pkl_save_name = '../results/{} x{} s{}.pkl'\
        .format(nau[:16], len(results['meta']['input_features']), RANDOM_STATE)\
        .replace(' ', '_')

def save_pickle():
    with open(pkl_save_name, 'wb') as f:
        pickle.dump(results, f)
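
# Hedged usage sketch, not part of the original gist: it shows how the pieces
# above appear to be meant to fit together once data is available. `x` (a
# feature DataFrame) and `y` (a label array) are assumed inputs, and the
# function name is hypothetical; the gs_* helpers are defined further down.
def _example_single_model_run(x, y):
    split_train_test(x, y)               # populate the module-level splits
    make_results_d(x)                    # initialise `results` and the pickle path
    results['naive_bayes'] = gs_bayes()  # keep the fitted GridSearchCV under a model-name key
    save_pickle()                        # assumes a ../results/ directory exists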
# -----------------------------------------------------------------------------
from sklearn.naive_bayes import GaussianNB
def gs_bayes():
    print('naive_bayes')
    print(x_train is not None)  # sanity check that split_train_test() has run
    return grid_search(
        estimator=GaussianNB(),
        param_grid={
            'var_smoothing': [.05, .1, .15, .2, .25]
        }
    ).fit(x_train, y_train)
# -----------------------------------------------------------------------------
from sklearn.neighbors import KNeighborsClassifier
def gs_knn():
    print('knn')
    return grid_search(
        # metric set to 'minkowski' so that the p grid below takes effect
        # (p=2 is the Euclidean distance)
        estimator=KNeighborsClassifier(metric='minkowski'),
        param_grid={
            'n_neighbors': [4, 5, 6, 7, 8, 9, 10],
            'weights': ['uniform', 'distance'],
            'p': [1, 2, 3]  # exponent of the Minkowski distance
        }
    ).fit(x_train, y_train)
# -----------------------------------------------------------------------------
from sklearn.linear_model import LogisticRegression
def gs_logreg():
    print('logreg')
    return grid_search(
        # the elasticnet penalty requires the saga solver
        LogisticRegression(solver='saga', max_iter=1000, random_state=RANDOM_STATE),
        param_grid=[
            {
                'penalty': ['elasticnet'],
                'l1_ratio': np.arange(.0, 1.1, .1),  # i.e. the spectrum from l2 to l1
                'fit_intercept': [True, False],
                'class_weight': [None, 'balanced']
            },
            {
                'penalty': ['none'],
                'fit_intercept': [True, False],
                'class_weight': [None, 'balanced']
            }
        ]
    ).fit(x_train, y_train)
# -----------------------------------------------------------------------------
from sklearn.svm import NuSVC
def gs_svm():
    print('svm')
    common_params = {
        # 'C': [.01, 1.0, 10.0, 100.0, 1000.0],  # regularization (SVC only)
        'nu': np.round(np.arange(.1, 1.1, .1), 2),  # affects the number of support vectors; must lie in (0, 1]
        'shrinking': [True, False],
        'probability': [True, False],  # uses an internal 5-fold cross-validation
        'class_weight': [None, 'balanced'],
        # 'random_state': [None]
    }
    # check the resulting gammas from several runs
    gammas = {'gamma': ['scale', 'auto']}  # a custom range, e.g. list(np.arange(.1, 3.2, .5)), can be appended
    coef0s = {'coef0': np.arange(.0, 3.1, .5)}
    # https://scikit-learn.org/stable/auto_examples/svm/plot_rbf_parameters.html very cool
    return grid_search(
        NuSVC(random_state=RANDOM_STATE),
        param_grid=[
            # poly kernel
            {**{'kernel': ['poly'], 'degree': [2, 3, 4]}, **coef0s, **common_params, **gammas},
            # sigmoid kernel
            {**{'kernel': ['sigmoid']}, **coef0s, **common_params, **gammas},
            # rbf kernel
            {**{'kernel': ['rbf']}, **common_params, **gammas},
            # linear kernel
            {'kernel': ['linear']}
        ]
    ).fit(x_train, y_train)
# -----------------------------------------------------------------------------
from sklearn.neural_network import MLPClassifier
def gs_nn():
    print('nn')
    return grid_search(
        # the solver is fixed to lbfgs through the grid below
        estimator=MLPClassifier(verbose=1, max_iter=300, random_state=RANDOM_STATE),
        param_grid={
            'hidden_layer_sizes': [(100,), (50, 50), (25, 25, 25)],
            'alpha': [1e-4, 1e-3, 1e-2, 1e-1, 1.0, 1e1, 1e2],  # log-spaced L2 penalty
            'solver': ['lbfgs']
        }
    ).fit(x_train, y_train)
# -----------------------------------------------------------------------------
from sklearn.ensemble import RandomForestClassifier
def gs_rf():
    print('random forest')
    return grid_search(
        estimator=RandomForestClassifier(random_state=RANDOM_STATE),
        param_grid={
            'n_estimators': [175, 250, 325],
            'criterion': ['gini', 'entropy'],
            'min_samples_split': [2, 5, 7, 10],  # indirectly limits tree depth
            'max_features': ['sqrt', 'log2'],
            'min_impurity_decrease': [.0, .3],
            'class_weight': ['balanced', 'balanced_subsample'],
        }
    ).fit(x_train, y_train)
# -----------------------------------------------------------------------------
from sklearn.ensemble import GradientBoostingClassifier
def gs_gb():
    print('gradient boosting')
    return grid_search(
        estimator=GradientBoostingClassifier(random_state=RANDOM_STATE),
        param_grid={
            'loss': ['deviance', 'exponential'],  # exponential == AdaBoost
            'learning_rate': [0.1, 0.2],  # consistently ends up at 0.1
            'min_samples_split': np.linspace(0.1, 0.5, 2),  # consistently ends up at 0.1
            'min_samples_leaf': np.linspace(0.1, 0.5, 2),  # consistently ends up at 0.1
            # 'max_depth': [3, 5, 8],
            'max_features': ['log2', 'sqrt'],
            'criterion': ['friedman_mse'],
            'subsample': [0.3, 0.5, 0.8],  # consistently ends up at 0.8
            'n_estimators': [300, 400, 500]
        }
    ).fit(x_train, y_train)
# -----------------------------------------------------------------------------
from sklearn.ensemble import StackingClassifier
def stack_ensemble():
    # expects fitted GridSearchCV objects to have been stored in `results`
    # under their model-name keys (e.g. results['svm'] = gs_svm())
    best_estimators = {
        k: (k, v.best_estimator_) for k, v in results.items()
        if type(v) == GridSearchCV
    }
    estimators = [
        best_estimators['svm'],
        # best_estimators['nn'],
        # best_estimators['rf'],
        best_estimators['gb']
    ]
    return StackingClassifier(
        estimators, n_jobs=-1, verbose=9,
        # when final_estimator is None, a LogisticRegression is used as the final model
        final_estimator=LogisticRegression(max_iter=1000, random_state=RANDOM_STATE)
    ).fit(x_train, y_train)
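
# Hedged end-to-end sketch, not part of the original gist. stack_ensemble()
# looks up fitted GridSearchCV objects under the 'svm' and 'gb' keys of
# `results`, so a complete run would look roughly like the function below
# (the function name and the 'stack' key are hypothetical, and the SVM/GB
# grids are compute-heavy: 30 shuffle splits per candidate). Synthetic data
# from make_classification keeps the sketch self-contained.
def _example_full_run():
    from sklearn.datasets import make_classification
    x_arr, y_arr = make_classification(n_samples=300, n_features=10,
                                       random_state=RANDOM_STATE)
    x = pd.DataFrame(x_arr, columns=['f{}'.format(i) for i in range(x_arr.shape[1])])
    split_train_test(x, y_arr)
    make_results_d(x)
    results['svm'] = gs_svm()
    results['gb'] = gs_gb()
    results['stack'] = stack_ensemble()  # hypothetical key, mirroring the pattern above
    save_pickle()                        # assumes a ../results/ directory exists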