Nested cross-validation implementation
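This script separates hyperparameter tuning from performance estimation. An inner 10-fold loop picks the best combination of C, class_weight and decision threshold for a scaled logistic regression, while an outer 10-fold loop scores each selected model on a held-out fold, giving an approximately unbiased estimate of generalisation performance.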
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import linear_model as lm, metrics, preprocessing
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline

# convert X and y to numpy arrays (as_matrix() is deprecated; use to_numpy())
X = X.to_numpy()
y = y.to_numpy()
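# StratifiedKFold.split yields positional index arrays, so X and y need to
# support integer-array indexing (hence the conversion above)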
# create stratified k-fold split generators for the inner and outer loops
outer_kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=12)
inner_kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=12)
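# shuffle=True with a fixed random_state makes the folds reproducible; the
# inner generator re-splits each outer training set, so validation folds never
# overlap with the corresponding outer test fold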
# set up the hyperparameter grid
Cs = 10.0 ** np.arange(-4, 3)
weights = [None, 'balanced']
thresholds = np.linspace(0.3, 0.7, 9)
grid = []
for c in Cs:
    for w in weights:
        for t in thresholds:
            grid.append({
                'C': c,
                'weight': w,
                'threshold': t
            })
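# 7 values of C x 2 class weights x 9 thresholds = 126 combinations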
# run hyperparameter tuning inside an outer cross-validation loop
cv_scores = []

# start the outer loop
for outer_train, outer_test in outer_kf.split(X, y):
    # apply the fold indices
    outer_train_set_X = X[outer_train, :]
    outer_test_set_X = X[outer_test, :]
    outer_train_set_y = y[outer_train]
    outer_test_set_y = y[outer_test]

    # start the grid loop
    grid_results = []  # stores the results for all combinations of hyperparameters
    for g in grid:
        # start the inner loop
        inner_cv_fb_score = []  # stores each fold's fbeta score for the inner cv loop
        inner_cv_recall = []
        inner_cv_precision = []
        for inner_train, validation in inner_kf.split(outer_train_set_X, outer_train_set_y):
            # apply the fold indices
            inner_train_set_X = outer_train_set_X[inner_train, :]
            inner_validation_set_X = outer_train_set_X[validation, :]
            inner_train_set_y = outer_train_set_y[inner_train]
            inner_validation_set_y = outer_train_set_y[validation]
            # fit the classifier with the C and class_weight hyperparameters
            clf = lm.LogisticRegression(C=g['C'], class_weight=g['weight'], random_state=123)
            clf = Pipeline([
                ('scaler', preprocessing.StandardScaler()),
                ('model', clf)
            ])
            clf.fit(inner_train_set_X, inner_train_set_y)

            # score the validation set at the candidate decision threshold
            predicted_prob = clf.predict_proba(inner_validation_set_X)[:, 1]
            preds = (predicted_prob >= g['threshold']).astype('int')
            fb_score = metrics.fbeta_score(inner_validation_set_y, preds, beta=2)
            recall = metrics.recall_score(inner_validation_set_y, preds)
            precision = metrics.precision_score(inner_validation_set_y, preds)
            inner_cv_fb_score.append(fb_score)
            inner_cv_recall.append(recall)
            inner_cv_precision.append(precision)
        # average the scores across the inner folds
        fb_score_avg = np.mean(inner_cv_fb_score)
        recall_score_avg = np.mean(inner_cv_recall)
        precision_score_avg = np.mean(inner_cv_precision)

        # log the average scores against the hyperparameters
        grid_results.append({
            'C': g['C'],
            'weight': g['weight'],
            'threshold': g['threshold'],
            'cv_fb_score': fb_score_avg,
            'cv_recall_score': recall_score_avg,
            'cv_precision_score': precision_score_avg
        })
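    # grid_results now holds one averaged score per hyperparameter combination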
    # select the combination with the highest mean inner-CV fbeta score
    best_model = max(grid_results, key=lambda item: item['cv_fb_score'])

    # refit the classifier on the full outer training set
    clf = lm.LogisticRegression(C=best_model['C'], class_weight=best_model['weight'], random_state=123)
    clf = Pipeline([
        ('scaler', preprocessing.StandardScaler()),
        ('model', clf)
    ])
    clf.fit(outer_train_set_X, outer_train_set_y)

    # score the outer test set
    predicted_prob = clf.predict_proba(outer_test_set_X)[:, 1]
    preds = (predicted_prob >= best_model['threshold']).astype('int')
    fb_score = metrics.fbeta_score(outer_test_set_y, preds, beta=2)
    recall = metrics.recall_score(outer_test_set_y, preds)
    precision = metrics.precision_score(outer_test_set_y, preds)
    cv_scores.append({
        'C': best_model['C'],
        'weight': best_model['weight'],
        'threshold': best_model['threshold'],
        'fb_score': fb_score,
        'recall_score': recall,
        'precision_score': precision
    })
# finally, average all the outer CV scores to get the nested CV estimate
estimated_fb_score = np.mean([x['fb_score'] for x in cv_scores])
estimated_recall_score = np.mean([x['recall_score'] for x in cv_scores])
estimated_precision_score = np.mean([x['precision_score'] for x in cv_scores])
print('Overall estimated fb_score =', estimated_fb_score)
print('Overall estimated recall_score =', estimated_recall_score)
print('Overall estimated precision_score =', estimated_precision_score)
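# note: the selected hyperparameters can differ across outer folds, so the
# averaged scores estimate the performance of the whole tuning procedure,
# not of one fixed model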
# the best model selected in the last outer fold
best_model
# create a DataFrame with the grid results (from the last outer fold)
results = pd.DataFrame(grid_results)

# visualise the effect of the hyperparameters with a heatmap
fig, (ax0, ax1) = plt.subplots(1, 2, figsize=(14, 5))
sns.heatmap(results.loc[results.weight != 'balanced', :]
            .pivot_table(index='C', columns='threshold', values='cv_fb_score', aggfunc='mean'),
            annot=True, ax=ax0)
ax0.set(title='class_weight=None')
sns.heatmap(results.loc[results.weight == 'balanced', :]
            .pivot_table(index='C', columns='threshold', values='cv_fb_score', aggfunc='mean'),
            annot=True, ax=ax1)
ax1.set(title='class_weight=balanced')
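# For comparison, a minimal sketch (not part of the original gist) of the same
# nested-CV idea using scikit-learn's built-in helpers. It tunes only C and
# class_weight at the default 0.5 threshold, since GridSearchCV has no notion
# of the custom decision threshold swept above.
from sklearn.model_selection import GridSearchCV, cross_val_score

pipe = Pipeline([
    ('scaler', preprocessing.StandardScaler()),
    ('model', lm.LogisticRegression(random_state=123))
])
param_grid = {
    'model__C': 10.0 ** np.arange(-4, 3),
    'model__class_weight': [None, 'balanced']
}
f2_scorer = metrics.make_scorer(metrics.fbeta_score, beta=2)
inner_search = GridSearchCV(pipe, param_grid, scoring=f2_scorer, cv=inner_kf)
nested_scores = cross_val_score(inner_search, X, y, scoring=f2_scorer, cv=outer_kf)
print('Nested CV fbeta(2) =', nested_scores.mean())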