Nested cross validation implementation
# imports assumed by the rest of the script
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import linear_model as lm, metrics, preprocessing
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline

# X (a feature DataFrame) and y (a binary target Series) are assumed to be defined already;
# convert them to numpy arrays so they can be indexed positionally by the fold indices
X = X.to_numpy()
y = y.to_numpy()
# create stratified k-fold split generators for inner and outer loops
outer_kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=12)
inner_kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=12)
# set up hyperparameter tuning
Cs = 10.0 ** np.arange(-4,3)
weights = [None, 'balanced']
thresholds = np.linspace(0.3, 0.7, 9)
grid = []
for c in Cs:
    for w in weights:
        for t in thresholds:
            grid.append({
                'C': c,
                'weight': w,
                'threshold': t
            })
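# note: an equivalent, more compact way to build the same grid would be scikit-learn's
# ParameterGrid helper (shown commented out as an optional alternative):
#
#   from sklearn.model_selection import ParameterGrid
#   grid = list(ParameterGrid({'C': Cs, 'weight': weights, 'threshold': thresholds}))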
# run hyperparameter tuning inside an outer cross-validation loop
cv_scores = []
# start the outer loop
for outer_train, outer_test in outer_kf.split(X, y):
    # apply indices
    outer_train_set_X = X[outer_train, :]
    outer_test_set_X = X[outer_test, :]
    outer_train_set_y = y[outer_train]
    outer_test_set_y = y[outer_test]
    # start the grid loop
    grid_results = []  # list to store the results for all combinations of hyperparameters
    for g in grid:
        # start the inner loop
        inner_cv_fb_score = []  # list to store each fb_score result for the inner cv loop
        inner_cv_recall = []
        inner_cv_precision = []
        for inner_train, validation in inner_kf.split(outer_train_set_X, outer_train_set_y):
            # apply indices
            inner_train_set_X = outer_train_set_X[inner_train, :]
            inner_validation_set_X = outer_train_set_X[validation, :]
            inner_train_set_y = outer_train_set_y[inner_train]
            inner_validation_set_y = outer_train_set_y[validation]
            # fit the classifier with the current C and class_weight hyperparameters
            clf = lm.LogisticRegression(C=g['C'], class_weight=g['weight'], random_state=123)
            clf = Pipeline([
                ('scaler', preprocessing.StandardScaler()),
                ('model', clf)
            ])
            clf.fit(inner_train_set_X, inner_train_set_y)
            # score the validation set at the current decision threshold
            predicted_prob = clf.predict_proba(inner_validation_set_X)[:, 1]
            preds = (predicted_prob >= g['threshold']).astype('int')
            fb_score = metrics.fbeta_score(inner_validation_set_y, preds, beta=2)
            recall = metrics.recall_score(inner_validation_set_y, preds)
            precision = metrics.precision_score(inner_validation_set_y, preds)
            inner_cv_fb_score.append(fb_score)
            inner_cv_recall.append(recall)
            inner_cv_precision.append(precision)
        # get the average score
        fb_score_avg = np.mean(inner_cv_fb_score)
        recall_score_avg = np.mean(inner_cv_recall)
        precision_score_avg = np.mean(inner_cv_precision)
        # now log the average scores against the hyperparameters
        grid_results.append({
            'C': g['C'],
            'weight': g['weight'],
            'threshold': g['threshold'],
            'cv_fb_score': fb_score_avg,
            'cv_recall_score': recall_score_avg,
            'cv_precision_score': precision_score_avg
        })
    # select the best hyperparameter combination by inner-CV fb_score
    best_model = max(grid_results, key=lambda item: item['cv_fb_score'])
    # refit the classifier on the full outer training set with the best hyperparameters
    clf = lm.LogisticRegression(C=best_model['C'], class_weight=best_model['weight'], random_state=123)
    clf = Pipeline([
        ('scaler', preprocessing.StandardScaler()),
        ('model', clf)
    ])
    clf.fit(outer_train_set_X, outer_train_set_y)
    # score the outer test set
    predicted_prob = clf.predict_proba(outer_test_set_X)[:, 1]
    preds = (predicted_prob >= best_model['threshold']).astype('int')
    fb_score = metrics.fbeta_score(outer_test_set_y, preds, beta=2)
    recall = metrics.recall_score(outer_test_set_y, preds)
    precision = metrics.precision_score(outer_test_set_y, preds)
    cv_scores.append({
        'C': best_model['C'],
        'weight': best_model['weight'],
        'threshold': best_model['threshold'],
        'fb_score': fb_score,
        'recall_score': recall,
        'precision_score': precision
    })
# finally, average the outer-CV scores to get the overall cross-validated estimates
estimated_fb_score = np.mean([x['fb_score'] for x in cv_scores])
estimated_recall_score = np.mean([x['recall_score'] for x in cv_scores])
estimated_precision_score = np.mean([x['precision_score'] for x in cv_scores])
print('Overall estimated fb_score =', estimated_fb_score)
print('Overall estimated recall_score =', estimated_recall_score)
print('Overall estimated precision_score =', estimated_precision_score)
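# optional: put the per-fold outer results in a DataFrame to see how stable the
# selected hyperparameters and scores are across the outer folds
outer_fold_results = pd.DataFrame(cv_scores)
print(outer_fold_results)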
# best hyperparameter combination selected in the last outer fold
best_model
# create a DataFrame with the grid results (from the last outer fold)
results = pd.DataFrame(grid_results)
# visualise the effect of the hyperparameters with a heatmap
fig, (ax0, ax1) = plt.subplots(1, 2, figsize=(14, 5))
sns.heatmap(
    results.loc[results.weight != 'balanced', :]
           .pivot_table(index='C', columns='threshold', values='cv_fb_score', aggfunc='mean'),
    annot=True, ax=ax0
)
ax0.set(title='class_weight=None')
sns.heatmap(
    results.loc[results.weight == 'balanced', :]
           .pivot_table(index='C', columns='threshold', values='cv_fb_score', aggfunc='mean'),
    annot=True, ax=ax1
)
ax1.set(title='class_weight=balanced')
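# for comparison: a minimal sketch of the same nested CV using scikit-learn's built-in
# helpers (GridSearchCV inside cross_val_score). this tunes only C and class_weight;
# the decision threshold stays at scikit-learn's default of 0.5, so it is not a
# drop-in replacement for the manual loops above.
from sklearn.model_selection import GridSearchCV, cross_val_score

pipe = Pipeline([
    ('scaler', preprocessing.StandardScaler()),
    ('model', lm.LogisticRegression(random_state=123))
])
param_grid = {'model__C': Cs, 'model__class_weight': weights}
f2_scorer = metrics.make_scorer(metrics.fbeta_score, beta=2)
inner_search = GridSearchCV(pipe, param_grid, scoring=f2_scorer, cv=inner_kf)
nested_scores = cross_val_score(inner_search, X, y, scoring=f2_scorer, cv=outer_kf)
print('Nested CV fb_score (GridSearchCV sketch) =', nested_scores.mean())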