Nested cross-validation implementation
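This script separates hyperparameter tuning from performance estimation. An inner 10-fold loop picks the best combination of C, class_weight and decision threshold for a scaled logistic regression, while an outer 10-fold loop scores each selected model on a held-out fold, giving an approximately unbiased estimate of generalisation performance.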
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import linear_model as lm, metrics, preprocessing
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline

# convert X and y to numpy arrays (as_matrix() is deprecated; use to_numpy())
X = X.to_numpy()
y = y.to_numpy()
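# StratifiedKFold.split yields positional index arrays, so X and y need to
# support integer-array indexing (hence the conversion above)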
# create stratified k-fold split generators for the inner and outer loops
outer_kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=12)
inner_kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=12)
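# shuffle=True with a fixed random_state makes the folds reproducible; the
# inner generator re-splits each outer training set, so validation folds never
# overlap with the corresponding outer test fold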
# set up the hyperparameter grid
Cs = 10.0 ** np.arange(-4, 3)
weights = [None, 'balanced']
thresholds = np.linspace(0.3, 0.7, 9)
grid = []
for c in Cs:
    for w in weights:
        for t in thresholds:
            grid.append({
                'C': c,
                'weight': w,
                'threshold': t
            })
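# 7 values of C x 2 class weights x 9 thresholds = 126 combinations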
# run hyperparameter tuning inside an outer cross-validation loop
cv_scores = []

# start the outer loop
for outer_train, outer_test in outer_kf.split(X, y):
    # apply the fold indices
    outer_train_set_X = X[outer_train, :]
    outer_test_set_X = X[outer_test, :]
    outer_train_set_y = y[outer_train]
    outer_test_set_y = y[outer_test]

    # start the grid loop
    grid_results = []  # stores the results for all combinations of hyperparameters
    for g in grid:
        # start the inner loop
        inner_cv_fb_score = []  # stores each fold's fbeta score for the inner cv loop
        inner_cv_recall = []
        inner_cv_precision = []
        for inner_train, validation in inner_kf.split(outer_train_set_X, outer_train_set_y):
            # apply the fold indices
            inner_train_set_X = outer_train_set_X[inner_train, :]
            inner_validation_set_X = outer_train_set_X[validation, :]
            inner_train_set_y = outer_train_set_y[inner_train]
            inner_validation_set_y = outer_train_set_y[validation]
            # fit the classifier with the C and class_weight hyperparameters
            clf = lm.LogisticRegression(C=g['C'], class_weight=g['weight'], random_state=123)
            clf = Pipeline([
                ('scaler', preprocessing.StandardScaler()),
                ('model', clf)
            ])
            clf.fit(inner_train_set_X, inner_train_set_y)

            # score the validation set at the candidate decision threshold
            predicted_prob = clf.predict_proba(inner_validation_set_X)[:, 1]
            preds = (predicted_prob >= g['threshold']).astype('int')
            fb_score = metrics.fbeta_score(inner_validation_set_y, preds, beta=2)
            recall = metrics.recall_score(inner_validation_set_y, preds)
            precision = metrics.precision_score(inner_validation_set_y, preds)
            inner_cv_fb_score.append(fb_score)
            inner_cv_recall.append(recall)
            inner_cv_precision.append(precision)
        # average the scores across the inner folds
        fb_score_avg = np.mean(inner_cv_fb_score)
        recall_score_avg = np.mean(inner_cv_recall)
        precision_score_avg = np.mean(inner_cv_precision)

        # log the average scores against the hyperparameters
        grid_results.append({
            'C': g['C'],
            'weight': g['weight'],
            'threshold': g['threshold'],
            'cv_fb_score': fb_score_avg,
            'cv_recall_score': recall_score_avg,
            'cv_precision_score': precision_score_avg
        })
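    # grid_results now holds one averaged score per hyperparameter combination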
    # select the combination with the highest mean inner-CV fbeta score
    best_model = max(grid_results, key=lambda item: item['cv_fb_score'])

    # refit the classifier on the full outer training set
    clf = lm.LogisticRegression(C=best_model['C'], class_weight=best_model['weight'], random_state=123)
    clf = Pipeline([
        ('scaler', preprocessing.StandardScaler()),
        ('model', clf)
    ])
    clf.fit(outer_train_set_X, outer_train_set_y)

    # score the outer test set
    predicted_prob = clf.predict_proba(outer_test_set_X)[:, 1]
    preds = (predicted_prob >= best_model['threshold']).astype('int')
    fb_score = metrics.fbeta_score(outer_test_set_y, preds, beta=2)
    recall = metrics.recall_score(outer_test_set_y, preds)
    precision = metrics.precision_score(outer_test_set_y, preds)
    cv_scores.append({
        'C': best_model['C'],
        'weight': best_model['weight'],
        'threshold': best_model['threshold'],
        'fb_score': fb_score,
        'recall_score': recall,
        'precision_score': precision
    })
# finally, average all the outer CV scores to get the nested CV estimate
estimated_fb_score = np.mean([x['fb_score'] for x in cv_scores])
estimated_recall_score = np.mean([x['recall_score'] for x in cv_scores])
estimated_precision_score = np.mean([x['precision_score'] for x in cv_scores])
print('Overall estimated fb_score =', estimated_fb_score)
print('Overall estimated recall_score =', estimated_recall_score)
print('Overall estimated precision_score =', estimated_precision_score)
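# note: the selected hyperparameters can differ across outer folds, so the
# averaged scores estimate the performance of the whole tuning procedure,
# not of one fixed model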
# the best model selected in the last outer fold
best_model
# create a DataFrame with the grid results (from the last outer fold)
results = pd.DataFrame(grid_results)

# visualise the effect of the hyperparameters with a heatmap
fig, (ax0, ax1) = plt.subplots(1, 2, figsize=(14, 5))
sns.heatmap(results.loc[results.weight != 'balanced', :]
            .pivot_table(index='C', columns='threshold', values='cv_fb_score', aggfunc='mean'),
            annot=True, ax=ax0)
ax0.set(title='class_weight=None')
sns.heatmap(results.loc[results.weight == 'balanced', :]
            .pivot_table(index='C', columns='threshold', values='cv_fb_score', aggfunc='mean'),
            annot=True, ax=ax1)
ax1.set(title='class_weight=balanced')
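# For comparison, a minimal sketch (not part of the original gist) of the same
# nested-CV idea using scikit-learn's built-in helpers. It tunes only C and
# class_weight at the default 0.5 threshold, since GridSearchCV has no notion
# of the custom decision threshold swept above.
from sklearn.model_selection import GridSearchCV, cross_val_score

pipe = Pipeline([
    ('scaler', preprocessing.StandardScaler()),
    ('model', lm.LogisticRegression(random_state=123))
])
param_grid = {
    'model__C': 10.0 ** np.arange(-4, 3),
    'model__class_weight': [None, 'balanced']
}
f2_scorer = metrics.make_scorer(metrics.fbeta_score, beta=2)
inner_search = GridSearchCV(pipe, param_grid, scoring=f2_scorer, cv=inner_kf)
nested_scores = cross_val_score(inner_search, X, y, scoring=f2_scorer, cv=outer_kf)
print('Nested CV fbeta(2) =', nested_scores.mean())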