Skip to content

Instantly share code, notes, and snippets.

@Micah0808
Last active October 30, 2020 05:09
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Micah0808/6d9e4d0919c9f43dcb3e53d21f405c97 to your computer and use it in GitHub Desktop.
Save Micah0808/6d9e4d0919c9f43dcb3e53d21f405c97 to your computer and use it in GitHub Desktop.
Demonstration of code-based data leakage for Translational Psychiatry.
#!/usr/bin/env python
# _*_ coding: utf-8 _*_
"""
Translational Psychiatry: Data leakage demo
"""
__author__ = 'Micah Cearns'
__contact__ = 'micahcearns@gmail.com'
__date__ = 'April 2019'
# Data frame and arrays
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
# Pre-processing
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler
# Cross validation and hyperparameter optimisation
from sklearn.model_selection import cross_validate
# Model metrics
from sklearn.metrics.scorer import make_scorer
from sklearn.metrics import (precision_score,
f1_score,
recall_score)
# Classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
# Misc
from tempfile import mkdtemp
from shutil import rmtree
def scoring_dict(y):
    """
    Build the multi-metric scoring dictionary handed to cross-validate.

    :param y: Outcome vector; its unique values are passed to the scorers
              as the ``labels`` argument
    :return: scoring - Dictionary mapping each metric name to a scorer
    """
    classes = np.unique(y)

    # Macro-averaged recall over the classes
    balanced_accuracy = make_scorer(recall_score,
                                    pos_label=None,
                                    average='macro',
                                    sample_weight=None)
    # Recall of class 1 and of class 0 respectively
    sensitivity = make_scorer(recall_score,
                              pos_label=1,
                              average='binary',
                              sample_weight=None,
                              labels=classes)
    specificity = make_scorer(recall_score,
                              pos_label=0,
                              average='binary',
                              sample_weight=None,
                              labels=classes)
    # Weighted F1 across the classes
    f1 = make_scorer(f1_score,
                     average='weighted',
                     labels=classes)
    # Precision of class 1 (PPV) and of class 0 (NPV)
    ppv = make_scorer(precision_score,
                      pos_label=1,
                      average='binary',
                      labels=classes)
    npv = make_scorer(precision_score,
                      pos_label=0,
                      average='binary',
                      labels=classes)

    # Keys are inserted in a fixed order so the reported metrics keep
    # their position in the results
    scoring = {}
    scoring['AUC'] = 'roc_auc'
    scoring['Accuracy'] = 'accuracy'
    scoring['Balanced_accuracy'] = balanced_accuracy
    scoring['Sensitivity'] = sensitivity
    scoring['Specificity'] = specificity
    scoring['F1'] = f1
    scoring['PPV'] = ppv
    scoring['NPV'] = npv
    return scoring
def train_test_scores(estimator_scores):
    """
    Summarise cross-validated scores into train and test result tables.

    :param estimator_scores: Dictionary of per-fold scores from
                             cross_validate; must contain the 'fit_time'
                             and 'score_time' entries
    :return: train_results - Mean and standard deviation (in percent,
                             rounded to 2 decimals) of each 'train_' score
             test_results - The same summary for each 'test_' score
             scores_df - Per-fold scores with the timing columns removed
    """
    # Converting the dictionary of scores from cross_validate to a dataframe
    # and dropping the timing columns, which are not performance metrics
    scores_df = (pd
                 .DataFrame
                 .from_dict(estimator_scores)
                 .drop(['fit_time', 'score_time'], axis=1))

    # Getting mean scores and standard deviations across folds, in percent.
    # The 'Accuracy' column holds the mean of whichever metric the row names.
    scores_mean = scores_df.mean() * 100
    scores_std = scores_df.std() * 100
    results = pd.DataFrame({'Accuracy': scores_mean,
                            'Standard Deviation': scores_std}).round(decimals=2)

    # Sub-setting by the 'test_' / 'train_' prefix rather than by fixed
    # positions, so any number of metrics (and runs without train scores)
    # are handled while the original row order is preserved
    row_names = [str(name) for name in results.index]
    is_test = np.array([name.startswith('test_') for name in row_names],
                       dtype=bool)
    is_train = np.array([name.startswith('train_') for name in row_names],
                        dtype=bool)
    test_results = results.loc[is_test]
    train_results = results.loc[is_train]

    return train_results, test_results, scores_df
def main():
    """
    Run the data leakage demonstration and print the test-fold results.

    A matrix of random noise features is paired with an outcome generated
    independently of it. The first section scales the features and selects
    them with a lasso on the full data set before cross-validation, which
    leaks information into the test folds. The second section performs the
    same steps inside a pipeline so they are re-fitted within each fold.

    :return: None - The leaked and the pipeline test results are printed
    """
    # ==============================================================
    # Leaking data through multiple transformations outside cv folds
    # ==============================================================
    # Setup
    print('Running main()')
    rand_state = 10

    # Generating noise features
    X = pd.DataFrame(data=np.random.randn(500, 3000))

    # Getting a y outcome, the generated feature matrix is not needed
    _, y = make_classification(n_samples=500,
                               n_features=3000,
                               n_informative=10,
                               n_redundant=2990)
    y_leak = y

    # Defining scaler for standardization
    scaler = StandardScaler(with_mean=True, with_std=True)

    # Fitting and transforming into a NEW frame, retaining the feature names.
    # X itself must stay untouched, otherwise the pipeline section below
    # would receive data already standardised on the full sample.
    X_leak = pd.DataFrame(data=scaler.fit_transform(X),
                          index=X.index,
                          columns=X.columns)

    # Logistic regression with l1 regularization / lasso for feature selection
    lr = LogisticRegression(penalty='l1', C=0.4, solver='liblinear')
    lr.fit(X=X_leak, y=y_leak)

    # Selecting vars from Lasso after regularization
    selector = SelectFromModel(estimator=lr, prefit=True)

    # Transforming the feature space
    X_leak = pd.DataFrame(data=selector.transform(X_leak))

    # And now training a Linear SVM
    clf = LinearSVC(random_state=rand_state, max_iter=5000)

    # Returning the scoring dictionary
    scorer = scoring_dict(y=y_leak)

    # Here I am running a linear support vector machine with default
    # parameters on the selected features with 10-fold cross-validation
    leaked_scores = cross_validate(estimator=clf,
                                   X=X_leak,
                                   y=y_leak,
                                   scoring=scorer,
                                   cv=10,
                                   return_train_score=True)

    # Getting the final leaked scores
    (leaked_train_results,
     leaked_test_results,
     leaked_scores_df) = train_test_scores(estimator_scores=leaked_scores)

    # ====================================================================
    # Now using a pipeline and completing transformation in the same folds
    # ====================================================================
    # Setting up new selector with no pre-fitting
    selector = SelectFromModel(estimator=lr)

    # Implementing pipeline to avoid leakage in transformations
    pipe_params = [('scale', scaler),
                   ('select', selector),
                   ('clf', clf)]

    # Cache directory to avoid repeat computation
    cachedir = mkdtemp()
    try:
        pipe = Pipeline(steps=pipe_params, memory=cachedir)

        # Re-running transformations and linear SVM inside sklearn pipeline
        scores = cross_validate(estimator=pipe,
                                X=X,
                                y=y,
                                scoring=scorer,
                                cv=10,
                                return_train_score=True)
    finally:
        # Clearing temp directory even if cross-validation fails
        rmtree(cachedir)

    # Getting the final non-leaked scores
    (pipe_train_results,
     pipe_test_results,
     pipe_scores_df) = train_test_scores(estimator_scores=scores)

    # Leaked results first, then the results with pipeline architecture
    output = (leaked_test_results, '',
              pipe_test_results)
    print(*output, sep='\n')
if __name__ == '__main__':
    # main() prints its own results and returns None, so it is called
    # directly; wrapping it in print() would emit a stray "None" line
    main()
# These results will bounce around a bit as new random data sets are being
# created each time, however, the general effects of leakage are consistently
# displayed
# =========================================================================
# Results with leakage and transformations outside of pipeline in the first
# section of the main function.
# =========================================================================
# Accuracy Standard Deviation
# test_AUC 99.89 0.30
# test_Accuracy 99.40 0.96
# test_Balanced_accuracy 99.41 0.95
# test_Sensitivity 99.60 1.26
# test_Specificity 99.22 1.65
# test_F1 99.40 0.96
# test_PPV 99.23 1.62
# test_NPV 99.62 1.22
# ============================================================================
# Results with transformations done in the same CV folds through the use of a
# pipeline. No better than chance.
# ============================================================================
# Accuracy Standard Deviation
# test_AUC 50.23 4.45
# test_Accuracy 49.96 5.93
# test_Balanced_accuracy 49.95 5.95
# test_Sensitivity 45.92 9.95
# test_Specificity 53.98 9.71
# test_F1 49.59 6.06
# test_PPV 49.48 6.24
# test_NPV 50.34 5.96
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment