-
-
Save Micah0808/6d9e4d0919c9f43dcb3e53d21f405c97 to your computer and use it in GitHub Desktop.
Demonstration of code-based data leakage for Translational Psychiatry.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# _*_ coding: utf-8 _*_ | |
""" | |
Translational Psychiatry: Data leakage demo | |
""" | |
__author__ = 'Micah Cearns' | |
__contact__ = 'micahcearns@gmail.com' | |
__date__ = 'April 2019' | |
# Data frame and arrays | |
import numpy as np | |
import pandas as pd | |
from sklearn.datasets import make_classification | |
# Pre-processing | |
from sklearn.feature_selection import SelectFromModel | |
from sklearn.preprocessing import StandardScaler | |
# Cross validation and hyperparameter optimisation | |
from sklearn.model_selection import cross_validate | |
# Model metrics | |
from sklearn.metrics.scorer import make_scorer | |
from sklearn.metrics import (precision_score, | |
f1_score, | |
recall_score) | |
# Classifiers | |
from sklearn.linear_model import LogisticRegression | |
from sklearn.svm import LinearSVC | |
from sklearn.pipeline import Pipeline | |
# Misc | |
from tempfile import mkdtemp | |
from shutil import rmtree | |
def scoring_dict(y):
    """
    Build the multi-metric scoring dictionary handed to cross_validate.

    :param y: Outcome vector; its unique values are passed as ``labels`` to
              the Sensitivity, Specificity, F1, PPV and NPV scorers
    :return: scoring - Dictionary mapping a metric name to either a scorer
                       name string or a scorer built with make_scorer
    """
    # The class labels are identical for every scorer, so compute them once
    classes = np.unique(y)

    # Insertion order is kept the same as the metric order reported later on
    scoring = {}
    scoring['AUC'] = 'roc_auc'
    scoring['Accuracy'] = 'accuracy'

    # Macro-averaged recall over the classes
    scoring['Balanced_accuracy'] = make_scorer(recall_score,
                                               pos_label=None,
                                               average='macro',
                                               sample_weight=None)

    # Recall of class 1 and recall of class 0 respectively
    scoring['Sensitivity'] = make_scorer(recall_score,
                                         pos_label=1,
                                         average='binary',
                                         sample_weight=None,
                                         labels=classes)
    scoring['Specificity'] = make_scorer(recall_score,
                                         pos_label=0,
                                         average='binary',
                                         sample_weight=None,
                                         labels=classes)

    scoring['F1'] = make_scorer(f1_score,
                                average='weighted',
                                labels=classes)

    # Precision of class 1 and precision of class 0 respectively
    scoring['PPV'] = make_scorer(precision_score,
                                 pos_label=1,
                                 average='binary',
                                 labels=classes)
    scoring['NPV'] = make_scorer(precision_score,
                                 pos_label=0,
                                 average='binary',
                                 labels=classes)
    return scoring
def train_test_scores(estimator_scores):
    """
    Summarise cross-validated estimator results per metric and partition

    :param estimator_scores: Dictionary of per-fold scores, keyed by
                             'fit_time', 'score_time' and one 'test_<name>' /
                             'train_<name>' entry per metric
    :return: train_results - Mean and standard deviation (in percent, rounded
                             to 2 decimals) of every 'train_' metric
             test_results - Same summary for every 'test_' metric
             scores_df - Per-fold scores with the timing columns removed
    """
    # Converting the dictionary of scores from cross_validate to a dataframe
    # and dropping the timing columns, which are not performance metrics
    scores_df = (pd
                 .DataFrame
                 .from_dict(estimator_scores)
                 .drop(['fit_time', 'score_time'], axis=1))

    # Getting mean scores and standard deviations across the cv folds,
    # expressed as percentages
    scores_mean = scores_df.mean() * 100
    scores_std = scores_df.std() * 100

    # Returning results as pandas dataframe; the 'Accuracy' column holds the
    # mean of whichever metric the row is labelled with
    results = np.round(
        pd.DataFrame({'Accuracy': scores_mean,
                      'Standard Deviation': scores_std}),
        decimals=2)

    # Sub-setting train and test results by their key prefix rather than by
    # fixed positions, so any number and ordering of metrics is handled
    test_rows = [name for name in results.index
                 if str(name).startswith('test_')]
    train_rows = [name for name in results.index
                  if str(name).startswith('train_')]
    test_results = results.loc[test_rows]
    train_results = results.loc[train_rows]

    return train_results, test_results, scores_df
def main():
    """
    Demonstrate data leakage from pre-processing done outside the cv folds

    Pure noise features are scored twice with a linear SVM under 10-fold
    cross-validation: first after scaling and L1 based feature selection were
    fitted on the complete data set, then with the same steps wrapped in a
    Pipeline that is passed to cross_validate. The test partition results of
    both runs are printed.

    :return: None - results are written to stdout
    """
    # ==============================================================
    # Leaking data through multiple transformations outside cv folds
    # ==============================================================
    # Setup
    print('Running main()')
    rand_state = 10

    # Generating noise features
    X = pd.DataFrame(data=np.random.randn(500, 3000))

    # Getting a y outcome, the generated feature matrix is not needed
    _, y = make_classification(n_samples=500,
                               n_features=3000,
                               n_informative=10,
                               n_redundant=2990)

    # Setting up X matrix and y vector to leak into. X is copied so that the
    # scaling below does not also modify the X used in the pipeline section
    X_leak = X.copy()
    y_leak = y

    # Defining scaler for standardization
    scaler = StandardScaler(with_mean=True, with_std=True)

    # Fitting, transforming, and retaining the feature names
    X_leak = pd.DataFrame(data=scaler.fit_transform(X_leak),
                          index=X_leak.index,
                          columns=X_leak.columns)

    # Logistic regression with l1 regularization / lasso for feature selection
    lr = LogisticRegression(penalty='l1', C=0.4, solver='liblinear')
    lr.fit(X=X_leak, y=y_leak)

    # Selecting vars from Lasso after regularization
    selector = SelectFromModel(estimator=lr, prefit=True)

    # Transforming the feature space
    X_leak = pd.DataFrame(data=selector.transform(X_leak))

    # And now training a linear SVM
    clf = LinearSVC(random_state=rand_state, max_iter=5000)

    # Returning the scoring dictionary
    scorer = scoring_dict(y=y_leak)

    # Here I am running a linear support vector machine with default parameters
    # on the selected features with 10-fold cross-validation
    leaked_scores = cross_validate(estimator=clf,
                                   X=X_leak,
                                   y=y_leak,
                                   scoring=scorer,
                                   cv=10,
                                   return_train_score=True)

    # Getting the final leaked scores, only the test partition is reported
    _, leaked_test_results, _ = train_test_scores(
        estimator_scores=leaked_scores)

    # ====================================================================
    # Now using a pipeline and completing transformation in the same folds
    # ====================================================================
    # Setting up new selector with no pre-fitting
    selector = SelectFromModel(estimator=lr)

    # Implementing pipeline to avoid leakage in transformations
    pipe_params = [('scale', scaler),
                   ('select', selector),
                   ('clf', clf)]

    # Cache directory to avoid repeat computation
    cachedir = mkdtemp()
    try:
        pipe = Pipeline(steps=pipe_params, memory=cachedir)

        # Re-running transformations and linear SVM inside sklearn pipeline
        scores = cross_validate(estimator=pipe,
                                X=X,
                                y=y,
                                scoring=scorer,
                                cv=10,
                                return_train_score=True)
    finally:
        # Clearing temp directory even if cross-validation fails
        rmtree(cachedir)

    # Getting the final non-leaked scores, only the test partition is reported
    _, pipe_test_results, _ = train_test_scores(estimator_scores=scores)

    # Leaked results first, then the results with pipeline architecture
    output = (leaked_test_results, '',
              pipe_test_results)
    print(*output, sep='\n')
if __name__ == '__main__':
    # main() prints its own results and returns None, so it is called
    # directly instead of printing its return value
    main()
# These results will bounce around a bit as new random data sets are being | |
# created each time, however, the general effects of leakage are consistently | |
# displayed | |
# ========================================================================= | |
# Results with leakage and transformations outside of pipeline in the first | |
# section of the main function. | |
# ========================================================================= | |
# Accuracy Standard Deviation | |
# test_AUC 99.89 0.30 | |
# test_Accuracy 99.40 0.96 | |
# test_Balanced_accuracy 99.41 0.95 | |
# test_Sensitivity 99.60 1.26 | |
# test_Specificity 99.22 1.65 | |
# test_F1 99.40 0.96 | |
# test_PPV 99.23 1.62 | |
# test_NPV 99.62 1.22 | |
# ============================================================================ | |
# Results with transformations done in the same CV folds through the use of a | |
# pipeline. No better than chance. | |
# ============================================================================ | |
# Accuracy Standard Deviation | |
# test_AUC 50.23 4.45 | |
# test_Accuracy 49.96 5.93 | |
# test_Balanced_accuracy 49.95 5.95 | |
# test_Sensitivity 45.92 9.95 | |
# test_Specificity 53.98 9.71 | |
# test_F1 49.59 6.06 | |
# test_PPV 49.48 6.24 | |
# test_NPV 50.34 5.96 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment