Micah0808/leakage_demo_trans.py Secret

## leakage_demo_trans.py
#!/usr/bin/env python
# _*_ coding: utf-8 _*_

"""

Translational Psychiatry: Data leakage demo

"""

__author__ = 'Micah Cearns'
__contact__ = 'micahcearns@gmail.com'
__date__ = 'April 2019'

# Misc
from shutil import rmtree
from tempfile import mkdtemp

# Data frame and arrays
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification

# Pre-processing
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler

# Cross validation and hyperparameter optimisation
from sklearn.model_selection import cross_validate

# Model metrics
# make_scorer is part of the public sklearn.metrics API; the private
# sklearn.metrics.scorer module is not available in newer scikit-learn
# releases, so it is only kept as a fallback.
try:
    from sklearn.metrics import make_scorer
except ImportError:
    from sklearn.metrics.scorer import make_scorer
from sklearn.metrics import (precision_score,
                             f1_score,
                             recall_score)

# Classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline


def scoring_dict(y):
    """

    Build the multi-metric scoring dictionary handed to cross-validate

    :param y: Outcome vector; its unique values are passed to the recall,
              precision and F1 scorers as their `labels` argument

    :return: scoring - Dictionary mapping a metric name to either a scorer
                       string or a make_scorer object

    """
    classes = np.unique(y)

    # Keyword sets shared by the class-specific (binary) scorers
    recall_kwargs = dict(average='binary',
                         sample_weight=None,
                         labels=classes)
    precision_kwargs = dict(average='binary',
                            labels=classes)

    scoring = {}
    scoring['AUC'] = 'roc_auc'
    scoring['Accuracy'] = 'accuracy'
    # Balanced accuracy expressed as the macro-averaged recall
    scoring['Balanced_accuracy'] = make_scorer(recall_score,
                                               pos_label=None,
                                               average='macro',
                                               sample_weight=None)
    # Sensitivity / specificity: recall with class 1 / class 0 as positive
    scoring['Sensitivity'] = make_scorer(recall_score,
                                         pos_label=1,
                                         **recall_kwargs)
    scoring['Specificity'] = make_scorer(recall_score,
                                         pos_label=0,
                                         **recall_kwargs)
    scoring['F1'] = make_scorer(f1_score,
                                average='weighted',
                                labels=classes)
    # PPV / NPV: precision with class 1 / class 0 as positive
    scoring['PPV'] = make_scorer(precision_score,
                                 pos_label=1,
                                 **precision_kwargs)
    scoring['NPV'] = make_scorer(precision_score,
                                 pos_label=0,
                                 **precision_kwargs)

    return scoring


def train_test_scores(estimator_scores):
    """
    Scoring function for cross-validated estimator results

    Every score column is averaged over the folds and reported as a
    percentage together with its standard deviation, then the rows are split
    by partition.

    :param estimator_scores: Train and test scores from pipeline estimator,
                             i.e. the dictionary of scores from
                             cross_validate keyed 'test_<metric>' /
                             'train_<metric>' (timing entries 'fit_time' and
                             'score_time' are discarded when present)

    :return: train_results - Train partition results
             test_results - Test partition results
             scores_df - Per-fold scores with the timing columns removed
    """

    # Converting the dictionary of scores from cross_validate to a dataframe
    # and dropping the timing columns, which are not performance metrics
    scores_df = (pd
                 .DataFrame
                 .from_dict(estimator_scores)
                 .drop(['fit_time', 'score_time'], axis=1, errors='ignore'))

    # Mean and standard deviation over folds, as percentages. The 'Accuracy'
    # column holds the mean of whichever metric the row refers to; the name
    # is kept so the printed tables stay unchanged.
    results = pd.DataFrame({'Accuracy': scores_df.mean() * 100,
                            'Standard Deviation': scores_df.std() * 100})
    results = results.round(2)

    # Sub-setting train and test results by their key prefix rather than by
    # fixed positions, so any number of metrics (and runs without train
    # scores) are handled and row order is preserved
    test_rows = [name for name in results.index
                 if str(name).startswith('test_')]
    train_rows = [name for name in results.index
                  if str(name).startswith('train_')]
    test_results = results.loc[test_rows]
    train_results = results.loc[train_rows]

    return train_results, test_results, scores_df


def main():
    """

    Data leakage demonstration on pure-noise features

    Part one standardises the features and runs L1 (lasso) feature selection
    on the full data set before cross-validating a linear SVM on the selected
    features. Part two places the same scaler, selector and classifier in a
    pipeline and cross-validates the pipeline on the untouched features. The
    test partition results of both runs are printed.

    :return: None - the result tables are printed, not returned

    """
    # ==============================================================
    # Leaking data through multiple transformations outside cv folds
    # ==============================================================
    # Setup
    print('Running main()')
    rand_state = 10
    # Generating noise features
    X = pd.DataFrame(data=np.random.randn(500, 3000))

    # Getting a y outcome; the generated feature matrix is not needed
    _, y = make_classification(n_samples=500,
                               n_features=3000,
                               n_informative=10,
                               n_redundant=2990)
    y_leak = y

    # Defining scaler for standardization
    scaler = StandardScaler(with_mean=True, with_std=True)
    # Fitting and transforming into a NEW frame (feature names retained).
    # X itself must stay unscaled: it is reused by the pipeline section
    # below, and scaling it here would leak into that run as well.
    X_leak = pd.DataFrame(data=scaler.fit_transform(X),
                          index=X.index,
                          columns=X.columns)

    # Logistic regression with l1 regularization / lasso for feature selection
    lr = LogisticRegression(penalty='l1', C=0.4, solver='liblinear')
    lr.fit(X=X_leak, y=y_leak)

    # Selecting vars from Lasso after regularization
    selector = SelectFromModel(estimator=lr, prefit=True)
    # Transforming the feature space
    X_leak = pd.DataFrame(data=selector.transform(X_leak))
    # And now training a linear SVM
    clf = LinearSVC(random_state=rand_state, max_iter=5000)
    # Returning the scoring dictionary
    scorer = scoring_dict(y=y_leak)
    # Running the linear support vector machine on the selected features
    # with 10-fold cross-validation
    leaked_scores = cross_validate(estimator=clf,
                                   X=X_leak,
                                   y=y_leak,
                                   scoring=scorer,
                                   cv=10,
                                   return_train_score=True)

    # Getting the final leaked scores (only the test partition is reported)
    _, leaked_test_results, _ = train_test_scores(
        estimator_scores=leaked_scores)

    # ====================================================================
    # Now using a pipeline and completing transformation in the same folds
    # ====================================================================
    # Setting up new selector with no pre-fitting
    selector = SelectFromModel(estimator=lr)
    # Implementing pipeline to avoid leakage in transformations
    pipe_params = [('scale', scaler),
                   ('select', selector),
                   ('clf', clf)]
    # Cache directory to avoid repeat computation
    cachedir = mkdtemp()
    try:
        pipe = Pipeline(steps=pipe_params, memory=cachedir)
        # Re-running transformations and linear SVM inside sklearn pipeline
        scores = cross_validate(estimator=pipe,
                                X=X,
                                y=y,
                                scoring=scorer,
                                cv=10,
                                return_train_score=True)
    finally:
        # Clearing temp directory even if cross-validation fails
        rmtree(cachedir)

    # Getting the final non-leaked scores
    _, pipe_test_results, _ = train_test_scores(estimator_scores=scores)

    # Leaked results first, then the results with pipeline architecture
    output = (leaked_test_results, '',
              pipe_test_results)
    print(*output, sep='\n')


if __name__ == '__main__':
    # main() prints its own report and returns None, so wrapping the call in
    # print() would only append a stray "None" line to the output.
    main()

# These results will vary from run to run because a new random data set is
# generated each time; however, the general effect of leakage is consistently
# displayed.

# =========================================================================
# Results with leakage and transformations outside of pipeline in the first
# section of the main function.
# =========================================================================

#                         Accuracy  Standard Deviation
# test_AUC                   99.89                0.30
# test_Accuracy              99.40                0.96
# test_Balanced_accuracy     99.41                0.95
# test_Sensitivity           99.60                1.26
# test_Specificity           99.22                1.65
# test_F1                    99.40                0.96
# test_PPV                   99.23                1.62
# test_NPV                   99.62                1.22

# ============================================================================
# Results with transformations done in the same CV folds through the use of a
# pipeline. No better than chance.
# ============================================================================

#                         Accuracy  Standard Deviation
# test_AUC                   50.23                4.45
# test_Accuracy              49.96                5.93
# test_Balanced_accuracy     49.95                5.95
# test_Sensitivity           45.92                9.95
# test_Specificity           53.98                9.71
# test_F1                    49.59                6.06
# test_PPV                   49.48                6.24
# test_NPV                   50.34                5.96
	#!/usr/bin/env python
	# __ coding: utf-8 __

	"""

	Translational Psychiatry: Data leakage demo

	"""

	__author__ = 'Micah Cearns'
	__contact__ = 'micahcearns@gmail.com'
	__date__ = 'April 2019'

	# Data frame and arrays
	import numpy as np
	import pandas as pd
	from sklearn.datasets import make_classification

	# Pre-processing
	from sklearn.feature_selection import SelectFromModel
	from sklearn.preprocessing import StandardScaler

	# Cross validation and hyperparameter optimisation
	from sklearn.model_selection import cross_validate

	# Model metrics
	from sklearn.metrics.scorer import make_scorer
	from sklearn.metrics import (precision_score,
	f1_score,
	recall_score)

	# Classifiers
	from sklearn.linear_model import LogisticRegression
	from sklearn.svm import LinearSVC
	from sklearn.pipeline import Pipeline

	# Misc
	from tempfile import mkdtemp
	from shutil import rmtree


	def scoring_dict(y):
	    """

	    Setting up a multi-metric scoring dictionary for cross-validation

	    :param y: Outcome vector; its unique values are passed to the recall,
	              precision and F1 scorers as their `labels` argument

	    :return: scoring - Scoring dictionary for cross-validate

	    """
	    classes = np.unique(y)

	    scoring = {'AUC': 'roc_auc',
	               'Accuracy': 'accuracy',
	               # Balanced accuracy expressed as the macro-averaged recall
	               'Balanced_accuracy': make_scorer(recall_score,
	                                                pos_label=None,
	                                                average='macro',
	                                                sample_weight=None),

	               # Recall with class 1 as the positive class
	               'Sensitivity': make_scorer(recall_score,
	                                          pos_label=1,
	                                          average='binary',
	                                          sample_weight=None,
	                                          labels=classes),

	               # Recall with class 0 as the positive class
	               'Specificity': make_scorer(recall_score,
	                                          pos_label=0,
	                                          average='binary',
	                                          sample_weight=None,
	                                          labels=classes),

	               'F1': make_scorer(f1_score,
	                                 average='weighted',
	                                 labels=classes),

	               # Precision with class 1 as the positive class
	               'PPV': make_scorer(precision_score,
	                                  pos_label=1,
	                                  average='binary',
	                                  labels=classes),

	               # Precision with class 0 as the positive class
	               'NPV': make_scorer(precision_score,
	                                  pos_label=0,
	                                  average='binary',
	                                  labels=classes)}

	    return scoring


	def train_test_scores(estimator_scores):
	    """
	    Scoring function for cross-validated estimator results

	    Every score column is averaged over the folds and reported as a
	    percentage together with its standard deviation, then the rows are
	    split by partition.

	    :param estimator_scores: Train and test scores from pipeline estimator,
	                             i.e. the dictionary of scores from
	                             cross_validate keyed 'test_<metric>' /
	                             'train_<metric>' (timing entries 'fit_time'
	                             and 'score_time' are discarded when present)

	    :return: train_results - Train partition results
	             test_results - Test partition results
	             scores_df - Per-fold scores with the timing columns removed
	    """

	    # Converting the dictionary of scores from cross_validate to a dataframe
	    # and dropping the timing columns, which are not performance metrics
	    scores_df = (pd
	                 .DataFrame
	                 .from_dict(estimator_scores)
	                 .drop(['fit_time', 'score_time'], axis=1, errors='ignore'))

	    # Mean and standard deviation over folds, as percentages. The 'Accuracy'
	    # column holds the mean of whichever metric the row refers to; the name
	    # is kept so the printed tables stay unchanged.
	    results = pd.DataFrame({'Accuracy': scores_df.mean() * 100,
	                            'Standard Deviation': scores_df.std() * 100})
	    results = results.round(2)

	    # Sub-setting train and test results by their key prefix rather than by
	    # fixed positions, so any number of metrics is handled
	    test_rows = [name for name in results.index
	                 if str(name).startswith('test_')]
	    train_rows = [name for name in results.index
	                  if str(name).startswith('train_')]
	    test_results = results.loc[test_rows]
	    train_results = results.loc[train_rows]

	    return train_results, test_results, scores_df


	def main():
	    """

	    Data leakage demonstration on pure-noise features

	    Part one standardises the features and runs L1 (lasso) feature
	    selection on the full data set before cross-validating a linear SVM on
	    the selected features. Part two places the same scaler, selector and
	    classifier in a pipeline and cross-validates the pipeline on the
	    untouched features. The test partition results of both runs are printed.

	    :return: None - the result tables are printed, not returned

	    """
	    # ==============================================================
	    # Leaking data through multiple transformations outside cv folds
	    # ==============================================================
	    # Setup
	    print('Running main()')
	    rand_state = 10
	    # Generating noise features
	    X = pd.DataFrame(data=np.random.randn(500, 3000))

	    # Getting a y outcome; the generated feature matrix is not needed
	    _, y = make_classification(n_samples=500,
	                               n_features=3000,
	                               n_informative=10,
	                               n_redundant=2990)
	    y_leak = y

	    # Defining scaler for standardization
	    scaler = StandardScaler(with_mean=True, with_std=True)
	    # Fitting and transforming into a NEW frame (feature names retained).
	    # X itself must stay unscaled: it is reused by the pipeline section
	    # below, and scaling it here would leak into that run as well.
	    X_leak = pd.DataFrame(data=scaler.fit_transform(X),
	                          index=X.index,
	                          columns=X.columns)

	    # Logistic regression with l1 regularization / lasso for feature
	    # selection
	    lr = LogisticRegression(penalty='l1', C=0.4, solver='liblinear')
	    lr.fit(X=X_leak, y=y_leak)

	    # Selecting vars from Lasso after regularization
	    selector = SelectFromModel(estimator=lr, prefit=True)
	    # Transforming the feature space
	    X_leak = pd.DataFrame(data=selector.transform(X_leak))
	    # And now training a linear SVM
	    clf = LinearSVC(random_state=rand_state, max_iter=5000)
	    # Returning the scoring dictionary
	    scorer = scoring_dict(y=y_leak)
	    # Running the linear support vector machine on the selected features
	    # with 10-fold cross-validation
	    leaked_scores = cross_validate(estimator=clf,
	                                   X=X_leak,
	                                   y=y_leak,
	                                   scoring=scorer,
	                                   cv=10,
	                                   return_train_score=True)

	    # Getting the final leaked scores (only the test partition is reported)
	    _, leaked_test_results, _ = train_test_scores(
	        estimator_scores=leaked_scores)

	    # ====================================================================
	    # Now using a pipeline and completing transformation in the same folds
	    # ====================================================================
	    # Setting up new selector with no pre-fitting
	    selector = SelectFromModel(estimator=lr)
	    # Implementing pipeline to avoid leakage in transformations
	    pipe_params = [('scale', scaler),
	                   ('select', selector),
	                   ('clf', clf)]
	    # Cache directory to avoid repeat computation
	    cachedir = mkdtemp()
	    try:
	        pipe = Pipeline(steps=pipe_params, memory=cachedir)
	        # Re-running transformations and linear SVM inside sklearn pipeline
	        scores = cross_validate(estimator=pipe,
	                                X=X,
	                                y=y,
	                                scoring=scorer,
	                                cv=10,
	                                return_train_score=True)
	    finally:
	        # Clearing temp directory even if cross-validation fails
	        rmtree(cachedir)

	    # Getting the final non-leaked scores
	    _, pipe_test_results, _ = train_test_scores(estimator_scores=scores)

	    # Leaked results first, then the results with pipeline architecture
	    output = (leaked_test_results, '',
	              pipe_test_results)
	    print(*output, sep='\n')


	if __name__ == '__main__':
	    # main() prints its own report and returns None, so wrapping the call
	    # in print() would only append a stray "None" line to the output.
	    main()

	# These results will vary from run to run because a new random data set is
	# generated each time; however, the general effect of leakage is consistently
	# displayed.

	# =========================================================================
	# Results with leakage and transformations outside of pipeline in the first
	# section of the main function.
	# =========================================================================

	# Accuracy Standard Deviation
	# test_AUC 99.89 0.30
	# test_Accuracy 99.40 0.96
	# test_Balanced_accuracy 99.41 0.95
	# test_Sensitivity 99.60 1.26
	# test_Specificity 99.22 1.65
	# test_F1 99.40 0.96
	# test_PPV 99.23 1.62
	# test_NPV 99.62 1.22

	# ============================================================================
	# Results with transformations done in the same CV folds through the use of a
	# pipeline. No better than chance.
	# ============================================================================

	# Accuracy Standard Deviation
	# test_AUC 50.23 4.45
	# test_Accuracy 49.96 5.93
	# test_Balanced_accuracy 49.95 5.95
	# test_Sensitivity 45.92 9.95
	# test_Specificity 53.98 9.71
	# test_F1 49.59 6.06
	# test_PPV 49.48 6.24
	# test_NPV 50.34 5.96