# djarecka/pydra_shap_feature_analysis.py

## pydra_shap_feature_analysis.py
#!/usr/bin/env python
# coding: utf-8

# ## Prepare Data

# In[1]:


import pandas as pd
import numpy as np
import pydra
import typing as ty
import os


def _load_feature_table(csv_path):
    """Read one CSV and return ``(frame, features, group_ids)``.

    ``features`` is every column except the first two and the last two, cast
    to float; ``group_ids`` is the ``sid`` column.
    """
    frame = pd.read_csv(csv_path)
    # ``np.float`` (used here originally) was only an alias of the builtin
    # ``float`` and no longer exists in NumPy >= 1.24, so cast with ``float``.
    features = frame.iloc[:, 2:-2].values.astype(float)
    return frame, features, frame['sid'].values


# Rows from VFPNormals_Concatenated.csv are labelled 0.
cg, cgX, cggroups = _load_feature_table('VFPNormals_Concatenated.csv')
cgY = np.zeros(cgX.shape[0])
print(cgX.shape, cgY.shape, cggroups.shape)

# Rows from VFP_Concatenated.csv are labelled 1.
pg, pgX, pggroups = _load_feature_table('VFP_Concatenated.csv')
pgY = np.ones(pgX.shape[0])
print(pgX.shape, pgY.shape, pggroups.shape)


# Stack both tables into one feature matrix / label vector / group vector.
X = np.vstack((cgX, pgX))
y = np.concatenate((cgY, pgY))
groups = np.concatenate((cggroups, pggroups))
#print(np.unique(groups))


# Collect feature names (same column slice as the feature matrix)
feature_names = list(cg.iloc[:, 2:-2].keys())

# Silence every warning category for the rest of the run.
from warnings import simplefilter
simplefilter(action='ignore', category=Warning)

# 100 seeded shuffle splits holding out 20% of the groups each time; splitting
# on ``groups`` keeps all rows that share a ``sid`` on the same side of a split.
from sklearn.model_selection import GroupShuffleSplit
gss = GroupShuffleSplit(n_splits=100, test_size=0.2, random_state=0)
train_test_splits = list(gss.split(X, y, groups=groups))

@pydra.mark.task
@pydra.mark.annotate({"return": {"auc": ty.Any, "shaps": ty.Any}})
def train_test(X, y, groups, train_test_splits, clf, permute, train_test_index):
    """Fit one named classifier on one train/test split and score it.

    Parameters
    ----------
    X, y
        Feature matrix and label vector; both are indexed with the integer
        index arrays taken from ``train_test_splits``.
    groups
        Not used inside the task.
    train_test_splits
        Sequence of ``(train_index, test_index)`` pairs.
    clf
        Name of the classifier to fit; must be one of the keys of the
        ``classifiers`` table below, otherwise ``KeyError`` is raised.
    permute
        When truthy, the training labels are shuffled before fitting and no
        SHAP values are computed.
    train_test_index
        Position in ``train_test_splits`` of the split to use.

    Returns
    -------
    auc
        ``roc_auc_score`` of the fitted pipeline's predictions on the test rows.
    shaps
        ``None`` when ``permute`` is truthy, otherwise the output of
        ``shap.KernelExplainer.shap_values`` for the test rows.
    """
    from sklearn.preprocessing import StandardScaler
    from sklearn.feature_selection import SelectFromModel
    from sklearn.metrics import roc_auc_score
    from sklearn.ensemble import ExtraTreesClassifier
    from sklearn.pipeline import Pipeline
    from sklearn.svm import SVC
    from sklearn.linear_model import LogisticRegressionCV
    from sklearn.neural_network import MLPClassifier
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
    from sklearn.model_selection import GridSearchCV
    from sklearn.naive_bayes import GaussianNB

    rbf_parameters = [{'kernel': ['rbf'], 'C': [1, 10, 100, 1000]}]
    linear_parameters = [{'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]
    knearest_params = [{'n_neighbors': [3, 5, 7, 9, 11, 13, 15, 17, 19],
                        'weights': ['uniform', 'distance']}]

    def balanced_extra_trees():
        return ExtraTreesClassifier(n_estimators=100, class_weight='balanced')

    # Zero-argument factories: only the estimator that was asked for is
    # instantiated, instead of building all ten on every task invocation.
    classifiers = {
        'ExtraTrees': lambda: Pipeline([
            ('select', SelectFromModel(balanced_extra_trees())),
            ('clf', balanced_extra_trees()),
        ]),
        'Random Forest': lambda: RandomForestClassifier(n_estimators=100),
        'Decision Tree': lambda: DecisionTreeClassifier(max_depth=5),
        'RBF SVM': lambda: GridSearchCV(SVC(probability=True),
                                        param_grid=rbf_parameters),
        'Naive Bayes': lambda: GaussianNB(),
        'Neural Network': lambda: MLPClassifier(alpha=1, max_iter=1000),
        'Nearest Neighbors': lambda: GridSearchCV(KNeighborsClassifier(),
                                                  param_grid=knearest_params),
        'Linear SVM': lambda: GridSearchCV(SVC(probability=True),
                                           param_grid=linear_parameters),
        'AdaBoost': lambda: AdaBoostClassifier(),
        'Logistic Regression': lambda: LogisticRegressionCV(solver='liblinear',
                                                            penalty='l1'),
    }

    train_index, test_index = train_test_splits[train_test_index]
    # Standardise the features, then apply the requested classifier; the
    # classifier's name doubles as its pipeline step name.
    pipe = Pipeline([('std', StandardScaler()), (clf, classifiers[clf]())])
    shaps = None
    if permute:
        # Fit against shuffled training labels; no SHAP values in this case.
        pipe.fit(X[train_index], y[np.random.permutation(train_index)])
    else:
        pipe.fit(X[train_index], y[train_index])
        import shap
        # SVM variants are explained through predict_proba, all others
        # through predict; the training rows are handed to the explainer as
        # its data argument.
        if 'SVM' in clf:
            explainer = shap.KernelExplainer(pipe.predict_proba, X[train_index])
        else:
            explainer = shap.KernelExplainer(pipe.predict, X[train_index])
        shaps = explainer.shap_values(X[test_index], nsamples=100)

    # NOTE(review): the AUC is computed from the output of ``predict`` rather
    # than from probabilities/decision scores -- confirm this is intended.
    auc = roc_auc_score(y[test_index], pipe.predict(X[test_index]))
    return auc, shaps

# Classifiers evaluated in this run (keys of the table inside ``train_test``).
names = ['ExtraTrees', 'Random Forest', 'Decision Tree', 'Naive Bayes',
         'Neural Network', 'AdaBoost', 'Logistic Regression']
# Alternative set, kept for reference:
# names = ['RBF SVM', 'Linear SVM', 'Nearest Neighbors']

# Shared by the task definition and the pickled metadata so they cannot drift.
permute_options = [True, False]
split_indices = list(range(len(train_test_splits)))

clf_task = train_test(
    clf=names, X=X, y=y, groups=groups,
    permute=permute_options,
    train_test_splits=train_test_splits,
    train_test_index=split_indices,
    cache_dir=os.path.join(os.getcwd(), 'cache'),
)
# One task run per (classifier, permute flag, split index) combination;
# results are then combined over permute and split index.
clf_task.split(['clf', 'permute', 'train_test_index'])
clf_task.combine(['permute', 'train_test_index'])

# print(clf_task.output_dir)

print('Submitting')
# Use 75% of the CPUs granted by SLURM.  Outside a SLURM job the variable is
# missing, so fall back to the machine's CPU count, and never request fewer
# than one worker (int(1 * 0.75) would otherwise be 0).
available_cpus = int(os.environ.get('SLURM_CPUS_PER_TASK') or os.cpu_count() or 1)
n_procs = max(1, int(available_cpus * 0.75))
with pydra.Submitter(plugin="cf", n_procs=n_procs) as sub:
    sub(runnable=clf_task)

import pickle as pk
# Persist the run description together with the task results.
with open('clf-task-results.pkl', 'wb') as fp:
    pk.dump((names, permute_options, len(split_indices), feature_names,
             clf_task.result()), fp)
	#!/usr/bin/env python
	# coding: utf-8

	# ## Prepare Data

	# In[1]:


	import pandas as pd
	import numpy as np
	import pydra
	import typing as ty
	import os


	def _load_feature_table(csv_path):
		"""Read one CSV and return ``(frame, features, group_ids)``.

		``features`` is every column except the first two and the last two,
		cast to float; ``group_ids`` is the ``sid`` column.
		"""
		frame = pd.read_csv(csv_path)
		# ``np.float`` (used here originally) was only an alias of the builtin
		# ``float`` and no longer exists in NumPy >= 1.24, so cast with ``float``.
		features = frame.iloc[:, 2:-2].values.astype(float)
		return frame, features, frame['sid'].values


	# Rows from VFPNormals_Concatenated.csv are labelled 0.
	cg, cgX, cggroups = _load_feature_table('VFPNormals_Concatenated.csv')
	cgY = np.zeros(cgX.shape[0])
	print(cgX.shape, cgY.shape, cggroups.shape)

	# Rows from VFP_Concatenated.csv are labelled 1.
	pg, pgX, pggroups = _load_feature_table('VFP_Concatenated.csv')
	pgY = np.ones(pgX.shape[0])
	print(pgX.shape, pgY.shape, pggroups.shape)


	# Stack both tables into one feature matrix / label vector / group vector.
	X = np.vstack((cgX, pgX))
	y = np.concatenate((cgY, pgY))
	groups = np.concatenate((cggroups, pggroups))
	#print(np.unique(groups))


	# Collect feature names (same column slice as the feature matrix)
	feature_names = list(cg.iloc[:, 2:-2].keys())

	# Silence every warning category for the rest of the run.
	from warnings import simplefilter
	simplefilter(action='ignore', category=Warning)

	# 100 seeded shuffle splits holding out 20% of the groups each time; splitting
	# on ``groups`` keeps all rows that share a ``sid`` on the same side of a split.
	from sklearn.model_selection import GroupShuffleSplit
	gss = GroupShuffleSplit(n_splits=100, test_size=0.2, random_state=0)
	train_test_splits = list(gss.split(X, y, groups=groups))

	@pydra.mark.task
	@pydra.mark.annotate({"return": {"auc": ty.Any, "shaps": ty.Any}})
	def train_test(X, y, groups, train_test_splits, clf, permute, train_test_index):
		"""Fit one named classifier on one train/test split and score it.

		Parameters
		----------
		X, y
			Feature matrix and label vector; both are indexed with the integer
			index arrays taken from ``train_test_splits``.
		groups
			Not used inside the task.
		train_test_splits
			Sequence of ``(train_index, test_index)`` pairs.
		clf
			Name of the classifier to fit; must be one of the keys of the
			``classifiers`` table below, otherwise ``KeyError`` is raised.
		permute
			When truthy, the training labels are shuffled before fitting and
			no SHAP values are computed.
		train_test_index
			Position in ``train_test_splits`` of the split to use.

		Returns
		-------
		auc
			``roc_auc_score`` of the fitted pipeline's predictions on the
			test rows.
		shaps
			``None`` when ``permute`` is truthy, otherwise the output of
			``shap.KernelExplainer.shap_values`` for the test rows.
		"""
		from sklearn.preprocessing import StandardScaler
		from sklearn.feature_selection import SelectFromModel
		from sklearn.metrics import roc_auc_score
		from sklearn.ensemble import ExtraTreesClassifier
		from sklearn.pipeline import Pipeline
		from sklearn.svm import SVC
		from sklearn.linear_model import LogisticRegressionCV
		from sklearn.neural_network import MLPClassifier
		from sklearn.neighbors import KNeighborsClassifier
		from sklearn.tree import DecisionTreeClassifier
		from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
		from sklearn.model_selection import GridSearchCV
		from sklearn.naive_bayes import GaussianNB

		rbf_parameters = [{'kernel': ['rbf'], 'C': [1, 10, 100, 1000]}]
		linear_parameters = [{'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]
		knearest_params = [{'n_neighbors': [3, 5, 7, 9, 11, 13, 15, 17, 19], 'weights': ['uniform', 'distance']}]

		def balanced_extra_trees():
			return ExtraTreesClassifier(n_estimators=100, class_weight='balanced')

		# Zero-argument factories: only the estimator that was asked for is
		# instantiated, instead of building all ten on every task invocation.
		classifiers = {
			'ExtraTrees': lambda: Pipeline([
				('select', SelectFromModel(balanced_extra_trees())),
				('clf', balanced_extra_trees()),
			]),
			'Random Forest': lambda: RandomForestClassifier(n_estimators=100),
			'Decision Tree': lambda: DecisionTreeClassifier(max_depth=5),
			'RBF SVM': lambda: GridSearchCV(SVC(probability=True), param_grid=rbf_parameters),
			'Naive Bayes': lambda: GaussianNB(),
			'Neural Network': lambda: MLPClassifier(alpha=1, max_iter=1000),
			'Nearest Neighbors': lambda: GridSearchCV(KNeighborsClassifier(), param_grid=knearest_params),
			'Linear SVM': lambda: GridSearchCV(SVC(probability=True), param_grid=linear_parameters),
			'AdaBoost': lambda: AdaBoostClassifier(),
			'Logistic Regression': lambda: LogisticRegressionCV(solver='liblinear', penalty='l1'),
		}

		train_index, test_index = train_test_splits[train_test_index]
		# Standardise the features, then apply the requested classifier; the
		# classifier's name doubles as its pipeline step name.
		pipe = Pipeline([('std', StandardScaler()), (clf, classifiers[clf]())])
		shaps = None
		if permute:
			# Fit against shuffled training labels; no SHAP values in this case.
			pipe.fit(X[train_index], y[np.random.permutation(train_index)])
		else:
			pipe.fit(X[train_index], y[train_index])
			import shap
			# SVM variants are explained through predict_proba, all others
			# through predict; the training rows are handed to the explainer
			# as its data argument.
			if 'SVM' in clf:
				explainer = shap.KernelExplainer(pipe.predict_proba, X[train_index])
			else:
				explainer = shap.KernelExplainer(pipe.predict, X[train_index])
			shaps = explainer.shap_values(X[test_index], nsamples=100)

		# NOTE(review): the AUC is computed from the output of ``predict`` rather
		# than from probabilities/decision scores -- confirm this is intended.
		auc = roc_auc_score(y[test_index], pipe.predict(X[test_index]))
		return auc, shaps

	# Classifiers evaluated in this run (keys of the table inside ``train_test``).
	names = [
		'ExtraTrees', 'Random Forest', 'Decision Tree', 'Naive Bayes',
		'Neural Network', 'AdaBoost', 'Logistic Regression',
	]
	# Alternative set, kept for reference:
	# names = ['RBF SVM', 'Linear SVM', 'Nearest Neighbors']

	# Shared by the task definition and the pickled metadata so they cannot drift.
	permute_options = [True, False]
	split_indices = list(range(len(train_test_splits)))

	clf_task = train_test(
		clf=names, X=X, y=y, groups=groups,
		permute=permute_options,
		train_test_splits=train_test_splits,
		train_test_index=split_indices,
		cache_dir=os.path.join(os.getcwd(), 'cache'),
	)
	# One task run per (classifier, permute flag, split index) combination;
	# results are then combined over permute and split index.
	clf_task.split(['clf', 'permute', 'train_test_index'])
	clf_task.combine(['permute', 'train_test_index'])

	# print(clf_task.output_dir)

	print('Submitting')
	# Use 75% of the CPUs granted by SLURM.  Outside a SLURM job the variable is
	# missing, so fall back to the machine's CPU count, and never request fewer
	# than one worker (int(1 * 0.75) would otherwise be 0).
	available_cpus = int(os.environ.get('SLURM_CPUS_PER_TASK') or os.cpu_count() or 1)
	n_procs = max(1, int(available_cpus * 0.75))
	with pydra.Submitter(plugin="cf", n_procs=n_procs) as sub:
		sub(runnable=clf_task)

	import pickle as pk
	# Persist the run description together with the task results.
	with open('clf-task-results.pkl', 'wb') as fp:
		pk.dump((names, permute_options, len(split_indices), feature_names, clf_task.result()), fp)