Created
April 29, 2020 13:17
-
-
Save djarecka/2fac49c775d2a34a8af641e5e0928ded to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# coding: utf-8 | |
# ## Prepare Data | |
# In[1]: | |
import pandas as pd | |
import numpy as np | |
import pydra | |
import typing as ty | |
import os | |
# Load the two cohorts and assemble one labelled data set.
# Rows from 'VFPNormals_Concatenated.csv' get label 0 and rows from
# 'VFP_Concatenated.csv' get label 1.  In both files the first two and the
# last two columns are skipped; everything in between is used as a feature.
# NOTE(review): 'sid' is presumably a subject identifier -- confirm.
cg = pd.read_csv('VFPNormals_Concatenated.csv')
# `np.float` was only an alias of the builtin `float`; it was deprecated in
# NumPy 1.20 and removed in 1.24, so use the builtin (same float64 dtype).
cgX = cg.iloc[:, 2:-2].values.astype(float)
cgY = np.zeros(cgX.shape[0])
cggroups = cg['sid'].values
print(cgX.shape, cgY.shape, cggroups.shape)

pg = pd.read_csv('VFP_Concatenated.csv')
pgX = pg.iloc[:, 2:-2].values.astype(float)
pgY = np.ones(pgX.shape[0])
pggroups = pg['sid'].values
print(pgX.shape, pgY.shape, pggroups.shape)

# Stack both cohorts row-wise; X, y and groups stay aligned by row.
X = np.vstack((cgX, pgX))
y = np.concatenate((cgY, pgY))
groups = np.concatenate((cggroups, pggroups))
#print(np.unique(groups))

# Collect feature names (same column slice as the feature matrices).
feature_names = list(cg.iloc[:, 2:-2].keys())

# Silence every warning category for the rest of the run.
from warnings import simplefilter
simplefilter(action='ignore', category=Warning)

# 100 random train/test partitions, each holding out 20% of the groups so
# that rows sharing a `sid` never end up on both sides of a split.
# random_state=0 makes the partitions reproducible.
from sklearn.model_selection import GroupShuffleSplit
gss = GroupShuffleSplit(n_splits=100, test_size=0.2, random_state=0)
train_test_splits = list(gss.split(X, y, groups=groups))
@pydra.mark.task
@pydra.mark.annotate({"return": {"auc": ty.Any, "shaps": ty.Any}})
def train_test(X, y, groups, train_test_splits, clf, permute, train_test_index):
    """Fit one classifier on one train/test split and score it.

    Parameters
    ----------
    X : array
        Feature matrix; rows are selected with the split indices.
    y : array
        Labels aligned with the rows of ``X``.
    groups : array
        Accepted as a task input but not used inside the function.
    train_test_splits : sequence
        ``(train_index, test_index)`` pairs.
    clf : str
        Key of the classifier to use (see the table built below); it is
        also used as the name of the pipeline step.
    permute : bool
        When true the model is fitted on shuffled training labels and no
        SHAP values are computed.
    train_test_index : int
        Position of the split to use within ``train_test_splits``.

    Returns
    -------
    auc
        ``roc_auc_score`` of the hard predictions on the test rows.
    shaps
        SHAP values for the test rows, or ``None`` when ``permute`` is true.
    """
    from sklearn.ensemble import (AdaBoostClassifier, ExtraTreesClassifier,
                                  RandomForestClassifier)
    from sklearn.feature_selection import SelectFromModel
    from sklearn.linear_model import LogisticRegressionCV
    from sklearn.metrics import roc_auc_score
    from sklearn.model_selection import GridSearchCV
    from sklearn.naive_bayes import GaussianNB
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.neural_network import MLPClassifier
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn.svm import SVC
    from sklearn.tree import DecisionTreeClassifier

    # Hyper-parameter grids for the estimators wrapped in GridSearchCV.
    c_values = [1, 10, 100, 1000]
    rbf_grid = [{'kernel': ['rbf'], 'C': c_values}]
    linear_grid = [{'kernel': ['linear'], 'C': c_values}]
    knn_grid = [{'n_neighbors': [3, 5, 7, 9, 11, 13, 15, 17, 19],
                 'weights': ['uniform', 'distance']}]

    # Every candidate estimator is instantiated; only `clf` is fitted.
    classifiers = {
        'ExtraTrees': Pipeline([
            ('select', SelectFromModel(
                ExtraTreesClassifier(n_estimators=100,
                                     class_weight='balanced'))),
            ('clf', ExtraTreesClassifier(n_estimators=100,
                                         class_weight='balanced')),
        ]),
        'Random Forest': RandomForestClassifier(n_estimators=100),
        'Decision Tree': DecisionTreeClassifier(max_depth=5),
        'RBF SVM': GridSearchCV(SVC(probability=True), param_grid=rbf_grid),
        'Naive Bayes': GaussianNB(),
        'Neural Network': MLPClassifier(alpha=1, max_iter=1000),
        'Nearest Neighbors': GridSearchCV(KNeighborsClassifier(),
                                          param_grid=knn_grid),
        'Linear SVM': GridSearchCV(SVC(probability=True),
                                   param_grid=linear_grid),
        'AdaBoost': AdaBoostClassifier(),
        'Logistic Regression': LogisticRegressionCV(solver='liblinear',
                                                    penalty='l1'),
    }

    train_rows, test_rows = train_test_splits[train_test_index]
    model = Pipeline([('std', StandardScaler()), (clf, classifiers[clf])])

    # Permuted runs train on shuffled labels (a chance-level baseline).
    if permute:
        fit_labels = y[np.random.permutation(train_rows)]
    else:
        fit_labels = y[train_rows]
    model.fit(X[train_rows], fit_labels)

    shaps = None
    if not permute:
        # SHAP explanations are only produced for the un-permuted model.
        import shap
        # SVM variants are explained through class probabilities, all
        # other models through their hard predictions.
        explained_fn = model.predict_proba if 'SVM' in clf else model.predict
        explainer = shap.KernelExplainer(explained_fn, X[train_rows])
        shaps = explainer.shap_values(X[test_rows], nsamples=100)

    auc = roc_auc_score(y[test_rows], model.predict(X[test_rows]))
    return auc, shaps
# Classifiers evaluated in this run (keys of the table inside train_test).
names = ['ExtraTrees', 'Random Forest', 'Decision Tree', 'Naive Bayes',
         'Neural Network', 'AdaBoost', 'Logistic Regression']
# Alternative set, kept for a separate run:
# names = ['RBF SVM', 'Linear SVM', 'Nearest Neighbors']

# Defined once so the task inputs and the saved metadata cannot drift apart.
permute_options = [True, False]
n_splits = len(train_test_splits)

clf_task = train_test(clf=names, X=X, y=y, groups=groups,
                      permute=permute_options,
                      train_test_splits=train_test_splits,
                      train_test_index=list(range(n_splits)),
                      cache_dir=os.path.join(os.getcwd(), 'cache'))
# One task per (classifier, permute flag, split); results are then gathered
# over permute and split, leaving one group of results per classifier.
clf_task.split(['clf', 'permute', 'train_test_index'])
clf_task.combine(['permute', 'train_test_index'])
# print(clf_task.output_dir)

print('Submitting')
# Use 75% of the CPUs granted by SLURM.  SLURM_CPUS_PER_TASK is absent
# outside an allocation (or when --cpus-per-task was not requested), so fall
# back to the machine's CPU count, and never ask for fewer than one worker
# (int(1 * 0.75) would otherwise be 0).
allocated_cpus = int(os.environ.get('SLURM_CPUS_PER_TASK') or os.cpu_count() or 1)
n_procs = max(1, int(allocated_cpus * 0.75))
with pydra.Submitter(plugin="cf", n_procs=n_procs) as sub:
    sub(runnable=clf_task)

# Persist the run description together with the task results.
import pickle as pk
with open('clf-task-results.pkl', 'wb') as fp:
    pk.dump((names, permute_options, n_splits, feature_names,
             clf_task.result()), fp)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment