# Source: GitHub gist djarecka/2fac49c775d2a34a8af641e5e0928ded
# Author: @djarecka
# Created: April 29, 2020 13:17
#!/usr/bin/env python
# coding: utf-8
# ## Prepare Data
# In[1]:
import pandas as pd
import numpy as np
import pydra
import typing as ty
import os
def _load_group(csv_path, label):
    """Load one cohort CSV and return ``(frame, features, labels, groups)``.

    ``features`` is every column except the first two and the last two,
    cast to float64.  ``labels`` is a constant float vector holding
    ``label`` once per row.  ``groups`` is the ``sid`` column, which is
    passed to the group-aware splitter below.
    """
    frame = pd.read_csv(csv_path)
    # ``np.float`` was only an alias for the builtin ``float`` and has been
    # removed from NumPy (1.24+); ``np.float64`` is the identical dtype.
    features = frame.iloc[:, 2:-2].values.astype(np.float64)
    labels = np.full(features.shape[0], float(label))
    return frame, features, labels, frame['sid'].values


# Rows from VFPNormals_Concatenated.csv are labelled 0.
cg, cgX, cgY, cggroups = _load_group('VFPNormals_Concatenated.csv', 0)
print(cgX.shape, cgY.shape, cggroups.shape)

# Rows from VFP_Concatenated.csv are labelled 1.
pg, pgX, pgY, pggroups = _load_group('VFP_Concatenated.csv', 1)
print(pgX.shape, pgY.shape, pggroups.shape)

# Stack both cohorts into a single design matrix / label vector / group vector.
X = np.vstack((cgX, pgX))
y = np.concatenate((cgY, pgY))
groups = np.concatenate((cggroups, pggroups))

# Collect feature names (same column slice as the feature matrix).
feature_names = list(cg.iloc[:, 2:-2].keys())

from warnings import simplefilter
# Silence every warning category for the remainder of the run.
simplefilter(action='ignore', category=Warning)

from sklearn.model_selection import GroupShuffleSplit
# 100 random 80/20 splits drawn with the ``sid`` values as groups; the fixed
# random_state makes the list of splits reproducible.
gss = GroupShuffleSplit(n_splits=100, test_size=0.2, random_state=0)
train_test_splits = list(gss.split(X, y, groups=groups))
@pydra.mark.task
@pydra.mark.annotate({"return": {"auc": ty.Any, "shaps": ty.Any}})
def train_test(X, y, groups, train_test_splits, clf, permute, train_test_index):
    """Fit one classifier on one train/test split and score it on the test rows.

    Parameters
    ----------
    X
        Feature matrix, indexed by row with the split index arrays.
    y
        Label vector aligned with the rows of ``X``.
    groups
        Not used inside this function; kept so the call signature is
        unchanged for existing callers.
    train_test_splits
        Sequence of ``(train_index, test_index)`` pairs.
    clf
        Name of the classifier to use; must be a key of the ``classifiers``
        table built below (a ``KeyError`` is raised otherwise).
    permute
        If true, the model is fitted against labels taken from a random
        permutation of the training indices.
    train_test_index
        Position in ``train_test_splits`` of the split to use.

    Returns
    -------
    auc
        ``roc_auc_score`` of the pipeline's predictions on the test rows.
    shaps
        SHAP values for the test rows, or ``None`` when ``permute`` is true.
    """
    # All dependencies are imported locally so the function body is
    # self-contained; numpy is included here because the permutation below
    # previously relied on a module-level ``np``.
    import numpy as np
    from sklearn.preprocessing import StandardScaler
    from sklearn.feature_selection import SelectFromModel
    from sklearn.metrics import roc_auc_score
    from sklearn.ensemble import ExtraTreesClassifier
    from sklearn.pipeline import Pipeline
    from sklearn.svm import SVC
    from sklearn.linear_model import LogisticRegressionCV
    from sklearn.neural_network import MLPClassifier
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
    from sklearn.model_selection import GridSearchCV
    from sklearn.naive_bayes import GaussianNB

    # Hyper-parameter grids for the estimators wrapped in GridSearchCV.
    rbf_parameters = [{'kernel': ['rbf'], 'C': [1, 10, 100, 1000]}]
    linear_parameters = [{'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]
    knearest_params = [{'n_neighbors': [3, 5, 7, 9, 11, 13, 15, 17, 19],
                        'weights': ['uniform', 'distance']}]

    # Name -> unfitted estimator.  Only ``classifiers[clf]`` is fitted.
    classifiers = {
        'ExtraTrees': Pipeline([
            ('select', SelectFromModel(
                ExtraTreesClassifier(n_estimators=100, class_weight='balanced'))),
            ('clf', ExtraTreesClassifier(n_estimators=100, class_weight='balanced')),
        ]),
        'Random Forest': RandomForestClassifier(n_estimators=100),
        'Decision Tree': DecisionTreeClassifier(max_depth=5),
        'RBF SVM': GridSearchCV(SVC(probability=True), param_grid=rbf_parameters),
        'Naive Bayes': GaussianNB(),
        'Neural Network': MLPClassifier(alpha=1, max_iter=1000),
        'Nearest Neighbors': GridSearchCV(KNeighborsClassifier(),
                                          param_grid=knearest_params),
        'Linear SVM': GridSearchCV(SVC(probability=True),
                                   param_grid=linear_parameters),
        'AdaBoost': AdaBoostClassifier(),
        'Logistic Regression': LogisticRegressionCV(solver='liblinear',
                                                    penalty='l1'),
    }

    train_index, test_index = train_test_splits[train_test_index]

    # Standardise the features, then apply the selected estimator; the
    # estimator step is named after the classifier itself.
    pipe = Pipeline([('std', StandardScaler()), (clf, classifiers[clf])])

    shaps = None
    if permute:
        # Fit against labels drawn from a shuffled copy of the training
        # indices.  The shuffle is unseeded, so it differs between runs.
        pipe.fit(X[train_index], y[np.random.permutation(train_index)])
    else:
        pipe.fit(X[train_index], y[train_index])
        # NOTE(review): the original indentation was lost; SHAP values are
        # assumed to be computed only for the non-permuted fit, which is
        # what the ``shaps = None`` default above suggests -- confirm.
        import shap
        # SVM variants are explained through ``predict_proba``; every other
        # classifier through ``predict``.  The training rows are passed as
        # the explainer's second argument.
        if 'SVM' in clf:
            explainer = shap.KernelExplainer(pipe.predict_proba, X[train_index])
        else:
            explainer = shap.KernelExplainer(pipe.predict, X[train_index])
        shaps = explainer.shap_values(X[test_index], nsamples=100)

    # NOTE(review): the AUC is computed from ``pipe.predict`` output rather
    # than from probabilities or decision scores -- confirm this is intended.
    auc = roc_auc_score(y[test_index], pipe.predict(X[test_index]))
    return auc, shaps
# Classifiers evaluated in this run; each name must be a key of the
# classifier table inside ``train_test``.
names = ['ExtraTrees', 'Random Forest', 'Decision Tree', 'Naive Bayes',
         'Neural Network', 'AdaBoost', 'Logistic Regression']
# Alternative set (swap in to run these models instead):
# names = ['RBF SVM', 'Linear SVM', 'Nearest Neighbors']

permute_options = [True, False]
# One task per precomputed split; derived from the split list so the index
# range always matches it.
n_splits = len(train_test_splits)

clf_task = train_test(clf=names, X=X, y=y, groups=groups,
                      permute=permute_options,
                      train_test_splits=train_test_splits,
                      train_test_index=list(range(n_splits)),
                      cache_dir=os.path.join(os.getcwd(), 'cache'))
# Run every (classifier, permute, split) combination ...
clf_task.split(['clf', 'permute', 'train_test_index'])
# ... and combine the results over permute and split index.
# NOTE(review): presumably this leaves one result group per classifier --
# confirm against pydra's splitter/combiner semantics.
clf_task.combine(['permute', 'train_test_index'])
# print(clf_task.output_dir)


def _worker_count(fraction=0.75):
    """Return the number of worker processes to hand to the submitter.

    Uses ``fraction`` of ``SLURM_CPUS_PER_TASK`` when that variable is set,
    otherwise of ``os.cpu_count()``, so the script also runs outside a SLURM
    allocation.  The result is never below 1, because truncating the
    fraction of a single-CPU allocation would otherwise give zero workers.
    """
    slurm_cpus = os.environ.get('SLURM_CPUS_PER_TASK')
    available = int(slurm_cpus) if slurm_cpus else (os.cpu_count() or 1)
    return max(1, int(available * fraction))


print('Submitting')
with pydra.Submitter(plugin="cf", n_procs=_worker_count()) as sub:
    sub(runnable=clf_task)

import pickle as pk
# Persist the run configuration alongside the results so the pickle can be
# interpreted without this script.
with open('clf-task-results.pkl', 'wb') as fp:
    pk.dump((names, permute_options, n_splits, feature_names,
             clf_task.result()), fp)
# (End of gist; the GitHub page footer that followed is not part of the script.)