Code_for_the_prediction_of_ischemic_outcome_after_treatment_against_ICAS_with_machine_learning
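# NOTE (added): the snippets below assume the following imports, inferred from the
# calls used in this gist. The xgboost fit arguments follow the legacy (pre-1.6)
# scikit-learn wrapper API, as in the original code.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
from scipy import stats
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, log_loss, make_scorer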
# Use patient data: 17 clinical factors with the outcome in the last column.
data = pd.read_csv("my_file.csv")
# Preparation for missing-value imputation
use_df = data.iloc[:, :18]
# K-nearest neighbors imputation
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=4, weights="distance")
# KNNImputer returns a NumPy array, so rebuild the DataFrame to keep the
# column names and row index used by the .iloc/.loc calls below.
imputed_df = pd.DataFrame(imputer.fit_transform(use_df),
                          columns=use_df.columns, index=use_df.index)
# Standardization of the 17 feature columns (the outcome column is left as-is)
scaler = StandardScaler()
scaler.fit(imputed_df.iloc[:, :-1])
imputed_df.iloc[:, :-1] = scaler.transform(imputed_df.iloc[:, :-1])
imputed_df.iloc[:, 17] = imputed_df.iloc[:, 17].astype("uint8")
print(imputed_df)
# Split the dataset: this assumes the row index labels each patient
# as "train" or "test" (see the sketch after this block).
test_df = imputed_df.loc["test"]
train_df = imputed_df.loc["train"]
print('number of Train data:', len(train_df))
print('number of Test data:', len(test_df))
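# NOTE (added): a minimal, hypothetical sketch of how such a labelled index could
# be built, assuming the CSV has a column holding "train"/"test" for each row
# (the column name "dataset" is not part of the original code):
labelled = pd.read_csv("my_file.csv", index_col="dataset")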
# Create X and y
X_train = train_df.iloc[:, :-1]
y_train = train_df.iloc[:, -1:].values.ravel()
X_test = test_df.iloc[:, :-1]
y_test = test_df.iloc[:, -1:].values.ravel()
# Evaluate scikit-learn models with 10-times repeated 5-fold stratified cross-validation.
# Redefine sensitivity and specificity as scorers for cross-validation.
def sensitivity1(y_test, y_pred):
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    recall = tp / (tp + fn)
    return recall

def specificity1(y_test, y_pred):
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    specificity = tn / (tn + fp)
    return specificity
from sklearn.model_selection import RepeatedStratifiedKFold

def score_calc(clf, cross_k_val=5, repeat=10, seed=77):
    sensitivity2 = make_scorer(sensitivity1, greater_is_better=True)
    specificity2 = make_scorer(specificity1, greater_is_better=True)
    # The six evaluation metrics
    scores = ["roc_auc", sensitivity2, specificity2, 'precision', "accuracy", "neg_log_loss"]
    # 10-times repeated stratified k-fold cross-validation
    stratifiedkfold = RepeatedStratifiedKFold(n_splits=cross_k_val,
                                              n_repeats=repeat,
                                              random_state=seed)
    print(clf)
    print("------------------------------------------------------------")
    # Calculate each score
    for score in scores:
        answer = cross_val_score(clf, X_train, y_train, cv=stratifiedkfold, scoring=score)
        print("eval metric:", score)
        mean_val = np.mean(answer)
        std = np.std(answer)
        print('Average score (SD):', mean_val, '±', std)
        # 95% CI from the t distribution over the fold scores
        alpha = 0.95
        sem_val = stats.sem(answer)
        ci = stats.t.interval(alpha, len(answer) - 1, loc=mean_val, scale=sem_val)
        print('95% CI: {}'.format(ci))
        print("---------------------------------------------------------------")
# For optimization of scikit-learn models
def param_tune(clf, X_train, y_train, score, param):
    cv5 = StratifiedKFold(n_splits=5, shuffle=True, random_state=55)
    # Note: iid=False is omitted here; it was deprecated and then removed
    # from GridSearchCV in scikit-learn 0.24.
    gsearch = GridSearchCV(
        estimator=clf,
        param_grid=param,
        scoring=score,
        n_jobs=2, cv=cv5
    )
    gsearch.fit(X_train, y_train)
    print('Best parameters: {}'.format(gsearch.best_params_))
    print('Best cross-validation: {}'.format(gsearch.best_score_))
# Set the scoring metric for the grid search
score = "neg_log_loss"
# Example of hyperparameter tuning
LR = LogisticRegression(
    C=10, fit_intercept=True, penalty="l2", solver='lbfgs', random_state=27
)
param_test1 = {
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['newton-cg', 'sag', 'saga', 'lbfgs']
}
param_tune(LR, X_train, y_train, score, param_test1)
# Example of cross-validation on the tuned logistic regression model
LR3 = LogisticRegression(
    C=0.075, fit_intercept=True, penalty="l2", solver='saga', random_state=27
)
score_calc(LR3)
# Bootstrap evaluation of scikit-learn models on the test dataset
from scipy import stats

def boot_eval(clf, data, X_train, y_train, cnt=1100):
    scores_acc = []
    scores_recall = []
    scores_spec = []
    scores_ppv = []
    print("Model is ", clf)
    # Fit the model
    clf.fit(X_train, y_train)
    count = 0
    for i in range(cnt):
        # Resample the test data with replacement
        df = data.sample(n=len(data.values), replace=True, random_state=i)
        y = df.iloc[:, -1]
        X = df.iloc[:, 0:-1]
        Bootpredictions = clf.predict(X)
        cm = confusion_matrix(y, Bootpredictions).ravel()
        if not cm.shape[0] == 4:
            print("Excluded: the resample is non-uniform and the metrics cannot be calculated", cm.shape[0])
            count += 1
            continue
        tn, fp, fn, tp = cm
        # Skip resamples with no positive predictions (precision is undefined)
        if (tp + fp) == 0:
            count += 1
            continue
        # Calculate each metric
        acc = (tp + tn) / (tn + fp + fn + tp)
        specificity = tn / (tn + fp)
        recall = tp / (fn + tp)
        precision = tp / (tp + fp)
        scores_acc.append(acc)
        scores_recall.append(recall)
        scores_spec.append(specificity)
        scores_ppv.append(precision)
        if len(scores_acc) == 1000:
            break
    print("====================================================")
    print('Skipped because of an undefined metric', count, 'times...')
    print(len(scores_acc), 'bootstrap calculations were done.')
    mean_acc = np.nanmean(scores_acc)
    se_acc = np.nanstd(scores_acc, ddof=1) / np.sqrt(len(scores_acc))  # standard error
    ci_acc = [mean_acc - 1.96 * se_acc, mean_acc + 1.96 * se_acc]
    print('Accuracy mean:', mean_acc)
    print('95%CI: ', ci_acc)
    mean_recall = np.nanmean(scores_recall)
    se_recall = np.nanstd(scores_recall, ddof=1) / np.sqrt(len(scores_recall))  # standard error
    ci_recall = [mean_recall - 1.96 * se_recall, mean_recall + 1.96 * se_recall]
    print('Sensitivity mean:', mean_recall)
    print('95%CI: ', ci_recall)
    mean_spec = np.nanmean(scores_spec)
    se_spec = np.nanstd(scores_spec, ddof=1) / np.sqrt(len(scores_spec))  # standard error
    ci_spec = [mean_spec - 1.96 * se_spec, mean_spec + 1.96 * se_spec]
    print('Specificity mean:', mean_spec)
    print('95%CI: ', ci_spec)
    mean_ppv = np.nanmean(scores_ppv)
    se_ppv = np.nanstd(scores_ppv, ddof=1) / np.sqrt(len(scores_ppv))  # standard error
    ci_ppv = [mean_ppv - 1.96 * se_ppv, mean_ppv + 1.96 * se_ppv]
    print('PPV mean:', mean_ppv)
    print('95%CI: ', ci_ppv)
    print("=======================================================")
# Example of bootstrap evaluation on the regression model
boot_eval(LR3, test_df, X_train, y_train, cnt=1100)
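# NOTE (added): the confidence intervals above use a normal approximation
# (mean ± 1.96*SE over the bootstrap replicates). A common alternative is the
# percentile bootstrap CI; a minimal sketch (the helper name percentile_ci is
# not part of the original code):
def percentile_ci(scores, alpha=0.95):
    lo = (1 - alpha) / 2 * 100
    hi = (1 + alpha) / 2 * 100
    return [np.percentile(scores, lo), np.percentile(scores, hi)]
# e.g. percentile_ci(scores_acc) inside boot_eval, in place of mean ± 1.96*SE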
# Cross-validation of the XGBoost model
def xgb_modelfit_all(alg, X_train, y_train, X_test, y_test, useTrainCV=True, cv_folds=5, early_stopping_rounds=20):
    if useTrainCV:
        # Use xgb.cv to choose the number of boosting rounds by early stopping
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(X_train, label=y_train)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'],
                          nfold=cv_folds, metrics=['auc', 'logloss'], stratified=True,
                          early_stopping_rounds=early_stopping_rounds)
        alg.set_params(n_estimators=cvresult.shape[0])
    # Fit the algorithm on the data (legacy xgboost fit/callback API, as in the original)
    evals_result = {}
    alg.fit(X_train, y_train,
            # evaluation metrics: AUC and logloss
            eval_metric=['auc', 'logloss'],
            eval_set=[(X_train, y_train), (X_test, y_test)],
            early_stopping_rounds=early_stopping_rounds,
            callbacks=[xgb.callback.record_evaluation(evals_result)]
            )
    # Predict the validation set
    dval_predictions = alg.predict(X_test)
    dval_predprob = alg.predict_proba(X_test)[:, 1]
    acc = metrics.accuracy_score(y_test, dval_predictions)
    roc = metrics.roc_auc_score(y_test, dval_predprob)
    loglossval = log_loss(y_test, dval_predprob)
    tn, fp, fn, tp = confusion_matrix(y_test, dval_predictions).ravel()
    specificity = tn / (tn + fp)
    recall = tp / (tp + fn)
    precision = tp / (tp + fp)
    # Print the model report
    print("\nModel Report")
    print("Accuracy:", acc)
    print("Sensitivity:", recall)
    print("Specificity:", specificity)
    print("Precision:", precision)
    print('ROC-AUC :', roc)
    print('log loss :', loglossval)
    return acc, recall, specificity, precision, roc, loglossval
def xgb_on_cross_val(alg, X_train, y_train, seed=77, stop_n=20):
    scores_acc = []
    scores_recall = []
    scores_specificity = []
    scores_precision = []
    scores_roc = []
    scores_logloss = []
    # 10-times repeated stratified 5-fold cross-validation
    stratifiedkfold = RepeatedStratifiedKFold(n_splits=5,
                                              n_repeats=10,
                                              random_state=seed)
    for tr_idx, va_idx in stratifiedkfold.split(X_train, y_train):
        tr_x, va_x = X_train.iloc[tr_idx], X_train.iloc[va_idx]
        tr_y, va_y = y_train[tr_idx], y_train[va_idx]
        print("Divide...")
        acc, recall, specificity, precision, roc, loglossval = xgb_modelfit_all(
            alg, tr_x, tr_y, va_x, va_y, early_stopping_rounds=stop_n)
        scores_acc.append(acc)
        scores_recall.append(recall)
        scores_specificity.append(specificity)
        scores_precision.append(precision)
        scores_roc.append(roc)
        scores_logloss.append(loglossval)
    # Calculate the means
    print('\n Mean results of 10-times repeated 5-fold cross-validation on XGBoost: \n')
    print(alg)
    acc_avg = np.nanmean(scores_acc)
    recall_avg = np.nanmean(scores_recall)
    specificity_avg = np.nanmean(scores_specificity)
    precision_avg = np.nanmean(scores_precision)
    roc_avg = np.nanmean(scores_roc)
    logloss_avg = np.nanmean(scores_logloss)
    se_acc = np.nanstd(scores_acc, ddof=1) / np.sqrt(len(scores_acc))  # standard error
    se_recall = np.nanstd(scores_recall, ddof=1) / np.sqrt(len(scores_recall))
    se_specificity = np.nanstd(scores_specificity, ddof=1) / np.sqrt(len(scores_specificity))
    se_precision = np.nanstd(scores_precision, ddof=1) / np.sqrt(len(scores_precision))
    se_roc = np.nanstd(scores_roc, ddof=1) / np.sqrt(len(scores_roc))
    # 95% CI by normal approximation over the 50 fold scores: mean ± 1.96*SE
    print('AUC-ROC mean (95%CI):', roc_avg, ",", roc_avg - 1.96 * se_roc, ",", roc_avg + 1.96 * se_roc)
    print('Accuracy mean (95%CI):', acc_avg, ",", acc_avg - 1.96 * se_acc, ",", acc_avg + 1.96 * se_acc)
    print('Sensitivity mean (95%CI):', recall_avg, ",", recall_avg - 1.96 * se_recall, ",", recall_avg + 1.96 * se_recall)
    print('Specificity mean (95%CI):', specificity_avg, ",", specificity_avg - 1.96 * se_specificity, ",", specificity_avg + 1.96 * se_specificity)
    print('PPV mean (95%CI):', precision_avg, ",", precision_avg - 1.96 * se_precision, ",", precision_avg + 1.96 * se_precision)
    print('logloss mean:', logloss_avg)
# Evaluation of the XGBoost model
xgb01 = xgb.XGBClassifier(
    learning_rate=0.3,
    n_estimators=360,
    max_depth=1,
    min_child_weight=6.5,
    gamma=0.95,
    subsample=0.65,
    colsample_bytree=0.65,
    objective='binary:logistic',
    reg_alpha=0.001,
    reg_lambda=0.1,
    max_delta_step=1,
    tree_method='hist',
    nthread=4,
    seed=27)
xgb_on_cross_val(xgb01, X_train, y_train, stop_n=20)
# Bootstrap evaluation of XGBoost on the test dataset
def boot_xgb_eval(alg, data, X_train, y_train, cnt=1100):
    scores_acc = []
    scores_recall = []
    scores_spec = []
    scores_ppv = []
    print("XGBoost model is", alg)
    # Fit the model (legacy xgboost fit/callback API, as in the original)
    evals_result = {}
    alg.fit(X_train, y_train,
            early_stopping_rounds=10,
            eval_set=[(X_train, y_train), (X_test, y_test)],
            eval_metric=['auc', 'logloss'],
            callbacks=[xgb.callback.record_evaluation(evals_result)]
            )
    count = 0
    for i in range(cnt):
        # Resample the test data with replacement
        df = data.sample(n=len(data.values), replace=True, random_state=i)
        y_ = df.iloc[:, -1]
        X_ = df.iloc[:, 0:-1]
        y_pred = alg.predict(X_)
        cm = confusion_matrix(y_, y_pred).ravel()
        if not cm.shape[0] == 4:
            print("Excluded: the resample is non-uniform and the metrics cannot be calculated", cm.shape[0])
            count += 1
            continue
        tn, fp, fn, tp = cm
        # Skip resamples with no positive predictions (precision is undefined)
        if (tp + fp) == 0:
            count += 1
            continue
        acc = (tp + tn) / (tn + fp + fn + tp)
        specificity = tn / (tn + fp)
        recall = tp / (fn + tp)
        precision = tp / (tp + fp)
        # Append the results
        scores_acc.append(acc)
        scores_recall.append(recall)
        scores_spec.append(specificity)
        scores_ppv.append(precision)
        if len(scores_acc) == 1000:
            break
    print("====================================================")
    print('Skipped because of an undefined metric', count, 'times.')
    print(len(scores_acc), 'bootstrap calculations were done.')
    mean_acc = np.nanmean(scores_acc)
    se_acc = np.nanstd(scores_acc, ddof=1) / np.sqrt(len(scores_acc))  # standard error
    ci_acc = [mean_acc - 1.96 * se_acc, mean_acc + 1.96 * se_acc]
    print('Accuracy mean:', mean_acc)
    print('95%CI: ', ci_acc)
    mean_recall = np.nanmean(scores_recall)
    se_recall = np.nanstd(scores_recall, ddof=1) / np.sqrt(len(scores_recall))  # standard error
    ci_recall = [mean_recall - 1.96 * se_recall, mean_recall + 1.96 * se_recall]
    print('Sensitivity mean:', mean_recall)
    print('95%CI: ', ci_recall)
    mean_spec = np.nanmean(scores_spec)
    se_spec = np.nanstd(scores_spec, ddof=1) / np.sqrt(len(scores_spec))  # standard error
    ci_spec = [mean_spec - 1.96 * se_spec, mean_spec + 1.96 * se_spec]
    print('Specificity mean:', mean_spec)
    print('95%CI: ', ci_spec)
    mean_ppv = np.nanmean(scores_ppv)
    se_ppv = np.nanstd(scores_ppv, ddof=1) / np.sqrt(len(scores_ppv))  # standard error
    ci_ppv = [mean_ppv - 1.96 * se_ppv, mean_ppv + 1.96 * se_ppv]
    print('PPV mean:', mean_ppv)
    print('95%CI: ', ci_ppv)
    print("=======================================================")
# Example of bootstrap evaluation of XGBoost on the test dataset
boot_xgb_eval(
    xgb01, test_df, X_train, y_train, cnt=1100
)
# Feature importance measurement with XGBoost
def xgb_modelfit_feat2(alg, X_train, y_train, X_test, y_test, useTrainCV=True, cv_folds=5, early_stopping_rounds=20):
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(X_train, label=y_train)
        xgtest = xgb.DMatrix(X_test, label=y_test)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'],
                          nfold=cv_folds, metrics=['logloss', 'auc'],
                          early_stopping_rounds=early_stopping_rounds)
    # Fit the algorithm on the data
    evals_result = {}
    alg.fit(X_train, y_train,
            eval_metric=['auc', 'logloss'],
            eval_set=[(X_train, y_train), (X_test, y_test)],
            early_stopping_rounds=early_stopping_rounds,
            callbacks=[xgb.callback.record_evaluation(evals_result)]
            )
    # Plot the total-gain feature importances
    feat_imp = pd.Series(alg.get_booster().get_score(importance_type='total_gain')).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')
    return feat_imp

# Example
feat_imp = xgb_modelfit_feat2(xgb01, X_train, y_train, X_test, y_test)
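# NOTE (added): get_score also supports other importance definitions
# ('weight', 'gain', 'cover', 'total_cover') if a different ranking is preferred;
# a minimal sketch on the fitted model from above:
gain_imp = pd.Series(xgb01.get_booster().get_score(importance_type='gain')).sort_values(ascending=False)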
This code was used in my research, reported in "Potential of machine learning to predict early ischemic events after carotid endarterectomy or stenting: A comparison with surgeon predictions."