@kkmatsuo
Created February 17, 2021 11:30
Code_for_the_prediction_of_ischemic_outcome_after_treatment_against_ICAS_with_machine_learning
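# Note: the gist does not show its import block. The imports below are a
# minimal set inferred from the calls in this file (KNNImputer and
# RepeatedStratifiedKFold are imported inline where first used, as in the
# original); the exact library versions used by the author are unknown.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
from scipy import stats
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, log_loss, make_scorer
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler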
# Load the patient data: 17 clinical factors with the outcome in the last column.
# index_col=0 assumes the first CSV column labels each row "train" or "test",
# which the dataset split below relies on.
data = pd.read_csv("my_file.csv", index_col=0)
# Preparation for missing-value imputation
use_df = data.iloc[:, :18]
# K-nearest neighbors imputation
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=4, weights="distance")
# fit_transform returns a NumPy array, so rebuild the DataFrame to keep
# the index labels and column names
imputed_df = pd.DataFrame(imputer.fit_transform(use_df),
                          index=use_df.index, columns=use_df.columns)
# Standardization of the 17 predictor columns (the outcome is left as-is)
scaler = StandardScaler()
scaler.fit(imputed_df.iloc[:, :-1])
imputed_df.iloc[:, :-1] = scaler.transform(imputed_df.iloc[:, :-1])
imputed_df.iloc[:, 17] = imputed_df.iloc[:, 17].astype("uint8")
print(imputed_df)
# Split the dataset using the "train"/"test" index labels
train_df = imputed_df.loc["train"]
test_df = imputed_df.loc["test"]
print('Number of training samples:', len(train_df))
print('Number of test samples:', len(test_df))
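# Alternative (a sketch, not the original author's method): if the CSV carried
# no "train"/"test" index labels, a stratified hold-out split could be made
# with scikit-learn instead; the 0.2 test fraction here is an assumption.
# from sklearn.model_selection import train_test_split
# train_df, test_df = train_test_split(
#     imputed_df, test_size=0.2, stratify=imputed_df.iloc[:, -1], random_state=0)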
# create X and y
X_train = train_df.iloc[:, :-1]
y_train = train_df.iloc[:, -1:].values.ravel()
X_test = test_df.iloc[:, :-1]
y_test = test_df.iloc[:, -1:].values.ravel()
# Evaluate scikit-learn models with 10-times repeated 5-fold stratified cross-validation.
# Sensitivity and specificity are redefined here so they can be used as
# cross-validation scorers.
def sensitivity1(y_test, y_pred):
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    recall = tp / (tp + fn)
    return recall

def specificity1(y_test, y_pred):
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    specificity = tn / (tn + fp)
    return specificity
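# For reference, scikit-learn's binary confusion_matrix flattens row-major as
# (tn, fp, fn, tp), which is why the unpacking above works, e.g.:
#   confusion_matrix([0, 0, 1, 1], [0, 1, 1, 1]).ravel()  ->  array([1, 1, 0, 2])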
from sklearn.model_selection import RepeatedStratifiedKFold

def score_calc(clf, cross_k_val=5, repeat=10, seed=77):
    sensitivity2 = make_scorer(sensitivity1, greater_is_better=True)
    specificity2 = make_scorer(specificity1, greater_is_better=True)
    # the evaluation metrics
    scores = ["roc_auc", sensitivity2, specificity2, 'precision', "accuracy", "neg_log_loss"]
    # 10-times repeated stratified k-fold cross-validation
    stratifiedkfold = RepeatedStratifiedKFold(n_splits=cross_k_val,
                                              n_repeats=repeat,
                                              random_state=seed)
    print(clf)
    print("------------------------------------------------------------")
    # calculate each score
    for score in scores:
        answer = cross_val_score(clf, X_train, y_train, cv=stratifiedkfold, scoring=score)
        print("eval metric:", score)
        mean_val = np.mean(answer)
        std = np.std(answer)
        print('Average score (SD):', mean_val, '±', std)
        # 95% CI from the t distribution: mean ± t(0.975, n-1) * SEM
        alpha = 0.95
        sem_val = stats.sem(answer)
        ci = stats.t.interval(alpha, len(answer) - 1, loc=mean_val, scale=sem_val)
        print('95% CI: {}'.format(ci))
        print("---------------------------------------------------------------")
# Hyperparameter optimization of scikit-learn models with grid search
def param_tune(clf, X_train, y_train, score, param):
    cv5 = StratifiedKFold(n_splits=5, shuffle=True, random_state=55)
    gsearch = GridSearchCV(
        estimator=clf,
        param_grid=param,
        scoring=score,
        n_jobs=2, cv=cv5  # the original also passed iid=False, removed in scikit-learn 0.24
    )
    gsearch.fit(X_train, y_train)
    print('Best parameters: {}'.format(gsearch.best_params_))
    print('Best cross-validation score: {}'.format(gsearch.best_score_))
# set the scoring metric for grid search
score = "neg_log_loss"
# Example of hyperparameter tuning
LR = LogisticRegression(
C=10, fit_intercept=True, penalty="l2", solver='lbfgs', random_state=27
)
param_test1 = {
'C': [0.01, 0.1, 1, 10, 100],
'solver':['newton-cg', 'sag', 'saga', 'lbfgs']
}
param_tune(LR, X_train, y_train, score, param_test1)
# Example of cross-validation on the tuned logistic regression model
LR3 = LogisticRegression(
C=0.075, fit_intercept=True, penalty="l2", solver='saga', random_state=27
)
score_calc(LR3)
# Bootstrap evaluation of scikit-learn models on the test dataset
def boot_eval(clf, data, X_train, y_train, cnt=1100):
    scores_acc = []
    scores_recall = []
    scores_spec = []
    scores_ppv = []
    print("Model is", clf)
    # fit the model on the training data
    clf.fit(X_train, y_train)
    count = 0
    for i in range(cnt):
        # resample the test dataset with replacement
        df = data.sample(n=len(data), replace=True, random_state=i)
        y = df.iloc[:, -1]
        X = df.iloc[:, 0:-1]
        boot_predictions = clf.predict(X)
        cm = confusion_matrix(y, boot_predictions).ravel()
        if cm.shape[0] != 4:
            print("Excluded: the resample contains only one class,", cm.shape[0])
            count += 1
            continue
        tn, fp, fn, tp = cm
        if (tp + fp) == 0:
            # precision is undefined when there are no positive predictions
            count += 1
            continue
        acc = (tp + tn) / (tn + fp + fn + tp)
        specificity = tn / (tn + fp)
        recall = tp / (fn + tp)
        precision = tp / (tp + fp)
        # store each metric
        scores_acc.append(acc)
        scores_recall.append(recall)
        scores_spec.append(specificity)
        scores_ppv.append(precision)
        if len(scores_acc) == 1000:
            break
    print("====================================================")
    print('Skipped', count, 'resamples with undefined metrics...')
    print(len(scores_acc), 'bootstrap calculations were done.')
    # normal-approximation 95% CI: mean ± 1.96 * standard error
    mean_acc = np.nanmean(scores_acc)
    se_acc = np.nanstd(scores_acc, ddof=1) / np.sqrt(len(scores_acc))
    ci_acc = [mean_acc - 1.96 * se_acc, mean_acc + 1.96 * se_acc]
    print('Accuracy mean:', mean_acc)
    print('95% CI:', ci_acc)
    mean_recall = np.nanmean(scores_recall)
    se_recall = np.nanstd(scores_recall, ddof=1) / np.sqrt(len(scores_recall))
    ci_recall = [mean_recall - 1.96 * se_recall, mean_recall + 1.96 * se_recall]
    print('Sensitivity mean:', mean_recall)
    print('95% CI:', ci_recall)
    mean_spec = np.nanmean(scores_spec)
    se_spec = np.nanstd(scores_spec, ddof=1) / np.sqrt(len(scores_spec))
    ci_spec = [mean_spec - 1.96 * se_spec, mean_spec + 1.96 * se_spec]
    print('Specificity mean:', mean_spec)
    print('95% CI:', ci_spec)
    mean_ppv = np.nanmean(scores_ppv)
    se_ppv = np.nanstd(scores_ppv, ddof=1) / np.sqrt(len(scores_ppv))
    ci_ppv = [mean_ppv - 1.96 * se_ppv, mean_ppv + 1.96 * se_ppv]
    print('PPV mean:', mean_ppv)
    print('95% CI:', ci_ppv)
    print("=======================================================")
# Example of bootstrap evaluation on the tuned logistic regression model
boot_eval(LR3, test_df, X_train, y_train, cnt=1100)
# Cross-validation of the XGBoost model
def xgb_modelfit_all(alg, X_train, y_train, X_test, y_test, useTrainCV=True, cv_folds=5, early_stopping_rounds=20):
    if useTrainCV:
        # (optional) the xgb.cv output can be inspected to tune n_estimators
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(X_train, label=y_train)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'],
                          nfold=cv_folds, metrics=['auc', 'logloss'], stratified=True,
                          early_stopping_rounds=early_stopping_rounds)
    # Fit the algorithm on the data
    evals_result = {}
    alg.fit(X_train, y_train,
            # evaluation metrics: AUC and logloss
            eval_metric=['auc', 'logloss'],
            eval_set=[(X_train, y_train), (X_test, y_test)],
            early_stopping_rounds=early_stopping_rounds,
            callbacks=[xgb.callback.record_evaluation(evals_result)]
            )
    # Predict the held-out set:
    dval_predictions = alg.predict(X_test)
    dval_predprob = alg.predict_proba(X_test)[:, 1]
    acc = metrics.accuracy_score(y_test, dval_predictions)
    roc = metrics.roc_auc_score(y_test, dval_predprob)
    loglossval = log_loss(y_test, dval_predprob)
    tn, fp, fn, tp = confusion_matrix(y_test, dval_predictions).ravel()
    specificity = tn / (tn + fp)
    recall = tp / (tp + fn)
    precision = tp / (tp + fp)
    # Print the model report:
    print("\nModel Report")
    print("Accuracy:", acc)
    print("Sensitivity:", recall)
    print("Specificity:", specificity)
    print("Precision:", precision)
    print('ROC-AUC:', roc)
    print('Log loss:', loglossval)
    return acc, recall, specificity, precision, roc, loglossval
def xgb_on_cross_val(clf, X_train, y_train, seed=77, stop_n=20):
    # (the parameter was named "xgb" in the original, shadowing the xgboost module)
    scores_acc = []
    scores_recall = []
    scores_specificity = []
    scores_precision = []
    scores_roc = []
    scores_logloss = []
    # 10-times repeated stratified 5-fold cross-validation
    stratifiedkfold = RepeatedStratifiedKFold(n_splits=5,
                                              n_repeats=10,
                                              random_state=seed)
    for tr_idx, va_idx in stratifiedkfold.split(X_train, y_train):
        tr_x, va_x = X_train.iloc[tr_idx], X_train.iloc[va_idx]
        tr_y, va_y = y_train[tr_idx], y_train[va_idx]
        print("Divide...")
        acc, recall, specificity, precision, roc, loglossval = xgb_modelfit_all(
            clf, tr_x, tr_y, va_x, va_y, early_stopping_rounds=stop_n)
        scores_acc.append(acc)
        scores_recall.append(recall)
        scores_specificity.append(specificity)
        scores_precision.append(precision)
        scores_roc.append(roc)
        scores_logloss.append(loglossval)
    # Calculate the mean of each metric
    print('\nMean results of 10-times repeated 5-fold cross-validation on XGBoost:\n')
    print(clf)
    acc_avg = np.nanmean(scores_acc)
    recall_avg = np.nanmean(scores_recall)
    specificity_avg = np.nanmean(scores_specificity)
    precision_avg = np.nanmean(scores_precision)
    roc_avg = np.nanmean(scores_roc)
    logloss_avg = np.nanmean(scores_logloss)
    se_acc = np.nanstd(scores_acc, ddof=1) / np.sqrt(len(scores_acc))  # standard error
    se_recall = np.nanstd(scores_recall, ddof=1) / np.sqrt(len(scores_recall))
    se_specificity = np.nanstd(scores_specificity, ddof=1) / np.sqrt(len(scores_specificity))
    se_precision = np.nanstd(scores_precision, ddof=1) / np.sqrt(len(scores_precision))
    se_roc = np.nanstd(scores_roc, ddof=1) / np.sqrt(len(scores_roc))
    # normal-approximation 95% CI over the folds: mean ± 1.96*SE
    print('ROC-AUC mean (95% CI):', roc_avg, ",", roc_avg - 1.96 * se_roc, ",", roc_avg + 1.96 * se_roc)
    print('Accuracy mean (95% CI):', acc_avg, ",", acc_avg - 1.96 * se_acc, ",", acc_avg + 1.96 * se_acc)
    print('Sensitivity mean (95% CI):', recall_avg, ",", recall_avg - 1.96 * se_recall, ",", recall_avg + 1.96 * se_recall)
    print('Specificity mean (95% CI):', specificity_avg, ",", specificity_avg - 1.96 * se_specificity, ",", specificity_avg + 1.96 * se_specificity)
    print('PPV mean (95% CI):', precision_avg, ",", precision_avg - 1.96 * se_precision, ",", precision_avg + 1.96 * se_precision)
    print('Log loss mean:', logloss_avg)
# evaluation of XGBoost model
xgb01 = xgb.XGBClassifier(
learning_rate =0.3,
n_estimators=360,
max_depth=1,
min_child_weight=6.5,
gamma=0.95,
subsample=0.65,
colsample_bytree=0.65,
objective= 'binary:logistic',
reg_alpha=0.001,
reg_lambda=0.1,
max_delta_step=1,
tree_method='hist',
nthread=4,
seed=27)
xgb_on_cross_val(xgb01, X_train, y_train, stop_n=20)
# Bootstrap evaluation of XGBoost on the test dataset
def boot_xgb_eval(alg, data, X_train, y_train, cnt=1100):
    scores_acc = []
    scores_recall = []
    scores_spec = []
    scores_ppv = []
    print("XGBoost model is", alg)
    # model fit (note: the eval_set below relies on the global X_test/y_test)
    evals_result = {}
    alg.fit(X_train, y_train,
            early_stopping_rounds=10,
            eval_set=[(X_train, y_train), (X_test, y_test)],
            eval_metric=['auc', 'logloss'],
            callbacks=[xgb.callback.record_evaluation(evals_result)]
            )
    count = 0
    for i in range(cnt):
        # resample the test dataset with replacement
        df = data.sample(n=len(data), replace=True, random_state=i)
        y_ = df.iloc[:, -1]
        X_ = df.iloc[:, 0:-1]
        y_pred = alg.predict(X_)
        cm = confusion_matrix(y_, y_pred).ravel()
        if cm.shape[0] != 4:
            print("Excluded: the resample contains only one class,", cm.shape[0])
            count += 1
            continue
        tn, fp, fn, tp = cm
        if (tp + fp) == 0:
            # precision is undefined when there are no positive predictions
            count += 1
            continue
        acc = (tp + tn) / (tn + fp + fn + tp)
        specificity = tn / (tn + fp)
        recall = tp / (fn + tp)
        precision = tp / (tp + fp)
        # store each metric
        scores_acc.append(acc)
        scores_recall.append(recall)
        scores_spec.append(specificity)
        scores_ppv.append(precision)
        if len(scores_acc) == 1000:
            break
    print("====================================================")
    print('Skipped', count, 'resamples with undefined metrics.')
    print(len(scores_acc), 'bootstrap calculations were done.')
    # normal-approximation 95% CI: mean ± 1.96 * standard error
    mean_acc = np.nanmean(scores_acc)
    se_acc = np.nanstd(scores_acc, ddof=1) / np.sqrt(len(scores_acc))
    ci_acc = [mean_acc - 1.96 * se_acc, mean_acc + 1.96 * se_acc]
    print('Accuracy mean:', mean_acc)
    print('95% CI:', ci_acc)
    mean_recall = np.nanmean(scores_recall)
    se_recall = np.nanstd(scores_recall, ddof=1) / np.sqrt(len(scores_recall))
    ci_recall = [mean_recall - 1.96 * se_recall, mean_recall + 1.96 * se_recall]
    print('Sensitivity mean:', mean_recall)
    print('95% CI:', ci_recall)
    mean_spec = np.nanmean(scores_spec)
    se_spec = np.nanstd(scores_spec, ddof=1) / np.sqrt(len(scores_spec))
    ci_spec = [mean_spec - 1.96 * se_spec, mean_spec + 1.96 * se_spec]
    print('Specificity mean:', mean_spec)
    print('95% CI:', ci_spec)
    mean_ppv = np.nanmean(scores_ppv)
    se_ppv = np.nanstd(scores_ppv, ddof=1) / np.sqrt(len(scores_ppv))
    ci_ppv = [mean_ppv - 1.96 * se_ppv, mean_ppv + 1.96 * se_ppv]
    print('PPV mean:', mean_ppv)
    print('95% CI:', ci_ppv)
    print("=======================================================")
# Example of bootstrap evaluation of XGBoost on the test dataset
boot_xgb_eval(
    xgb01, test_df, X_train, y_train, cnt=1100
)
# Feature importance measurement from the XGBoost model
def xgb_modelfit_feat2(alg, X_train, y_train, X_test, y_test, useTrainCV=True, cv_folds=5, early_stopping_rounds=20):
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(X_train, label=y_train)
        xgtest = xgb.DMatrix(X_test, label=y_test)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'],
                          nfold=cv_folds, metrics=['logloss', 'auc'],
                          early_stopping_rounds=early_stopping_rounds)
    # Fit the algorithm on the data
    evals_result = {}
    alg.fit(X_train, y_train,
            eval_metric=['auc', 'logloss'],
            eval_set=[(X_train, y_train), (X_test, y_test)],
            early_stopping_rounds=early_stopping_rounds,
            callbacks=[xgb.callback.record_evaluation(evals_result)]
            )
    # Rank features by total gain and plot them
    feat_imp = pd.Series(alg.get_booster().get_score(importance_type='total_gain')).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')
    plt.show()
    return feat_imp
# Example
feat_imp = xgb_modelfit_feat2(xgb01, X_train, y_train, X_test, y_test)
Author's note: These scripts were used in my research, reported in "Potential of machine learning to predict early ischemic events after carotid endarterectomy or stenting: A comparison with surgeon predictions."
