Skip to content

Instantly share code, notes, and snippets.

@3dimaging
Created May 7, 2019 20:40
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save 3dimaging/ed36576992599ee971b2e9b836109197 to your computer and use it in GitHub Desktop.
Save 3dimaging/ed36576992599ee971b2e9b836109197 to your computer and use it in GitHub Desktop.
updated050719
import numpy as np
from sklearn.metrics import roc_curve, precision_recall_curve, auc, average_precision_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
import pandas as pd
import pickle
# Load the per-class training feature sheets, the held-out test sheet, and the
# reference table (its 'truth' column is used for scoring further down).
X_tumor = pd.read_csv('/home/wzli/Downloads/PF_parameter_MethodII_no_norm/data_sheet_for_random_forest_16_strike_tumor_9_0502_Method_II_no_norm.csv')
X_normal = pd.read_csv('/home/wzli/Downloads/PF_parameter_MethodII_no_norm/data_sheet_for_random_forest_16_strike_normal_9_0502_Method_II_no_norm.csv')
X_real_test = pd.read_csv('/home/wzli/Downloads/PF_parameter_MethodII_no_norm/data_sheet_for_random_forest_16_strike_test_9_0502_Method_II_no_norm.csv')
ref = pd.read_csv('/home/wzli/Downloads/reference_with_results_07.csv')

# Stack the tumor rows on top of the normal rows; each sheet keeps its own
# row index, so index values repeat in the combined frame.
X = pd.concat([X_tumor, X_normal])

# Training labels come from the 'tumor' column of the combined sheet.
y = X['tumor']

# Features are every column from the fourth one onward.
# NOTE(review): presumably the first three columns hold identifiers and the
# 'tumor' label -- confirm; if 'tumor' sits at position 3 or later it leaks
# into the features.
X_train = X.loc[:, X.columns[3:]]

# Apply the same "drop the first three columns" rule to the external test sheet.
X_real_test = X_real_test.loc[:, X_real_test.columns[3:]]

# Hold out 20% of the training rows for validation. No random_state is given,
# so the split (and every metric derived from it) differs between runs.
X_train, X_test, y_train, y_test = train_test_split(X_train, y, test_size=0.2)
# Fit a random forest (300 trees, at most 20 candidate features per split,
# tree depth capped at 10) on the training part of the split.
clf = RandomForestClassifier(n_estimators=300, max_features=20, max_depth=10)
clf.fit(X_train, y_train)

# Persist the fitted model next to the script so it can be reloaded later.
# NOTE: only unpickle this file from a trusted location; pickle executes
# arbitrary code on load.
model_name = "RFmodel_Method_II_noise_only_07_new_trained.pkl"
with open(model_name, 'wb') as model_file:
    # The dump must sit inside the `with` body so the handle is open while
    # writing and is closed (and flushed) afterwards.
    pickle.dump(clf, model_file)
# Validation on the 20% hold-out split: take the second probability column
# (the class listed second in clf.classes_ -- presumably tumor == 1, confirm)
# and summarise it as ROC AUC.
holdout_proba = clf.predict_proba(X_test)
y_pred = holdout_proba[:, 1]
roc_value = roc_auc_score(y_test, y_pred)

# Same scoring for the external test sheet, compared against the reference
# table's 'truth' column.
# NOTE(review): assumes the rows of ref are in the same order as the rows of
# the test sheet -- confirm.
test_proba = clf.predict_proba(X_real_test)
scores = test_proba[:, 1]
roc_value_test = roc_auc_score(ref['truth'], scores)
# Draw ROC curves for the external test set.
# `plt` was used below without ever being imported, which raised NameError at
# the first plotting call; bind it here, right before its first use.
import matplotlib.pyplot as plt

# Baseline: a constant score ranks every sample identically, so its ROC curve
# is the straight chance line from (0, 0) to (1, 1).
base_fpr, base_tpr, _ = roc_curve(ref['truth'], np.ones(len(ref['truth'])))
# Model curve from the predicted probabilities for the test set.
model_fpr, model_tpr, _ = roc_curve(ref['truth'], scores)

plt.figure(figsize=(8, 6))
# Set after the figure is created, as in the original; text drawn from here on
# uses the larger font.
plt.rcParams['font.size'] = 16

# Plot both curves: baseline in blue, model in red.
plt.plot(base_fpr, base_tpr, 'b', label='baseline')
plt.plot(model_fpr, model_tpr, 'r', label='model')
plt.legend()
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves')
plt.show()
# Attach the test-set probabilities to the reference table as a new column.
# Wrapping them in a Series gives them a fresh 0..n-1 index, and the column
# assignment then aligns on ref's row index.
score_column = pd.Series(scores)
ref['scores_method_II_07'] = score_column

# Write the augmented table to a new CSV; with the default settings the row
# index is written as an extra leading column.
updated_reference_path = '/home/wzli/Downloads/reference_with_updated_results.csv'
ref.to_csv(updated_reference_path)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment