# Skip to content
#
# Instantly share code, notes, and snippets.
#
# Created May 7, 2019 20:40
# What would you like to do?
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, precision_recall_curve, auc, average_precision_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
# Read in the feature sheets (tumor and normal samples for training, plus a
# separate test sheet) and the reference table that holds the true labels.
X_tumor = pd.read_csv('/home/wzli/Downloads/PF_parameter_MethodII_no_norm/data_sheet_for_random_forest_16_strike_tumor_9_0502_Method_II_no_norm.csv')
X_normal = pd.read_csv('/home/wzli/Downloads/PF_parameter_MethodII_no_norm/data_sheet_for_random_forest_16_strike_normal_9_0502_Method_II_no_norm.csv')
X_real_test = pd.read_csv('/home/wzli/Downloads/PF_parameter_MethodII_no_norm/data_sheet_for_random_forest_16_strike_test_9_0502_Method_II_no_norm.csv')
ref = pd.read_csv('/home/wzli/Downloads/reference_with_results_07.csv')

# Construct the training features: stack both classes, then keep every column
# from the fourth one onwards.
# NOTE(review): presumably the first three columns are identifiers and the
# 'tumor' label -- confirm the label is not among the selected features.
X = pd.concat([X_tumor, X_normal])
X_train = X[X.columns[3:]]
# Construct the training labels.
y = X['tumor']
# Construct the test input using the same column selection as for training.
X_real_test = X_real_test[X_real_test.columns[3:]]

# Before training the model, hold out 20% of the labelled data for validation.
X_train, X_test, y_train, y_test = train_test_split(X_train, y, test_size=0.2)

# Model training.
# NOTE(review): the original statement was truncated to
# `RandomForestClassifier(...),y_train)`, which does not parse; the
# `.fit(X_train, y_train)` call is reconstructed -- confirm against the
# author's version.
clf = RandomForestClassifier(n_estimators=300, max_features=20, max_depth=10)
clf.fit(X_train, y_train)

# Save the fitted model.
model_name = "RFmodel_Method_II_noise_only_07_new_trained.pkl"
with open(model_name, 'wb') as file:
    pickle.dump(clf, file)

# Model testing: ROC AUC on the held-out split, scored with the predicted
# probability of the second class in clf.classes_.
y_pred = clf.predict_proba(X_test)[:, 1]
roc_value = roc_auc_score(y_test, y_pred)

# Do the prediction for the separate test dataset and score it against the
# reference labels.
scores = clf.predict_proba(X_real_test)[:, 1]
roc_value_test = roc_auc_score(ref['truth'], scores)

# Draw ROC curves. The baseline gives every sample the same score, which
# yields the diagonal chance line.
base_fpr, base_tpr, _ = roc_curve(ref['truth'], np.ones(len(ref['truth'])))
model_fpr, model_tpr, _ = roc_curve(ref['truth'], scores)
plt.figure(figsize=(8, 6))
plt.rcParams['font.size'] = 16
# Plot both curves.
plt.plot(base_fpr, base_tpr, 'b', label='baseline')
plt.plot(model_fpr, model_tpr, 'r', label='model')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves')
# Without this call the label= arguments above are never rendered.
plt.legend()

# Add the predicted result to the reference table. Assigning the array
# directly is positional, so it cannot silently misalign on the index the way
# a freshly built pd.Series could.
ref['scores_method_II_07'] = scores
# Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment