# 3dimaging/RF_training.py

## RF_training.py
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve,  precision_recall_curve, auc, average_precision_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
# Train a random-forest classifier on the tumor/normal feature sheets, save the
# fitted model, score it on an internal hold-out split and on the separate test
# sheet, plot the ROC curve, and write the test scores into the reference table.
#
# NOTE: neither the split nor the forest is given a random_state, so the model
# and every metric below vary from run to run.

# Input locations. The three feature sheets share one naming scheme and differ
# only in the subset tag (tumor / normal / test).
DATA_DIR = '/home/wzli/Downloads/PF_parameter_MethodII_no_norm'
SHEET_TEMPLATE = (
    DATA_DIR
    + '/data_sheet_for_random_forest_16_strike_{}_9_0502_Method_II_no_norm.csv'
)

# read in files
X_tumor = pd.read_csv(SHEET_TEMPLATE.format('tumor'))
X_normal = pd.read_csv(SHEET_TEMPLATE.format('normal'))
X_real_test = pd.read_csv(SHEET_TEMPLATE.format('test'))
ref = pd.read_csv('/home/wzli/Downloads/reference_with_results_07.csv')

# construct training data for X: stack both classes and drop the first three
# columns, keeping the rest as features.
# NOTE(review): assumes the 'tumor' label is one of those first three columns;
# if it is not, the label is also fed to the model as a feature -- confirm.
X = pd.concat([X_tumor, X_normal])
X_train = X[X.columns[3:]]

# construct training data for y
y = X['tumor']

# construct test data input (same column selection as the training features)
X_real_test = X_real_test[X_real_test.columns[3:]]

# before training the model, hold out 20% of the rows for an internal check
X_train, X_test, y_train, y_test = train_test_split(X_train, y, test_size=0.2)

# model training
clf = RandomForestClassifier(n_estimators=300, max_features=20, max_depth=10)
clf.fit(X_train, y_train)

# save the model
model_name = "RFmodel_Method_II_noise_only_07_new_trained.pkl"
with open(model_name, 'wb') as file:
    pickle.dump(clf, file)

# model testing on the hold-out split; column 1 of predict_proba is the
# probability of the second entry of clf.classes_ (presumably tumor == 1).
y_pred = clf.predict_proba(X_test)[:, 1]
roc_value = roc_auc_score(y_test, y_pred)
print('Hold-out ROC AUC: {:.4f}'.format(roc_value))

# do prediction for the test dataset and score it against ref['truth']
# (assumes ref rows are in the same order as the test sheet rows -- confirm)
scores = clf.predict_proba(X_real_test)[:, 1]
roc_value_test = roc_auc_score(ref['truth'], scores)
print('Test ROC AUC: {:.4f}'.format(roc_value_test))

# draw roc curves; a constant score for every sample gives the baseline curve
base_fpr, base_tpr, _ = roc_curve(ref['truth'], np.ones(len(ref)))
model_fpr, model_tpr, _ = roc_curve(ref['truth'], scores)

plt.figure(figsize=(8, 6))
plt.rcParams['font.size'] = 16

# Plot both curves
plt.plot(base_fpr, base_tpr, 'b', label='baseline')
plt.plot(model_fpr, model_tpr, 'r', label='model')
plt.legend()
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves')
plt.show()

# add predicted result to the reference file (to_csv also writes the row index
# as an extra leading column, as before)
ref['scores_method_II_07'] = scores
ref.to_csv('/home/wzli/Downloads/reference_with_updated_results.csv')