purva91/10_roc_curve.py

## 10_roc_curve.py
# Score the test rows with the fitted knn model and keep column 1 of the
# probability matrix (presumably the positive class, target == 1 -- confirm
# against knn.classes_).
class_probabilities = knn.predict_proba(X_test)
y_pred_proba = class_probabilities[:, 1]

# ROC points: false-positive rates, true-positive rates and their thresholds.
roc_points = roc_curve(y_test, y_pred_proba)
fpr, tpr, thresholds = roc_points

## 11_roc_plot.py
# Plot the ROC curve (fpr vs. tpr) together with the chance diagonal.
plt.figure(figsize = (10,8))
plt.plot([0, 1], [0, 1], 'k--', label='Chance')
plt.plot(fpr, tpr, label='Knn')
plt.xlabel('FPR')
plt.ylabel('TPR')
# Take k from the estimator itself: a hard-coded number in the title goes
# stale as soon as the classifier is built with a different n_neighbors.
plt.title('Knn(n_neighbors = {}) ROC curve'.format(knn.n_neighbors))
# The label= arguments above are only drawn once legend() is called.
plt.legend()
plt.show()

## 12_roc_auc.py
# Area under the ROC curve for the scores held in y_pred_proba.
roc_auc_score(y_true=y_test, y_score=y_pred_proba)

## 13_prc_auc.py
# calculate precision-recall AUC
# The curve points are computed here because no earlier snippet defines
# `precision` / `recall`; using them directly raised a NameError.
precision, recall, prc_thresholds = precision_recall_curve(y_test, y_pred_proba)

# auc() integrates y over x with the trapezoidal rule: recall is the x axis,
# precision the y axis.
auc_prc = auc(recall, precision)
print(auc_prc)

## 14_prc_plot.py
# Precision/recall pairs for every decision threshold.
# NOTE: this rebinds `thresholds`, replacing the ROC thresholds.
precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba)

# A no-skill classifier has precision equal to the share of positives in the
# test labels, which is 0.5 only for perfectly balanced data. Label 1 is the
# positive class that precision_recall_curve uses by default.
positive_rate = np.mean(np.asarray(y_test) == 1)

plt.figure(figsize = (10,8))
plt.plot([0, 1], [positive_rate, positive_rate], 'k--', label = 'No skill')
plt.plot(recall, precision, label = 'Knn')
plt.xlabel('recall')
plt.ylabel('precision')
# Take k from the estimator so the title cannot disagree with the model.
plt.title('Knn(n_neighbors = {}) PRC curve'.format(knn.n_neighbors))
# The label= arguments above are only drawn once legend() is called.
plt.legend()
plt.show()

## 1_import_prec_rec.py
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import auc
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

## 2_read_info.py
# Load the heart-disease CSV into a DataFrame.
data_file_path = '../input/heart-disease-uci/heart.csv'
data_df = pd.read_csv(data_file_path)

# To get information on the number of entries and the datatypes of the features
# (info() prints this summary; head() alone does not show it).
data_df.info()

# Preview the first rows; kept as the last expression so that a notebook
# cell still displays it.
data_df.head()

## 3_check_dist.py
# 2. Distribution of the target variable.
# Pass the column by keyword: a positional Series is bound to `data` (not
# `x`) by newer seaborn releases and triggers a warning on older ones.
sns.countplot(x='target', data=data_df)

# Add labels
plt.title('Countplot of Target')
plt.xlabel('target')
plt.ylabel('Patients')
plt.show()

## 4_split_data.py
# Separate the label column from the feature columns.
y = data_df["target"].values
x = data_df.drop(["target"], axis = 1)

# Splitting into train and test: 70% training and 30% test.
# The split is done BEFORE scaling so the scaler never sees the test rows.
# stratify=y keeps the class proportions equal in both parts and
# random_state makes the split (and every score below) reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size = 0.3, stratify = y, random_state = 42
)

# Scaling - mandatory for knn, which compares raw feature distances.
# The mean/std are learned from the training rows only and then applied
# unchanged to the test rows, so no test statistics leak into training.
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

# Keep `x` as a scaled array (as before) for any later code that uses it.
x = ss.transform(x)

## 5_train_test_score.py
# Sweep k = 1..20: fit one KNN model per k and record its accuracy on the
# training split and on the test split.
k_vals = list(range(1, 21))
train_score, test_score = [], []

for k in k_vals:
    # fit() returns the estimator, so construction and fitting can be chained.
    # After the loop, `knn` is the model fitted with the last k.
    knn = KNeighborsClassifier(n_neighbors = k).fit(X_train, y_train)

    tr_score, te_score = knn.score(X_train, y_train), knn.score(X_test, y_test)
    train_score.append(tr_score)
    test_score.append(te_score)

## 6_max_test_score.py
## score that comes from the testing set only
# Highest test accuracy, and every position in test_score that reaches it.
max_test_score = max(test_score)
test_scores_ind = [
    position
    for position, score in enumerate(test_score)
    if score == max_test_score
]

# Position i holds the score for k = i + 1, because the sweep starts at k = 1.
best_k_values = [position + 1 for position in test_scores_ind]
print('Max test score {} and k = {}'.format(max_test_score * 100, best_k_values))

## 7_final_fit.py
# Build the final classifier with k = 3 neighbours and fit it on the
# training split (fit() returns the estimator itself).
knn = KNeighborsClassifier(n_neighbors = 3).fit(X_train, y_train)

# Mean accuracy on the test split.
knn.score(X_test, y_test)

## 8_conf_matrix.py
# Hard class predictions for the test rows.
y_pred = knn.predict(X_test)

# Raw confusion matrix (rows: actual, columns: predicted).
confusion_matrix(y_true = y_test, y_pred = y_pred)

# The same counts as a labelled table, with row/column totals (margins).
pd.crosstab(
    index = y_test,
    columns = y_pred,
    rownames = ['Actual'],
    colnames = ['Predicted'],
    margins = True,
)

## 9_class_report.py
# Per-class precision, recall and F1 for the test predictions.
report_text = classification_report(y_true = y_test, y_pred = y_pred)
print(report_text)

## isnull.py
# Count the missing values in each column: build the boolean mask first,
# then sum it column-wise.
missing_mask = data_df.isnull()
missing_mask.sum()
	# Score the test rows with the fitted knn model and keep column 1 of the
	# probability matrix (presumably the positive class -- confirm against
	# knn.classes_).
	y_pred_proba = knn.predict_proba(X_test)[:,1]
	fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)

	# ROC curve against the chance diagonal.
	plt.figure(figsize = (10,8))
	plt.plot([0, 1], [0, 1], 'k--', label='Chance')
	plt.plot(fpr, tpr, label='Knn')
	plt.xlabel('FPR')
	plt.ylabel('TPR')
	# Take k from the estimator so the title cannot disagree with the model.
	plt.title('Knn(n_neighbors = {}) ROC curve'.format(knn.n_neighbors))
	# The label= arguments above are only drawn once legend() is called.
	plt.legend()
	plt.show()

	# The precision/recall points must exist before their AUC is computed;
	# calling auc(recall, precision) first raised a NameError.
	# NOTE: this rebinds `thresholds`, replacing the ROC thresholds.
	precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba)

	# calculate precision-recall AUC
	auc_prc = auc(recall, precision)
	print(auc_prc)

	# A no-skill classifier has precision equal to the share of positives in
	# the test labels (label 1 is the default positive class), not a fixed 0.5.
	positive_rate = np.mean(np.asarray(y_test) == 1)

	plt.figure(figsize = (10,8))
	plt.plot([0, 1], [positive_rate, positive_rate], 'k--', label = 'No skill')
	plt.plot(recall, precision, label = 'Knn')
	plt.xlabel('recall')
	plt.ylabel('precision')
	plt.title('Knn(n_neighbors = {}) PRC curve'.format(knn.n_neighbors))
	plt.legend()
	plt.show()
	import numpy as np
	import pandas as pd
	from sklearn.model_selection import train_test_split
	from sklearn.preprocessing import StandardScaler
	from sklearn.neighbors import KNeighborsClassifier
	from sklearn.metrics import confusion_matrix
	from sklearn.metrics import classification_report
	from sklearn.metrics import roc_curve
	from sklearn.metrics import roc_auc_score
	from sklearn.metrics import precision_recall_curve
	from sklearn.metrics import auc
	import matplotlib.pyplot as plt
	import seaborn as sns
	%matplotlib inline
	# Load the heart-disease CSV into a DataFrame.
	data_file_path = '../input/heart-disease-uci/heart.csv'
	data_df = pd.read_csv(data_file_path)

	#To get information on the number of entries and the datatypes of the features
	# (info() prints this summary; head() alone does not show it).
	data_df.info()
	data_df.head()

	# 2. Distribution of the target variable.
	# Pass the column by keyword: a positional Series is bound to `data` (not
	# `x`) by newer seaborn releases and triggers a warning on older ones.
	sns.countplot(x='target', data=data_df)

	# Add labels
	plt.title('Countplot of Target')
	plt.xlabel('target')
	plt.ylabel('Patients')
	plt.show()
	# Separate the label column from the feature columns.
	y = data_df["target"].values
	x = data_df.drop(["target"], axis = 1)

	# Splitting into train and test: 70% training and 30% test.
	# The split is done BEFORE scaling so the scaler never sees the test rows.
	# stratify=y keeps the class proportions equal in both parts and
	# random_state makes the split reproducible.
	X_train, X_test, y_train, y_test = train_test_split(
		x, y, test_size = 0.3, stratify = y, random_state = 42
	)

	#Scaling - mandatory for knn
	# The mean/std are learned from the training rows only and then applied
	# unchanged to the test rows, so no test statistics leak into training.
	from sklearn.preprocessing import StandardScaler
	ss = StandardScaler()
	X_train = ss.fit_transform(X_train)
	X_test = ss.transform(X_test)

	# Keep `x` as a scaled array (as before) for any later code that uses it.
	x = ss.transform(x)
	# Accuracy on the training and test splits for every k tried.
	train_score = []
	test_score = []
	k_vals = []

	# Sweep k = 1..20. The loop body must be indented under the `for`;
	# at the same level as the loop header it is a syntax error.
	for k in range(1, 21):
		k_vals.append(k)
		knn = KNeighborsClassifier(n_neighbors = k)
		knn.fit(X_train, y_train)

		tr_score = knn.score(X_train, y_train)
		train_score.append(tr_score)

		te_score = knn.score(X_test, y_test)
		test_score.append(te_score)

	## score that comes from the testing set only
	max_test_score = max(test_score)
	test_scores_ind = [i for i, v in enumerate(test_score) if v == max_test_score]
	# Look the winning k values up in k_vals rather than assuming index + 1,
	# so the report stays right if the sweep range changes.
	best_k_values = [k_vals[i] for i in test_scores_ind]
	print('Max test score {} and k = {}'.format(max_test_score * 100, best_k_values))