# Skip to content

# Instantly share code, notes, and snippets.

# Snippet: import
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import confusion_matrix
from sklearn import metrics
import matplotlib.pyplot as plt
# Snippet: read_info
# Read the diabetes CSV from the working directory into a DataFrame and
# preview its first rows (the preview only renders in an interactive session).
data_file_path = 'diabetes.csv'
data_df = pd.read_csv(filepath_or_buffer=data_file_path)
data_df.head(n=5)
# Snippet: target_input
# Feature matrix: every column except the "Outcome" label.
x = data_df.drop(columns=["Outcome"])
# Target vector: the "Outcome" column as a plain array.
y = data_df["Outcome"].values
# Snippet: gist:5f1e1c7e7fe997b15dac6d523f620c66
from sklearn.preprocessing import StandardScaler

# Divide into training and test data first (70% training, 30% test) so the
# scaler below is fitted without seeing the held-out rows.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3)

# Standardise the feature splits themselves (not `data_df`, which still
# contains the "Outcome" label) so that the scaled values are what the
# classifiers are trained and scored on. The scaler learns its mean/variance
# from the training split only and is then applied unchanged to the test split.
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)
# Snippet: loop_k_values
# Sweep the neighbourhood size k over 1, 6, 11, ..., 196 and record the
# accuracy of a k-NN classifier on both the training and the test split.
# The three lists stay index-aligned: entry i of each belongs to k_vals[i].
train_score = []
test_score = []
k_vals = []
for k in range(1, 201, 5):
    k_vals.append(k)
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    # Kept as a module-level name in case later cells read the predictions.
    y_pred = knn.predict(X_test)
    tr_score = knn.score(X_train, y_train)
    te_score = knn.score(X_test, y_test)
    # Store both scores so the plotting step receives one value per k.
    train_score.append(tr_score)
    test_score.append(te_score)
# Snippet: plot_k_values
# Plot training and test score against k on a single 10x5 figure.
fig = plt.figure(figsize=(10, 5))
for scores, colour, caption in (
    (train_score, 'r', "training score"),
    (test_score, 'b', 'test score'),
):
    plt.plot(k_vals, scores, color=colour, label=caption)
plt.xlabel('Different Values of K')
plt.ylabel('Model score')
# Anchor the legend in figure coordinates (1, 1), i.e. the figure's
# top-right corner, rather than in axes coordinates.
plt.legend(bbox_to_anchor=(1, 1), bbox_transform=fig.transFigure)
plt.show()
# Snippet: knn_score
# Build a 14-neighbour classifier and fit it on the training split
# (fit() returns the estimator itself, so the calls can be chained).
knn = KNeighborsClassifier(n_neighbors=14).fit(X_train, y_train)
# Mean accuracy on the held-out split; the value is only displayed when this
# is the last expression of an interactive cell.
knn.score(X_test, y_test)
# Snippet: 10_roc_curve.py
# Predicted probability of the second class (column index 1) for each test row.
y_pred_proba = knn.predict_proba(X_test)[:, 1]
# ROC curve points. `roc_curve` is reached through the `metrics` module that
# the file imports, because the bare name is not imported anywhere visible.
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_proba)
# Snippet: read_info
# Load the heart-disease CSV into a DataFrame.
data_file_path = '../input/heart-disease-uci/heart.csv'
data_df = pd.read_csv(filepath_or_buffer=data_file_path)

# Summary of entry count and per-column dtypes (info() prints it itself).
data_df.info()

# Count of missing values in each column.
missing_per_column = data_df.isnull().sum()
print(missing_per_column)
# Snippet: gist:d79869206c52d46885acb0be7f94f88c
# 2. Distribution of the target variable.
# NOTE(review): `sns` is not imported in the visible part of this file;
# confirm that `import seaborn as sns` exists before this point.
# The column is passed by keyword: recent seaborn releases treat the first
# positional argument as `data` and accept the plotting variables as keywords only.
sns.countplot(x='target', data=data_df)
# Add labels
plt.title('Countplot of Target')
plt.xlabel('target')
plt.ylabel('Patients')
plt.show()