Skip to content

Instantly share code, notes, and snippets.

## score that comes from the testing set only
max_test_score = max(test_score)
test_scores_ind = [i for i, v in enumerate(test_score) if v == max_test_score]
print('Max test score {} and k = {}'.format(max_test_score * 100,list(map(lambda x: x+1, test_scores_ind))))
@purva91
purva91 / split_data
Last active September 10, 2019 12:42
y = data_df["target"].values
x = data_df.drop(["target"],axis=1)
#Scaling - mandatory for knn
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
x = ss.fit_transform(x)
#SPlitting into train and test
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.3) # 70% training and 30% test
#2. distribution of target variable.
sns.countplot(data_df['target'])
# Add labels
plt.title('Countplot of Target')
plt.xlabel('target')
plt.ylabel('Patients')
plt.show()
data_file_path = '../input/heart-disease-uci/heart.csv'
data_df = pd.read_csv(data_file_path)
#To get information on the number of entries and the datatypes of the features
data_df.info()
#To check for missing values
print(data_df.isnull().sum())
knn = KNeighborsClassifier(n_neighbors = 14)
#Fit the model
knn.fit(X_train,y_train)
#get the score
knn.score(X_test,y_test)
plt.figure(figsize=(10,5))
plt.xlabel('Different Values of K')
plt.ylabel('Model score')
plt.plot(k_vals, train_score, color = 'r', label = "training score")
plt.plot(k_vals, test_score, color = 'b', label = 'test score')
plt.legend(bbox_to_anchor=(1, 1),
bbox_transform=plt.gcf().transFigure)
plt.show()
train_score = []
test_score = []
k_vals = []
for k in range(1, 201, 5):
k_vals.append(k)
knn = KNeighborsClassifier(n_neighbors = k)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
tr_score = knn.score(X_train, y_train)
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
data_df = ss.fit_transform(data_df)
#Divide into training and test data
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.3) # 70% training and 30% test
y = data_df["Outcome"].values
x = data_df.drop(["Outcome"],axis=1)
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import confusion_matrix
from sklearn import metrics
import matplotlib.pyplot as plt