Skip to content

Instantly share code, notes, and snippets.

Last active January 4, 2022 16:07
Show Gist options
  • Save SuperKogito/a8b1980105557f034ac9b4dadc331380 to your computer and use it in GitHub Desktop.
Save SuperKogito/a8b1980105557f034ac9b4dadc331380 to your computer and use it in GitHub Desktop.
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn import neighbors, datasets
from matplotlib.colors import ListedColormap
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
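# filter warnings
# NOTE(review): no warnings filter call follows this comment in the visible
# source, so the `warnings` import is currently unused.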
def plot_correlation(data):
    """
    Plot the correlation matrix of ``data`` to explore dependency between features.

    A new matplotlib figure is opened and the matrix returned by
    ``data.corr()`` is drawn on it as an annotated seaborn heatmap
    (two decimals per cell).

    Args:
        data: object exposing ``corr()``; the script in this file passes the
              pandas DataFrame loaded from the CSV.

    Returns:
        None. The figure is left as the current matplotlib figure; it is
        neither shown nor saved here.
    """
    # init figure size (this changes the global matplotlib default)
    rcParams['figure.figsize'] = 15, 20

    # open a fresh figure so the heatmap does not draw over an existing one
    plt.figure()
    sns.heatmap(data.corr(), annot=True, fmt=".2f")
def plot_densities(data):
    """
    Plot features densities depending on the outcome values.

    Every column of ``data`` except the last one is treated as a feature.
    Each feature gets its own subplot row showing two density curves: red for
    the rows where ``data['Outcome'] == 0`` and green for the rows where
    ``data['Outcome'] == 1``.

    Args:
        data: pandas DataFrame with an ``'Outcome'`` column; the last column
              is excluded from the plotted features.

    Returns:
        None. The figure is neither shown nor saved here.
    """
    # change fig size to fit all subplots (this changes the global default)
    rcParams['figure.figsize'] = 15, 20

    # derive the features from the frame itself instead of a module global
    feature_names = list(data.columns)[:-1]
    if not feature_names:
        # nothing to plot; a zero-row subplot grid would be invalid
        return

    # separate data based on outcome values
    outcome_0 = data[data['Outcome'] == 0]
    outcome_1 = data[data['Outcome'] == 1]

    # init figure: one row per feature; squeeze=False keeps `axs` 2-D even
    # when there is a single feature
    fig, axs = plt.subplots(len(feature_names), 1, squeeze=False)
    fig.suptitle('Features densities for different outcomes 0/1')
    plt.subplots_adjust(left=0.25, right=0.9, bottom=0.1, top=0.95,
                        wspace=0.2, hspace=0.9)

    # (subset, curve colour, label suffix) for each outcome value
    curves = (
        (outcome_0, "red", ' for Outcome = 0'),
        (outcome_1, "green", ' for Outcome = 1'),
    )

    # plot densities for outcomes
    for ax, column_name in zip(axs[:, 0], feature_names):
        for subset, color, label_suffix in curves:
            subset[column_name].plot(kind='density', ax=ax, subplots=True,
                                     sharex=False, color=color, legend=True,
                                     label=column_name + label_suffix)
        ax.set_xlabel(column_name + ' values')
        ax.set_title(column_name + ' density')
def accuracy(k, X_train, y_train, X_test, y_test):
    """
    Compute the accuracy of a k-nearest-neighbours classification for a given k.

    Args:
        k: number of neighbours handed to ``KNeighborsClassifier``.
        X_train: training samples used to fit the classifier.
        y_train: labels of the training samples.
        X_test: samples to predict.
        y_test: expected labels for ``X_test``.

    Returns:
        The value of ``accuracy_score(y_test, predictions)``.
    """
    # instantiate learning model and fit data; the model must be fitted
    # before predict() can be called
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)

    # predict the response
    pred = knn.predict(X_test)

    # evaluate and return accuracy
    return accuracy_score(y_test, pred)
def classify_and_plot(X, y):
    """
    Split data, fit, classify, plot and evaluate results.

    Steps:
      1. Split ``X``/``y`` into train and test sets (33 % test, fixed seed).
      2. Pick the k giving the highest test accuracy (first one on ties).
      3. For ``weights`` in ``'uniform'`` and ``'distance'``: fit a k-NN
         classifier, draw its decision regions over a mesh together with all
         samples, save the figure as ``'<weights>.png'`` and print the
         classification report and accuracy on the test set.

    Args:
        X: 2-D numpy array of samples. The mesh fed to the classifier has
           exactly two columns, so ``X`` must have two feature columns.
        y: labels for ``X``; the colour maps define two colours, so two
           classes are presumably expected.

    Returns:
        None. Results are printed and the figures are written to the current
        working directory.

    NOTE(review): the original indentation was lost; the evaluation/printing
    code is assumed to belong inside the ``weights`` loop.
    """
    # split data into training and testing set
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, random_state=41)

    # search k below half the sample count; derived from X itself rather than
    # a module global, and always trying at least k = 1
    candidate_ks = range(1, max(2, len(X) // 2))
    scores = [accuracy(k, X_train, y_train, X_test, y_test)
              for k in candidate_ks]
    # candidates start at 1, hence the +1 on the argmax index
    best_n_neighbours = int(np.argmax(scores)) + 1
    print('For best accuracy use k = ', best_n_neighbours)

    # init vars
    n_neighbors = best_n_neighbours
    h = .02  # step size in the mesh

    # Create color maps
    cmap_light = ListedColormap(['#FFAAAA', '#AAAAFF'])
    cmap_bold = ListedColormap(['#FF0000', '#0000FF'])
    rcParams['figure.figsize'] = 5, 5

    # The mesh [x_min, x_max]x[y_min, y_max] does not depend on the weighting
    # scheme, so it is built once, outside the loop.
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    grid_points = np.c_[xx.ravel(), yy.ravel()]

    for weights in ['uniform', 'distance']:
        # we create an instance of Neighbours Classifier and fit the data.
        clf = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights)
        clf.fit(X_train, y_train)

        # Plot the decision boundary: assign a color to each point in the mesh
        Z = clf.predict(grid_points).reshape(xx.shape)

        # Put the result into a color plot
        fig = plt.figure()
        plt.pcolormesh(xx, yy, Z, cmap=cmap_light)

        # Plot also the samples, x-axis = 'Glucose', y-axis = "BMI"
        plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold, edgecolor='k', s=20)
        plt.xlim(xx.min(), xx.max())
        plt.ylim(yy.min(), yy.max())
        plt.title("0/1 outcome classification (k = %i, weights = '%s')"
                  % (n_neighbors, weights))
        fig.savefig(weights + '.png')

        # evaluate
        y_expected = y_test
        y_predicted = clf.predict(X_test)

        # print results; the accuracy is scored on the predictions of the
        # classifier just reported, so it agrees with the report above it
        print('Classification report')
        print('\n', classification_report(y_expected, y_predicted))
        print('Accuracy = %5s'
              % round(accuracy_score(y_expected, y_predicted), 3))
# read the dataset from the working directory and keep its column names
data = pd.read_csv('diabetes.csv')
names = list(data.columns)

# NOTE(review): plot_correlation / plot_densities are defined in this file but
# are not invoked anywhere in the visible script.

# number of leading rows fed to the classifier (data.shape[0] would use all)
rows_nbr = 30

# features: columns 1 and 5 of the first `rows_nbr` rows
# (presumably Glucose and BMI; confirm against the CSV header)
X_prime = np.array(data.head(rows_nbr).iloc[:, [1, 5]])

# the raw values are used as-is; scaling with preprocessing.scale(X_prime)
# is left disabled
X = X_prime

# labels: column 8 of the same rows
y = np.array(data.head(rows_nbr).iloc[:, 8])

# run the full classify / evaluate / plot pipeline
classify_and_plot(X, y)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment