SuperKogito/K-nearest-neighbor.py

## K-nearest-neighbor.py
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn import neighbors, datasets
from matplotlib.colors import ListedColormap
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
# filter warnings
warnings.filterwarnings("ignore")


def plot_correlation(data):
    '''
    Draw the correlation matrix of ``data`` as an annotated heatmap.

    The heatmap is shown on screen and afterwards written to ``corr.png``
    in the current working directory.

    Side effect: the global ``rcParams['figure.figsize']`` is set to
    (15, 20) and stays that way after the call.
    '''
    # default size for the figure created below (and any later figure)
    rcParams['figure.figsize'] = (15, 20)
    heatmap_figure = plt.figure()

    # column-vs-column correlations, each cell annotated with two decimals
    correlations = data.corr()
    sns.heatmap(correlations, annot=True, fmt=".2f")

    plt.show()
    heatmap_figure.savefig('corr.png')

def plot_densities(data):
    '''
    Plot each feature's density for Outcome = 0 against Outcome = 1.

    One subplot is drawn per feature column, i.e. every column of ``data``
    except 'Outcome'. Rows with ``Outcome == 0`` are drawn in red and rows
    with ``Outcome == 1`` in green. The figure is shown on screen and then
    saved as ``densities.png``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain an 'Outcome' column; all other columns are treated as
        features.

    Side effect: the global ``rcParams['figure.figsize']`` is set to
    (15, 20).
    '''
    # change fig size to fit all subplots beautifully
    rcParams['figure.figsize'] = 15, 20

    # take the features from the frame that was passed in instead of a
    # module-level list, so the function works for any frame it is given
    feature_names = [column for column in data.columns if column != 'Outcome']
    if not feature_names:
        # nothing to plot; plt.subplots() cannot create zero rows
        return

    # separate data based on outcome values
    outcome_0 = data[data['Outcome'] == 0]
    outcome_1 = data[data['Outcome'] == 1]

    # init figure: one row per feature; squeeze=False keeps `axs` 2-D even
    # when there is a single feature
    fig, axs = plt.subplots(len(feature_names), 1, squeeze=False)
    fig.suptitle('Features densities for different outcomes 0/1')
    plt.subplots_adjust(left=0.25, right=0.9, bottom=0.1, top=0.95,
                        wspace=0.2, hspace=0.9)

    # plot densities for outcomes, both curves of a feature on the same axes
    for ax, column_name in zip(axs[:, 0], feature_names):
        outcome_0[column_name].plot(kind='density', ax=ax, subplots=True,
                                    sharex=False, color="red", legend=True,
                                    label=column_name + ' for Outcome = 0')
        outcome_1[column_name].plot(kind='density', ax=ax, subplots=True,
                                    sharex=False, color="green", legend=True,
                                    label=column_name + ' for Outcome = 1')
        ax.set_xlabel(column_name + ' values')
        ax.set_title(column_name + ' density')
        ax.grid(True)
    plt.show()
    fig.savefig('densities.png')

def accuracy(k, X_train, y_train, X_test, y_test):
    '''
    Return the test accuracy of a k-nearest-neighbours classifier.

    A ``KNeighborsClassifier`` with ``n_neighbors=k`` (every other setting
    left at its default) is fitted on ``X_train`` / ``y_train``. The value
    returned is ``accuracy_score`` of its predictions for ``X_test``
    measured against ``y_test``.
    '''
    # fit() returns the fitted estimator, so build and train in one step
    model = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)

    # labels the fitted model assigns to the held-out samples
    predicted_labels = model.predict(X_test)

    return accuracy_score(y_test, predicted_labels)

def classify_and_plot(X, y):
    '''
    Split the data, pick k, then fit, plot and evaluate a KNN classifier.

    Steps:
      1. Hold out 33% of the samples as a test set (``random_state=41``).
      2. Try k = 1, 2, ... up to (but excluding) ``len(X) // 2`` and keep
         the k with the highest test accuracy. At least k = 1 is always
         tried and k never exceeds the number of training samples.
      3. For ``weights`` in 'uniform' and 'distance': fit on the training
         set, draw the decision regions with all samples of ``X`` on top,
         show the figure, save it as '<weights>.png', and print the
         classification report and accuracy on the test set.

    Parameters
    ----------
    X : numpy array of shape (n_samples, 2)
        Columns 0 and 1 span the plotted plane, and the classifier is
        evaluated on a two-column mesh, so exactly two features are needed.
    y : array of labels, one per row of ``X``.

    Side effect: the global ``rcParams['figure.figsize']`` is set to (5, 5).

    NOTE(review): k is selected on the same test set that is used for the
    final report, so the reported scores are optimistic. A separate
    validation split or cross-validation would avoid that.
    '''
    # split data into training and testing set
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, random_state=41)

    # The candidate range is derived from the data passed in rather than
    # from a module-level row count. max(..., 2) guarantees k = 1 is tried
    # (np.argmax fails on an empty list); the min() keeps k within the
    # number of training samples.
    upper_k = min(max(len(X) // 2, 2), len(X_train) + 1)
    candidate_ks = range(1, upper_k)
    accuracies = [accuracy(k, X_train, y_train, X_test, y_test)
                  for k in candidate_ks]
    # argmax returns the first maximum, so ties go to the smallest k
    best_n_neighbours = candidate_ks[int(np.argmax(accuracies))]
    print('----------------------------------------------------------------------')
    print('For best accuracy use k = ', best_n_neighbours)
    print('----------------------------------------------------------------------')

    # init vars
    n_neighbors = best_n_neighbours
    h = .02  # step size in the mesh

    # Create color maps
    cmap_light = ListedColormap(['#FFAAAA', '#AAAAFF'])
    cmap_bold = ListedColormap(['#FF0000', '#0000FF'])

    # The mesh [x_min, x_max] x [y_min, y_max] only depends on X, not on the
    # weighting scheme, so it is built once outside the loop.
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    mesh_points = np.c_[xx.ravel(), yy.ravel()]

    rcParams['figure.figsize'] = 5, 5
    for weights in ['uniform', 'distance']:
        # we create an instance of Neighbours Classifier and fit the data.
        clf = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights)
        clf.fit(X_train, y_train)

        # Plot the decision boundary: each mesh point is coloured by the
        # class the classifier predicts for it.
        Z = clf.predict(mesh_points).reshape(xx.shape)
        fig = plt.figure()
        plt.pcolormesh(xx, yy, Z, cmap=cmap_light)

        # Plot all samples (training and test) on top, coloured by label.
        # Per the original note: x-axis = 'Glucose', y-axis = "BMI"; this
        # depends on which columns the caller selected.
        plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold, edgecolor='k', s=20)
        plt.xlim(xx.min(), xx.max())
        plt.ylim(yy.min(), yy.max())
        plt.title("0/1 outcome classification (k = %i, weights = '%s')" % (n_neighbors, weights))
        plt.show()
        fig.savefig(weights + '.png')

        # evaluate
        y_expected = y_test
        y_predicted = clf.predict(X_test)

        # print results
        print('----------------------------------------------------------------------')
        print('Classification report')
        print('----------------------------------------------------------------------')
        print('\n', classification_report(y_expected, y_predicted))
        print('----------------------------------------------------------------------')
        # Score the predictions of *this* classifier. Re-fitting through
        # accuracy() would always use uniform weights and so would not match
        # the report above when weights == 'distance'.
        print('Accuracy = %5s' % round(accuracy_score(y_expected, y_predicted), 3))
        print('----------------------------------------------------------------------')


# read the dataset; the column labels are kept in the module-level `names`
data  = pd.read_csv('diabetes.csv')
names = list(data.columns)

# exploratory plots: correlation heatmap, then per-feature densities
plot_correlation(data)
plot_densities(data)

# Keep the first `rows_nbr` rows and two feature columns (positions 1 and 5)
# for the KNN classifier; the labels come from column position 8.
rows_nbr = 30  # use data.shape[0] instead to take every row
X_prime  = data.iloc[:rows_nbr, [1, 5]].to_numpy()
# features are passed on unscaled (preprocessing.scale(X_prime) would
# standardise them first)
X        = X_prime
y        = data.iloc[:rows_nbr, 8].to_numpy()

# split, select k, fit, plot and report
classify_and_plot(X, y)
	import warnings
	import numpy as np
	import pandas as pd
	import seaborn as sns
	from pylab import rcParams
	import matplotlib.pyplot as plt
	from sklearn import preprocessing
	from sklearn import neighbors, datasets
	from matplotlib.colors import ListedColormap
	from sklearn.neighbors import KNeighborsClassifier
	from sklearn.model_selection import train_test_split
	from sklearn.metrics import accuracy_score, classification_report
	# filter warnings
	warnings.filterwarnings("ignore")


	def plot_correlation(data):
	    '''
	    Draw the correlation matrix of ``data`` as an annotated heatmap.

	    The heatmap is shown on screen and afterwards written to ``corr.png``
	    in the current working directory.

	    Side effect: the global ``rcParams['figure.figsize']`` is set to
	    (15, 20) and stays that way after the call.
	    '''
	    # init figure size
	    rcParams['figure.figsize'] = 15, 20
	    fig = plt.figure()

	    # column-vs-column correlations, each cell annotated with two decimals
	    sns.heatmap(data.corr(), annot=True, fmt=".2f")
	    plt.show()
	    fig.savefig('corr.png')

	def plot_densities(data):
	    '''
	    Plot each feature's density for Outcome = 0 against Outcome = 1.

	    One subplot is drawn per feature column, i.e. every column of ``data``
	    except 'Outcome'. Rows with ``Outcome == 0`` are drawn in red and rows
	    with ``Outcome == 1`` in green. The figure is shown on screen and then
	    saved as ``densities.png``.

	    Parameters
	    ----------
	    data : pandas.DataFrame
	        Must contain an 'Outcome' column; all other columns are treated as
	        features.

	    Side effect: the global ``rcParams['figure.figsize']`` is set to
	    (15, 20).
	    '''
	    # change fig size to fit all subplots beautifully
	    rcParams['figure.figsize'] = 15, 20

	    # take the features from the frame that was passed in instead of a
	    # module-level list, so the function works for any frame it is given
	    feature_names = [column for column in data.columns if column != 'Outcome']
	    if not feature_names:
	        # nothing to plot; plt.subplots() cannot create zero rows
	        return

	    # separate data based on outcome values
	    outcome_0 = data[data['Outcome'] == 0]
	    outcome_1 = data[data['Outcome'] == 1]

	    # init figure: one row per feature; squeeze=False keeps `axs` 2-D even
	    # when there is a single feature
	    fig, axs = plt.subplots(len(feature_names), 1, squeeze=False)
	    fig.suptitle('Features densities for different outcomes 0/1')
	    plt.subplots_adjust(left=0.25, right=0.9, bottom=0.1, top=0.95,
	                        wspace=0.2, hspace=0.9)

	    # plot densities for outcomes, both curves of a feature on the same axes
	    for ax, column_name in zip(axs[:, 0], feature_names):
	        outcome_0[column_name].plot(kind='density', ax=ax, subplots=True,
	                                    sharex=False, color="red", legend=True,
	                                    label=column_name + ' for Outcome = 0')
	        outcome_1[column_name].plot(kind='density', ax=ax, subplots=True,
	                                    sharex=False, color="green", legend=True,
	                                    label=column_name + ' for Outcome = 1')
	        ax.set_xlabel(column_name + ' values')
	        ax.set_title(column_name + ' density')
	        ax.grid(True)
	    plt.show()
	    fig.savefig('densities.png')

	def accuracy(k, X_train, y_train, X_test, y_test):
	    '''
	    Return the test accuracy of a k-nearest-neighbours classifier.

	    A ``KNeighborsClassifier`` with ``n_neighbors=k`` (every other setting
	    left at its default) is fitted on ``X_train`` / ``y_train``. The value
	    returned is ``accuracy_score`` of its predictions for ``X_test``
	    measured against ``y_test``.
	    '''
	    # instantiate learning model and fit data
	    knn = KNeighborsClassifier(n_neighbors=k)
	    knn.fit(X_train, y_train)

	    # predict the response
	    pred = knn.predict(X_test)

	    # evaluate and return accuracy
	    return accuracy_score(y_test, pred)

	def classify_and_plot(X, y):
	    '''
	    Split the data, pick k, then fit, plot and evaluate a KNN classifier.

	    Steps:
	      1. Hold out 33% of the samples as a test set (``random_state=41``).
	      2. Try k = 1, 2, ... up to (but excluding) ``len(X) // 2`` and keep
	         the k with the highest test accuracy. At least k = 1 is always
	         tried and k never exceeds the number of training samples.
	      3. For ``weights`` in 'uniform' and 'distance': fit on the training
	         set, draw the decision regions with all samples of ``X`` on top,
	         show the figure, save it as '<weights>.png', and print the
	         classification report and accuracy on the test set.

	    Parameters
	    ----------
	    X : numpy array of shape (n_samples, 2)
	        Columns 0 and 1 span the plotted plane, and the classifier is
	        evaluated on a two-column mesh, so exactly two features are needed.
	    y : array of labels, one per row of ``X``.

	    Side effect: the global ``rcParams['figure.figsize']`` is set to (5, 5).

	    NOTE(review): k is selected on the same test set that is used for the
	    final report, so the reported scores are optimistic. A separate
	    validation split or cross-validation would avoid that.
	    '''
	    # split data into training and testing set
	    X_train, X_test, y_train, y_test = train_test_split(
	        X, y, test_size=0.33, random_state=41)

	    # The candidate range is derived from the data passed in rather than
	    # from a module-level row count. max(..., 2) guarantees k = 1 is tried
	    # (np.argmax fails on an empty list); the min() keeps k within the
	    # number of training samples.
	    upper_k = min(max(len(X) // 2, 2), len(X_train) + 1)
	    candidate_ks = range(1, upper_k)
	    accuracies = [accuracy(k, X_train, y_train, X_test, y_test)
	                  for k in candidate_ks]
	    # argmax returns the first maximum, so ties go to the smallest k
	    best_n_neighbours = candidate_ks[int(np.argmax(accuracies))]
	    print('----------------------------------------------------------------------')
	    print('For best accuracy use k = ', best_n_neighbours)
	    print('----------------------------------------------------------------------')

	    # init vars
	    n_neighbors = best_n_neighbours
	    h = .02  # step size in the mesh

	    # Create color maps
	    cmap_light = ListedColormap(['#FFAAAA', '#AAAAFF'])
	    cmap_bold = ListedColormap(['#FF0000', '#0000FF'])

	    # The mesh [x_min, x_max] x [y_min, y_max] only depends on X, not on the
	    # weighting scheme, so it is built once outside the loop.
	    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
	    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
	    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
	                         np.arange(y_min, y_max, h))
	    mesh_points = np.c_[xx.ravel(), yy.ravel()]

	    rcParams['figure.figsize'] = 5, 5
	    for weights in ['uniform', 'distance']:
	        # we create an instance of Neighbours Classifier and fit the data.
	        clf = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights)
	        clf.fit(X_train, y_train)

	        # Plot the decision boundary: each mesh point is coloured by the
	        # class the classifier predicts for it.
	        Z = clf.predict(mesh_points).reshape(xx.shape)
	        fig = plt.figure()
	        plt.pcolormesh(xx, yy, Z, cmap=cmap_light)

	        # Plot all samples (training and test) on top, coloured by label.
	        # Per the original note: x-axis = 'Glucose', y-axis = "BMI"; this
	        # depends on which columns the caller selected.
	        plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold, edgecolor='k', s=20)
	        plt.xlim(xx.min(), xx.max())
	        plt.ylim(yy.min(), yy.max())
	        plt.title("0/1 outcome classification (k = %i, weights = '%s')" % (n_neighbors, weights))
	        plt.show()
	        fig.savefig(weights + '.png')

	        # evaluate
	        y_expected = y_test
	        y_predicted = clf.predict(X_test)

	        # print results
	        print('----------------------------------------------------------------------')
	        print('Classification report')
	        print('----------------------------------------------------------------------')
	        print('\n', classification_report(y_expected, y_predicted))
	        print('----------------------------------------------------------------------')
	        # Score the predictions of *this* classifier. Re-fitting through
	        # accuracy() would always use uniform weights and so would not match
	        # the report above when weights == 'distance'.
	        print('Accuracy = %5s' % round(accuracy_score(y_expected, y_predicted), 3))
	        print('----------------------------------------------------------------------')


	# read the dataset; the column labels are kept in the module-level `names`
	data = pd.read_csv('diabetes.csv')
	names = list(data.columns)

	# exploratory plots: correlation heatmap, then per-feature densities
	plot_correlation(data)
	plot_densities(data)

	# Keep the first `rows_nbr` rows and two feature columns (positions 1 and 5)
	# for the KNN classifier; the labels come from column position 8.
	rows_nbr = 30  # use data.shape[0] instead to take every row
	X_prime = data.iloc[:rows_nbr, [1, 5]].to_numpy()
	# features are passed on unscaled (preprocessing.scale(X_prime) would
	# standardise them first)
	X = X_prime
	y = data.iloc[:rows_nbr, 8].to_numpy()

	# split, select k, fit, plot and report
	classify_and_plot(X, y)