Last active
August 29, 2015 14:03
-
-
Save leonaburime/ef94c529cc9b7aa2de89 to your computer and use it in GitHub Desktop.
k-Nearest Neighbors Algorithm
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from __future__ import division | |
from pprint import pprint | |
import numpy as np | |
import pandas as pd | |
from sklearn.neighbors import KNeighborsClassifier | |
#References - http://saravananthirumuruganathan.wordpress.com/2010/05/17/a-detailed-introduction-to-k-nearest-neighbor-knn-algorithm/ | |
#http://www.saedsayad.com/k_nearest_neighbors.htm | |
# Training data: one (age, loan_amount, defaulted?) record per borrower.
# NOTE(review): `input` shadows the builtin of the same name; kept as-is
# because kNN.solveKNN references these module-level names as defaults.
_SAMPLES = [
    # Age  Loan     Default
    (25,   40000,  'N'),
    (35,   60000,  'N'),
    (45,   80000,  'N'),
    (20,   20000,  'N'),
    (35,  120000,  'N'),
    (52,   18000,  'N'),
    (23,   95000,  'Y'),
    (40,   62000,  'Y'),
    (60,  100000,  'Y'),
    (48,  220000,  'Y'),
    (33,  150000,  'Y'),
]

# Feature rows: [age, loan].
input = [[age, loan] for age, loan, _ in _SAMPLES]

# Class labels, aligned with `input` row-for-row.
output = [label for _, _, label in _SAMPLES]

# The query point to classify: [age, loan].
predict = [48, 142000]
class kNN:
    """k-Nearest Neighbors demo classifier.

    Min-max normalizes the features to [0, 1], computes Euclidean
    distances by hand, predicts by majority vote among the k nearest
    training rows, and cross-checks the result against sklearn's
    KNeighborsClassifier.
    """

    k = 2  # default neighbor count, used for the method defaults below

    def __init__(self):
        self.k = 2

    # SKLearn Module's Implementation of k-Nearest Neighbors
    def sklearnNearestNeighbors(self, input, output, norm_predict, predict, k=k):
        """Fit sklearn's KNeighborsClassifier and print its prediction.

        input        -- normalized feature rows (2-D array-like)
        output       -- numpy array of class labels, one per input row
        norm_predict -- normalized query point (1-D)
        predict      -- original un-normalized query, for display only
        k            -- number of neighbors
        """
        self.skNeighbors = KNeighborsClassifier(n_neighbors=k)
        self.skNeighbors.fit(input, output)
        print("\nSKLearn: Minimum distances with their labels")
        # FIX: sklearn expects a 2-D array of query rows, not a bare
        # 1-D vector -- wrap the single query point.
        query = np.array(norm_predict).reshape(1, -1)
        min_values = self.skNeighbors.kneighbors(query)
        # min_values = (distances, indices), each of shape (1, k).
        # FIX: index row 0 of the neighbor-index matrix so we zip k
        # (distance, label) pairs instead of one malformed pair.
        pprint(list(zip(min_values[0][0].tolist(),
                        list(output[min_values[1][0]]))))
        prediction_list = self.skNeighbors.predict(query)
        print("\nSKLearn Prediction of %s with k = %d is %s \n" % (predict, k, prediction_list[0]))

    def solveKNN(self, input=input, output=output, predict=predict, k=k):
        """Classify `predict` via hand-rolled kNN, then verify with sklearn."""
        # Float arrays so the min-max scaling below works element-wise.
        np_input = np.array(input).astype(float)
        np_predict = np.array(predict).astype(float)
        np_output = np.array(output)
        # Per-column extremes for min-max normalization to [0, 1].
        max_value, min_value = np_input.max(axis=0), np_input.min(axis=0)
        norm_input = (np_input - min_value) / (max_value - min_value)
        # FIX: normalize the prepared float array, not the raw `predict` list.
        norm_predict = (np_predict - min_value) / (max_value - min_value)
        # Euclidean distance from the query to every training row.
        distances = np.array([np.linalg.norm(norm_predict - v) for v in norm_input])
        pprint(list(distances))
        # Indices and values of the k smallest distances.
        min_indices, min_values = distances.argsort()[:k], sorted(distances)[:k]
        # Labels of the k nearest training rows.
        nearest = list(np_output[min_indices])
        print("\nMinimum distances with their labels")
        # FIX: the header above was printed with no data after it --
        # show the k (distance, label) pairs it promises.
        pprint(list(zip(min_values, nearest)))
        # Majority vote: the most common label among the k nearest.
        print("\nPrediction of %s with k=%d is %s " % (predict, k, max(nearest, key=nearest.count)))
        self.sklearnNearestNeighbors(norm_input, np_output, norm_predict, predict, k)
#Comments for tutorials
"""
Steps to run k-Nearest Neighbor
1. import kNN
2. knn = kNN.kNN()
3. knn.solveKNN()

Normalized input should look like
input = [
    #Age    #Loan
    [ 0.125  0.109]
    [ 0.375  0.208]
    [ 0.625  0.307]
    [ 0.     0.01 ]
    [ 0.375  0.505]
    [ 0.8    0.   ]
    [ 0.075  0.381]
    [ 0.5    0.218]
    [ 1.     0.406]
    [ 0.7    1.   ]
    [ 0.325  0.653]
]

The result set you get should look like this using predict = [48, 142000]
distances = [
    (0.3159611532261599, 'N'),
    (0.3427631575852688, 'N'),
    (0.36500829549319175, 'Y'),
    (0.37708549916305584, 'Y'),
    (0.3861386138613863, 'Y'),
    (0.44367484479639735, 'Y'),
    (0.5200122747640707, 'N'),
    (0.6219532147935409, 'N'),
    (0.6669046778427162, 'Y'),
    (0.7652450603896376, 'N'),
    (0.9245367272230537, 'N')
]
"""
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment