Last active
August 29, 2015 14:03
-
-
Save leonaburime/ef94c529cc9b7aa2de89 to your computer and use it in GitHub Desktop.
k-Nearest Neighbors Algorithm
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from __future__ import division | |
from pprint import pprint | |
import numpy as np | |
import pandas as pd | |
from sklearn.neighbors import KNeighborsClassifier | |
#References - http://saravananthirumuruganathan.wordpress.com/2010/05/17/a-detailed-introduction-to-k-nearest-neighbor-knn-algorithm/ | |
#http://www.saedsayad.com/k_nearest_neighbors.htm | |
# Training data: one (age, loan_amount, defaulted?) record per borrower.
# NOTE(review): `input` shadows the builtin of the same name; kept as-is
# because kNN.solveKNN references these module-level names as defaults.
_SAMPLES = [
    # Age  Loan     Default
    (25,   40000,  'N'),
    (35,   60000,  'N'),
    (45,   80000,  'N'),
    (20,   20000,  'N'),
    (35,  120000,  'N'),
    (52,   18000,  'N'),
    (23,   95000,  'Y'),
    (40,   62000,  'Y'),
    (60,  100000,  'Y'),
    (48,  220000,  'Y'),
    (33,  150000,  'Y'),
]

# Feature rows: [age, loan].
input = [[age, loan] for age, loan, _ in _SAMPLES]

# Class labels, aligned with `input` row-for-row.
output = [label for _, _, label in _SAMPLES]

# The query point to classify: [age, loan].
predict = [48, 142000]
class kNN:
    """k-Nearest Neighbors demo classifier.

    Min-max normalizes the features to [0, 1], computes Euclidean
    distances by hand, predicts by majority vote among the k nearest
    training rows, and cross-checks the result against sklearn's
    KNeighborsClassifier.
    """

    k = 2  # default neighbor count, used for the method defaults below

    def __init__(self):
        self.k = 2

    # SKLearn Module's Implementation of k-Nearest Neighbors
    def sklearnNearestNeighbors(self, input, output, norm_predict, predict, k=k):
        """Fit sklearn's KNeighborsClassifier and print its prediction.

        input        -- normalized feature rows (2-D array-like)
        output       -- numpy array of class labels, one per input row
        norm_predict -- normalized query point (1-D)
        predict      -- original un-normalized query, for display only
        k            -- number of neighbors
        """
        self.skNeighbors = KNeighborsClassifier(n_neighbors=k)
        self.skNeighbors.fit(input, output)
        print("\nSKLearn: Minimum distances with their labels")
        # FIX: sklearn expects a 2-D array of query rows, not a bare
        # 1-D vector -- wrap the single query point.
        query = np.array(norm_predict).reshape(1, -1)
        min_values = self.skNeighbors.kneighbors(query)
        # min_values = (distances, indices), each of shape (1, k).
        # FIX: index row 0 of the neighbor-index matrix so we zip k
        # (distance, label) pairs instead of one malformed pair.
        pprint(list(zip(min_values[0][0].tolist(),
                        list(output[min_values[1][0]]))))
        prediction_list = self.skNeighbors.predict(query)
        print("\nSKLearn Prediction of %s with k = %d is %s \n" % (predict, k, prediction_list[0]))

    def solveKNN(self, input=input, output=output, predict=predict, k=k):
        """Classify `predict` via hand-rolled kNN, then verify with sklearn."""
        # Float arrays so the min-max scaling below works element-wise.
        np_input = np.array(input).astype(float)
        np_predict = np.array(predict).astype(float)
        np_output = np.array(output)
        # Per-column extremes for min-max normalization to [0, 1].
        max_value, min_value = np_input.max(axis=0), np_input.min(axis=0)
        norm_input = (np_input - min_value) / (max_value - min_value)
        # FIX: normalize the prepared float array, not the raw `predict` list.
        norm_predict = (np_predict - min_value) / (max_value - min_value)
        # Euclidean distance from the query to every training row.
        distances = np.array([np.linalg.norm(norm_predict - v) for v in norm_input])
        pprint(list(distances))
        # Indices and values of the k smallest distances.
        min_indices, min_values = distances.argsort()[:k], sorted(distances)[:k]
        # Labels of the k nearest training rows.
        nearest = list(np_output[min_indices])
        print("\nMinimum distances with their labels")
        # FIX: the header above was printed with no data after it --
        # show the k (distance, label) pairs it promises.
        pprint(list(zip(min_values, nearest)))
        # Majority vote: the most common label among the k nearest.
        print("\nPrediction of %s with k=%d is %s " % (predict, k, max(nearest, key=nearest.count)))
        self.sklearnNearestNeighbors(norm_input, np_output, norm_predict, predict, k)
#Comments for tutorials
"""
Steps to run k-Nearest Neighbor
1. import kNN
2. knn = kNN.kNN()
3. knn.solveKNN()

Normalized input should look like
input = [
    #Age    #Loan
    [ 0.125  0.109]
    [ 0.375  0.208]
    [ 0.625  0.307]
    [ 0.     0.01 ]
    [ 0.375  0.505]
    [ 0.8    0.   ]
    [ 0.075  0.381]
    [ 0.5    0.218]
    [ 1.     0.406]
    [ 0.7    1.   ]
    [ 0.325  0.653]
]

The result set you get should look like this using predict = [48, 142000]
distances = [
    (0.3159611532261599, 'N'),
    (0.3427631575852688, 'N'),
    (0.36500829549319175, 'Y'),
    (0.37708549916305584, 'Y'),
    (0.3861386138613863, 'Y'),
    (0.44367484479639735, 'Y'),
    (0.5200122747640707, 'N'),
    (0.6219532147935409, 'N'),
    (0.6669046778427162, 'Y'),
    (0.7652450603896376, 'N'),
    (0.9245367272230537, 'N')
]
"""
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment