@DavidSanf0rd
Created December 14, 2017 15:03
ic: KNN and DMC
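This gist contains two scripts. The first plots the decision boundaries of scikit-learn's NearestCentroid classifier (the DMC, minimum-distance-to-centroid rule) on the first two Iris features with Plotly, once without and once with centroid shrinkage. The second is a from-scratch KNN classifier that loads an Iris CSV, splits it into training and test sets, and classifies a random test point by majority vote among its k nearest neighbors.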
import sklearn
import plotly.plotly as py
import plotly.graph_objs as go
from plotly import tools
import numpy as np
from sklearn import datasets
from sklearn.neighbors import NearestCentroid
n_neighbors = 15
iris = datasets.load_iris()
X = iris.data[:, :2] # we only take the first two features. We could
# avoid this ugly slicing by using a two-dim dataset
y = iris.target
h = .02 # step size in the mesh
cmap_light =[[0, '#FFAAAA'], [0.5, '#AAFFAA'], [1, '#AAAAFF']]
cmap_bold = [[0, '#FF0000'], [0.5, '#00FF00'], [1, '#0000FF']]
data = []
titles = []
i = 0
for shrinkage in [None, .2]:
    clf = NearestCentroid(shrink_threshold=shrinkage)
    clf.fit(X, y)
    y_pred = clf.predict(X)
    print(shrinkage, np.mean(y == y_pred))

    # Plot the decision boundary. For that, we will assign a color to each
    # point in the mesh [x_min, x_max]x[y_min, y_max].
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    x_ = np.arange(x_min, x_max, h)
    y_ = np.arange(y_min, y_max, h)
    xx, yy = np.meshgrid(x_, y_)
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    data.append([])
    p1 = go.Heatmap(x=x_, y=y_, z=Z,
                    showscale=False,
                    colorscale=cmap_light)
    p2 = go.Scatter(x=X[:, 0], y=X[:, 1],
                    mode='markers',
                    marker=dict(color=y,  # color points by their true class
                                colorscale=cmap_bold,
                                line=dict(color='black', width=1)))
    data[i].append(p1)
    data[i].append(p2)
    titles.append("3-Class classification (shrink_threshold=%r)"
                  % shrinkage)
    i += 1
fig = tools.make_subplots(rows=1, cols=2,
                          subplot_titles=tuple(titles),
                          print_grid=False)
for i in range(len(data)):
    for j in range(len(data[i])):
        fig.append_trace(data[i][j], 1, i + 1)
fig['layout'].update(height=700, hovermode='closest',
                     showlegend=False)
py.iplot(fig)
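With shrink_threshold=None, the rule NearestCentroid applies is the plain DMC one: take the mean of each class in the training data and assign a point to the class whose centroid is nearest in Euclidean distance. A minimal NumPy sketch of that rule (dmc_predict is a name used here only for illustration; the shrunken variant is not covered):

import numpy as np

def dmc_predict(X_train, y_train, X_new):
    # One centroid per class: the mean of that class's training points.
    classes = np.unique(y_train)
    centroids = np.array([X_train[y_train == c].mean(axis=0) for c in classes])
    # Distance from every new point to every centroid; pick the closest.
    dists = np.linalg.norm(X_new[:, None, :] - centroids[None, :, :], axis=2)
    return classes[np.argmin(dists, axis=1)]

On the X and y above, dmc_predict(X, y, X) should agree with clf.predict(X) in the shrinkage=None case.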
import math
from csv import reader
from random import random, randint
import operator
class KNN:
    def __init__(self, k, filename, ratio):
        # Constructor inferred from the KNN(k=..., filename=..., ratio=...) call below.
        self.k = k                 # number of neighbors that vote
        self.filename = filename   # path to the Iris CSV file
        self.ratio = ratio         # fraction of rows placed in the training set
        self.training_set = []
        self.test_set = []

    def load_and_parse(self):
        with open(self.filename, 'r') as data_file:
            data_set = list(reader(data_file))
        for x in range(len(data_set) - 1):
            for y in range(4):
                data_set[x][y] = float(data_set[x][y])
            if random() < self.ratio:
                self.training_set.append(data_set[x])
            else:
                self.test_set.append(data_set[x])
"""
Returns a random test point for future prediction
"""
def random_test_point(self):
return self.test_set[randint(0, len(self.test_set) - 1)]
"""
Calculates similarity using Euclidean distance
"""
@staticmethod
def distance(a, b, size):
d = 0
for x in range(size):
d += pow((a[x] - b[x]), 2)
return math.sqrt(d)
"""
Finds the k most similar instance to the given test point
"""
def get_neighbors(self, test_point):
distances = []
size = len(test_point) - 1
for x in range(len(self.training_set)):
dist = self.distance(test_point, self.training_set[x], size)
distances.append((self.training_set[x], dist))
distances.sort(key=operator.itemgetter(1))
neighbors = []
for x in range(self.k):
neighbors.append(distances[x][0])
return neighbors
"""
Returns the class to which a test point belongs to based on its neighbors
"""
@staticmethod
def get_class(neighbors):
votes = {}
for x in range(len(neighbors)):
klass = neighbors[x][-1]
if klass in votes:
votes[klass] += 1
else:
votes[klass] = 1
sorted_votes = sorted(votes.iteritems(), key=operator.itemgetter(1), reverse=True)
return sorted_votes[0][0]
"""
Predicts the class of a given test point
"""
def predict(self, test_point):
neighbors = self.get_neighbors(test_point)
prediction = self.get_class(neighbors)
return prediction
knn = KNN(k=3, filename='iris.csv', ratio=0.7)
knn.load_and_parse()
test_point = knn.random_test_point()
print(test_point)
print(knn.predict(test_point))
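The same objects can also be used to score the whole held-out split rather than a single random point. A short sketch, assuming (as the code above does) that the class label is the last element of each row:

correct = 0
for row in knn.test_set:
    # predict() ignores the last column when measuring distance, so the full row can be passed.
    if knn.predict(row) == row[-1]:
        correct += 1
print('accuracy: %.2f%%' % (100.0 * correct / len(knn.test_set)))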