Skip to content

Instantly share code, notes, and snippets.

@hoenirvili
Last active January 17, 2018 19:11
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save hoenirvili/169b2361c21694c4999eef405df7a1a3 to your computer and use it in GitHub Desktop.
Save hoenirvili/169b2361c21694c4999eef405df7a1a3 to your computer and use it in GitHub Desktop.
kmeans
#!/usr/bin/env python3
import abc
import math
import random
class Point(metaclass=abc.ABCMeta):
def __init__(self, coordinate):
if not isinstance(coordinate, float) and not isinstance(
coordinate, int) and not isinstance(coordinate, list):
raise TypeError("Invalid type for point")
self._coordinate = coordinate
@abc.abstractmethod
def distance(self, point):
raise NotImplementedError(
"Users must define a distance method to be a point")
@abc.abstractproperty
def coordinates(self):
raise NotImplementedError(
"Users must define a coordinates method to be a point")
class OneDimensionalPoint(Point):
def distance(self, point):
if not isinstance(point, Point):
raise TypeError("Invalid type for distance")
point_coordinate = point.coordinates[0]
if self._coordinate > point_coordinate:
return self._coordinate - point_coordinate
return point_coordinate - self._coordinate
@property
def coordinates(self):
return [self._coordinate]
def __str__(self):
return str(self._coordinate)
def __repr__(self):
return self.__str__()
def __add__(self, point):
return OneDimensionalPoint(point.coordinates[0] + self._coordinate)
def __radd__(self, other):
if other == 0:
return self
return self.__add__(other)
def __len__(self):
return 1
class TwoDimensionalPoint(Point):
def distance(self, point):
if not isinstance(point, Point):
raise TypeError("Invalid type for distance")
x2, y2 = point.coordinates[0], point.coordinates[1]
x1, y1 = self.coordinates[0], self.coordinates[1]
return math.sqrt((math.pow((x1 - x2), 2) + math.pow((y1 - y2), 2)))
@property
def coordinates(self):
return self._coordinate
def __str__(self):
x = str(self.coordinates[0])
y = str(self.coordinates[1])
return str("[" + x + "," + y + "]")
def __repr__(self):
return self.__str__()
def __add__(self, point):
x2, y2 = point.coordinates[0], point.coordinates[1]
x1, y1 = self.coordinates[0], self.coordinates[1]
ux, uy = x1 + x2, y1 + y2
return TwoDimensionalPoint([ux, uy])
def __radd__(self, other):
if other == 0:
return self
return self.__add__(other)
def __len__(self):
return 2
def onePoints(arr):
if not isinstance(arr, list):
raise TypeError("Arr must be a list")
if len(arr) < 2:
raise ValueError("Too few values")
points = []
for a in arr:
point = OneDimensionalPoint(a)
points.append(point)
return points
def twoPoints(arr):
if not isinstance(arr, list):
raise TypeError("Arr must be a list")
if len(arr) < 2:
raise ValueError("Invalid row length")
for a in arr:
if not isinstance(a, list):
raise TypeError("Elements of arr must also be a list")
if len(a) < 2:
raise ValueError("Invalid col length")
points = []
for col in arr:
row = TwoDimensionalPoint(col)
points.append(row)
return points
def best(centroids, point):
d = {}
for centroid in centroids:
d[centroid] = point.distance(centroid)
return min(d, key=d.get)
def compose_clusters(points, centroids):
clusters = {}
for centroid in centroids:
clusters[centroid] = []
for point in points:
centroid = best(centroids, point)
clusters[centroid] += [point.coordinates]
return clusters
def sum_coordinates(points, dimension):
if dimension == 1:
r = sum(point[0] for point in points)
return (r, r)
r1, r2 = 0, 0
for point in points:
r1 += point[0]
r2 += point[1]
return (r1, r2)
def recompute_centroids(clusters, dimension):
centroids = []
for key, points in clusters.items():
if points == []:
centroids += [key]
continue
ux, uy = sum_coordinates(points, dimension)
if dimension == 1:
u = ux / len(points)
centroids.append(OneDimensionalPoint(u))
elif dimension == 2:
ux, uy = ux / len(points), uy / len(points)
centroids.append(TwoDimensionalPoint([ux, uy]))
return centroids
def compare_centroids(p1, p2):
if len(p1) != len(p2):
raise ValueError("Can't compare these two centroids")
for i, _ in enumerate(p1):
if set(p1[i].coordinates) != set(p2[i].coordinates):
return False
return True
def jcriterion(clusters):
s = 0
for key, values in clusters.items():
if values == []:
continue
centroid = key
for point in values:
if len(point) == 2:
x1 = point[0]
y1 = point[1]
x2 = centroid.coordinates[0]
y2 = centroid.coordinates[1]
x = math.pow((x1 - x2), 2) + math.pow((y1 - y2), 2)
s += x
else:
x = OneDimensionalPoint(point[0])
d = centroid.distance(x)
s += math.pow(d, 2)
return s
def kmeans(points, centroids, k, iterations=-1):
if k < 2:
raise ValueError("Invalid number of clusters")
if centroids is None:
raise ValueError("Need valid centroids")
dimension = len(points[0])
convergence = False
i = 0
t = 0
while not convergence:
if iterations != -1:
i += 1
if i == iterations:
break
clusters = compose_clusters(points, centroids)
j = jcriterion(clusters)
print("Step: {:d}, value: {:f}".format(t, j))
newCentroids = recompute_centroids(clusters, dimension)
convergence = compare_centroids(centroids, newCentroids)
centroids = newCentroids
t += 1
return clusters
def main():
coordinates = [-9, -8, -7, -6, -5, 5, 6, 7, 8, 9, 5, 6, 7, 8, 9]
points = onePoints(coordinates)
centroids = onePoints([-20, -10])
clusters = kmeans(points, centroids, len(centroids))
keys = list(clusters.keys())
print("Centroid = ", keys[0], "Cluster = ", clusters[keys[0]])
print("Centroid = ", keys[1], "Cluster = ", clusters[keys[1]])
print()
coordinates = [[1, 0], [-1, 0], [0, 1], [3, 0], [3, 1]]
points = twoPoints(coordinates)
centroids = twoPoints([[-1, 0], [3, 1]])
clusters = kmeans(points, centroids, 2)
keys = list(clusters.keys())
print("Centroid = ", keys[0], "Cluster = ", clusters[keys[0]])
print("Centroid = ", keys[1], "Cluster = ", clusters[keys[1]])
if __name__ == "__main__":
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment