Skip to content

Instantly share code, notes, and snippets.

@fernandojunior
Last active March 27, 2020 23:33
Show Gist options
  • Save fernandojunior/2bf446a9496fb68152ce3822b43c524b to your computer and use it in GitHub Desktop.
Save fernandojunior/2bf446a9496fb68152ce3822b43c524b to your computer and use it in GitHub Desktop.
Perform data clustering using any distance or centroid metric, such as Euclidean or cosine distance and median or mean centroids
# adapted from https://pythonprogramming.net/k-means-from-scratch-machine-learning-tutorial/
from sklearn import datasets
import numpy as np
from scipy import spatial
def euclidian_distance(a, b):
    """Return the Euclidean (L2) distance between the arrays ``a`` and ``b``."""
    return np.linalg.norm(a - b)


def cosine_distance(a, b):
    """Return the cosine distance (``1 - cosine similarity``) between ``a`` and ``b``."""
    return spatial.distance.cosine(a, b)


class KCustom:
    """K-means-style clustering with pluggable centroid and distance functions.

    Parameters:
        k: Number of clusters.
        tol: Absolute convergence tolerance. Iteration stops as soon as no
            centroid moves by more than ``tol`` (Euclidean norm of the shift)
            between two consecutive iterations.
        max_iter: Upper bound on the number of assign/update iterations.
        centroid_func: Callable ``f(points, axis=0)`` that reduces the members
            of a cluster to its centroid (e.g. ``np.average``, ``np.median``).
        distance_func: Callable ``f(point, centroid)`` returning a scalar
            distance used to assign points to clusters.

    Attributes set by :meth:`fit`:
        centroids: dict mapping cluster label -> centroid array.
        classifications: dict mapping cluster label -> list of member points.
        bsse: dict mapping cluster label -> sum of squared errors of the
            cluster members about the cluster centroid.
        wsse: Sum of all per-cluster values in ``bsse``.
    """

    def __init__(self, k=2, tol=0.001, max_iter=300, centroid_func=np.average,
                 distance_func=euclidian_distance):
        self.k = k
        self.tol = tol
        self.max_iter = max_iter
        # The attribute is spelled ``centroid_fun`` (sic); kept as-is so code
        # that reads it keeps working.
        self.centroid_fun = centroid_func
        self.distance_func = distance_func

    def fit(self, data):
        """Cluster ``data`` (one sample per row) into ``self.k`` groups.

        The first ``k`` samples seed the centroids, so indexing raises
        ``IndexError`` when ``data`` has fewer than ``k`` rows. Results are
        stored on the instance (see the class docstring); nothing is returned.
        """
        data = np.asarray(data)
        self.centroids = {label: data[label] for label in range(self.k)}
        # Defined up front so the attribute exists even when max_iter == 0.
        self.classifications = {label: [] for label in range(self.k)}

        for _ in range(self.max_iter):
            self.classifications = self._assign(data)
            prev_centroids = dict(self.centroids)

            for label, members in self.classifications.items():
                # A cluster that attracted no points keeps its previous
                # centroid: reducing an empty list would produce NaN.
                if members:
                    self.centroids[label] = self.centroid_fun(members, axis=0)

            largest_shift = max(
                (
                    np.linalg.norm(
                        np.asarray(self.centroids[label], dtype=float)
                        - prev_centroids[label]
                    )
                    for label in self.centroids
                ),
                default=0.0,
            )
            if largest_shift <= self.tol:
                break

        self._update_sse()

    def _assign(self, data):
        """Group every row of ``data`` under the label of its nearest centroid."""
        labels = list(self.centroids)
        clusters = {label: [] for label in labels}
        for point in data:
            distances = [
                self.distance_func(point, self.centroids[label]) for label in labels
            ]
            # argmin returns the first minimum, so ties go to the lowest label.
            clusters[labels[int(np.argmin(distances))]].append(point)
        return clusters

    def _update_sse(self):
        """Recompute ``bsse`` (per-cluster SSE) and ``wsse`` (their total)."""
        self.bsse = {}
        self.wsse = 0.0
        for label, members in self.classifications.items():
            if members:
                residuals = np.vstack(members) - self.centroids[label]
                sse = float(np.sum(residuals ** 2))
            else:
                # An empty cluster contributes no error.
                sse = 0.0
            self.bsse[label] = sse
            self.wsse += sse
# Sanity-check the distance helpers on two nearby vectors. Both values are
# distances (0 means identical direction / identical point), not similarities.
a = np.array([0.2, 0.4, 0.4])
b = np.array([0.2, 0.3, 0.4])
cos_dist = cosine_distance(a, b)
eucl_dist = euclidian_distance(a, b)
print(cos_dist, eucl_dist)

# Cluster the iris data set. Alternative configurations to try:
# model = KCustom(k=10, centroid_func=np.average, distance_func=euclidian_distance)
# model = KCustom(k=25, centroid_func=np.median, distance_func=cosine_distance)
iris = datasets.load_iris()
model = KCustom(k=25, centroid_func=np.median, distance_func=euclidian_distance)
model.fit(iris.data)

# The results are printed explicitly: a bare expression such as ``model.wsse``
# only displays a value in a REPL/notebook and is a no-op in a script.

# Within Set Sum of Squared Errors (similar to sklearn.cluster.KMeans#inertia_)
print(model.wsse)
# Center points (similar to sklearn.cluster.KMeans#cluster_centers_)
print(model.centroids)
# Cluster dict (key: label, value: data points)
print(model.classifications)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment