Last active
March 27, 2020 23:33
-
-
Save fernandojunior/2bf446a9496fb68152ce3822b43c524b to your computer and use it in GitHub Desktop.
Perform data clustering using any distance or centroid metric, such as Euclidean or cosine distance and median or mean centroids.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Adapted from https://pythonprogramming.net/k-means-from-scratch-machine-learning-tutorial/
from sklearn import datasets | |
import numpy as np | |
from scipy import spatial | |
def euclidian_distance(a, b):
    """Return the Euclidean (L2) distance between vectors ``a`` and ``b``."""
    return np.linalg.norm(np.asarray(a) - np.asarray(b))


def cosine_distance(a, b):
    """Return the cosine distance between vectors ``a`` and ``b``.

    Delegates to ``scipy.spatial.distance.cosine``.
    """
    return spatial.distance.cosine(a, b)


class KCustom:
    """K-means-style clustering with pluggable distance and centroid functions.

    Parameters
    ----------
    k : int
        Number of clusters.
    tol : float
        Convergence tolerance: fitting stops once no centroid moves by more
        than ``tol`` (Euclidean norm of the shift) between two iterations.
    max_iter : int
        Upper bound on the number of assign/update iterations.
    centroid_func : callable
        ``centroid_func(points, axis=0)`` reduces the points of one cluster
        to its centroid (e.g. ``np.average`` or ``np.median``).
    distance_func : callable
        ``distance_func(point, centroid)`` returns a scalar; each point is
        assigned to the centroid with the smallest value.

    Attributes set by ``fit``
    -------------------------
    centroids : dict
        Cluster label -> centroid.
    classifications : dict
        Cluster label -> list of the data points assigned to that cluster.
    bsse : dict
        Cluster label -> sum of squared deviations of the cluster's points
        from its centroid.
    wsse : float
        Sum of all values in ``bsse``.
    """

    def __init__(self, k=2, tol=0.001, max_iter=300, centroid_func=np.average,
                 distance_func=euclidian_distance):
        self.k = k
        self.tol = tol
        self.max_iter = max_iter
        # The attribute keeps its historical name ``centroid_fun`` so existing
        # code that reads it keeps working.
        self.centroid_fun = centroid_func
        self.distance_func = distance_func

    def fit(self, data):
        """Cluster ``data`` and store the result on the instance.

        Parameters
        ----------
        data : array-like
            One sample per row. The first ``k`` rows seed the centroids.

        Raises
        ------
        IndexError
            If ``data`` has fewer than ``k`` rows.
        """
        data = np.asarray(data)

        # Seed the centroids with the first k samples.
        self.centroids = {label: data[label] for label in range(self.k)}
        # Defined up front so the SSE step below also works when max_iter is 0.
        self.classifications = {label: [] for label in range(self.k)}

        for _ in range(self.max_iter):
            # Assignment step: attach every sample to its nearest centroid.
            self.classifications = {label: [] for label in range(self.k)}
            for featureset in data:
                distances = [
                    self.distance_func(featureset, self.centroids[label])
                    for label in self.centroids
                ]
                # Labels are 0..k-1 in insertion order, so the list position
                # of the smallest distance is the cluster label.
                nearest = distances.index(min(distances))
                self.classifications[nearest].append(featureset)

            # Update step: recompute each centroid from its members.
            previous = dict(self.centroids)
            for label, members in self.classifications.items():
                # An empty cluster keeps its previous centroid; reducing an
                # empty list would produce NaN and poison later distances.
                if members:
                    self.centroids[label] = self.centroid_fun(members, axis=0)

            # Stop early once every centroid has moved by at most ``tol``.
            converged = all(
                np.linalg.norm(
                    np.asarray(self.centroids[label]) - np.asarray(previous[label])
                ) <= self.tol
                for label in self.centroids
            )
            if converged:
                break

        # Per-cluster and total sum of squared deviations from the centroid.
        self.bsse = {}
        self.wsse = 0.0
        for label, members in self.classifications.items():
            if members:
                deviations = np.vstack(members) - self.centroids[label]
                cluster_sse = float(np.sum(deviations ** 2))
            else:
                cluster_sse = 0.0
            self.bsse[label] = cluster_sse
            self.wsse += cluster_sse
# Demo: runs only when the file is executed directly (or in a notebook, where
# __name__ is also "__main__"), so importing the module has no side effects.
if __name__ == "__main__":
    # Sanity-check the two distance functions on a pair of nearby vectors.
    a = np.array([0.2, 0.4, 0.4])
    b = np.array([0.2, 0.3, 0.4])
    cos_dist = cosine_distance(a, b)
    eucl_dist = euclidian_distance(a, b)
    print(cos_dist, eucl_dist)

    # Cluster the iris data set.
    iris = datasets.load_iris()
    # Alternative configurations to try:
    # model = KCustom(k=10, centroid_func=np.average, distance_func=euclidian_distance)
    # model = KCustom(k=25, centroid_func=np.median, distance_func=cosine_distance)
    model = KCustom(k=25, centroid_func=np.median, distance_func=euclidian_distance)
    model.fit(iris.data)

    # The results are printed explicitly: a bare attribute expression shows
    # nothing when the file is run as a script.

    # Within Set Sum of Squared Errors (similar to sklearn.cluster.KMeans#inertia_)
    print(model.wsse)
    # Center points (similar to sklearn.cluster.KMeans#cluster_centers_)
    print(model.centroids)
    # Cluster dict (key: label, value: data points)
    print(model.classifications)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment