Skip to content

Instantly share code, notes, and snippets.

@Lukas0025
Last active May 17, 2021 20:49
Show Gist options
  • Save Lukas0025/88d6df58fe31b64d2d36ac2935bef3b7 to your computer and use it in GitHub Desktop.
Save Lukas0025/88d6df58fe31b64d2d36ac2935bef3b7 to your computer and use it in GitHub Desktop.
Python k-means clustering
##
# k-means clustering algorithm
# @author Lukáš Plevač <xpleva07@vutbr.cz>
# @date 5.5.2021
# CC0 license - No Rights Reserved.
#
import numpy as np
import os
##
# Calculate distance between two points (euclidean)
# @param point1 - np.array of point coords (same shape as of point2)
# @param point2 - np.array of point coords (same shape as of point1)
# @return distance in float
def euclidean_distance(point1, point2):
    """Return the Euclidean distance between two points.

    Both points are converted to float arrays before any arithmetic. This
    lets plain lists/tuples be passed, and it prevents narrow integer
    dtypes (e.g. uint8) from overflowing when the differences are squared.

    @param point1 - array-like of point coords (same shape as point2)
    @param point2 - array-like of point coords (same shape as point1)
    @return distance as a float (np.float64)
    """
    delta = np.asarray(point1, dtype=float) - np.asarray(point2, dtype=float)
    return np.sqrt(np.sum(delta ** 2))
##
# calculate new centroids by clusters
# @param clusters np.array of clusters with points
# @return list of centroids (one list of coords per cluster)
def calc_centroids(clusters, fallback=None):
    """Compute the centroid (per-coordinate mean) of every cluster.

    @param clusters - sequence of clusters; each cluster is a sequence of
                      points that all have the same number of coords
    @param fallback - optional sequence holding one centroid per cluster;
                      the entry at the same index is reused when a cluster
                      contains no points
    @return list of centroids, each a list of np.float64 coords
    @raise ValueError when a cluster is empty and no fallback was given
    """
    centroids = []
    for index, cluster in enumerate(clusters):
        # len() instead of truthiness so a cluster given as np.array works too.
        if len(cluster) == 0:
            # The mean of zero points is undefined; fail loudly or reuse
            # the caller-supplied centroid.
            if fallback is None:
                raise ValueError(
                    "cluster {} is empty and no fallback centroid was given".format(index)
                )
            centroids.append(list(np.asarray(fallback[index], dtype=float)))
            continue

        # One mean over the point axis replaces the per-coordinate loop.
        centroids.append(list(np.mean(np.asarray(cluster, dtype=float), axis=0)))
    return centroids
##
# One iteration of k means clustering
# @param data_points - np.array of point coords to cluster
# @param centroids - np.array of central point coords (for the first iteration use random ones)
# @param debug - when True, print the distances and the cluster assignments
# @return list of new centroids
def k_means_iter(data_points, centroids, debug = False):
    """Run one k-means iteration: assign points, then recompute centroids.

    Every point is assigned to the centroid with the smallest Euclidean
    distance (the first centroid wins on ties). Each centroid is then
    replaced by the mean of the points assigned to it. A centroid that
    received no points is returned unchanged instead of crashing the
    iteration.

    @param data_points - sequence (e.g. np.array) of point coords to cluster
    @param centroids - sequence of current centroid coords
    @param debug - when True, print per-point distances, the clusters and
                   the 1-based indexes of the points in each cluster
    @return list of new centroids, in the same order as `centroids`
    """
    clusters = [[] for _ in centroids]
    debug_clusters = [[] for _ in centroids]

    for point_i, point in enumerate(data_points):
        # Distance from this point to every current centroid.
        distance = [euclidean_distance(point, centroid) for centroid in centroids]

        # Assign the point to its nearest centroid.
        nearest = distance.index(min(distance))
        clusters[nearest].append(point)

        if debug:
            print("point distance: " + str(distance))
            debug_clusters[nearest].append(point_i + 1)

    if debug:
        print("\n\nclusters: " + str(clusters))
        print("\n\nclusters indexes (from 1): " + str(debug_clusters))

    # The mean of an empty cluster is undefined, so only populated clusters
    # are averaged; their results are spliced back in the original order.
    recomputed = iter(calc_centroids([cluster for cluster in clusters if cluster]))
    return [
        next(recomputed) if cluster else list(np.asarray(centroid, dtype=float))
        for cluster, centroid in zip(clusters, centroids)
    ]
##
# Example use
if __name__ == "__main__":
    # Twelve sample points in 3-D space, one point per row.
    data_points = np.array([
        [0, -1, -2],
        [-3, -1, -3],
        [1, -3, 2],
        [-2, -2, 2],
        [1, 2, -4],
        [0, -4, 3],
        [1, 0, -3],
        [-3, 0, 0],
        [-2, 2, -4],
        [-2, 4, 3],
        [3, -2, 4],
        [2, -5, -4],
    ])

    # Starting positions of the three centroids.
    centroids = np.array([
        [-1, 1, -4],
        [-1, 6, -4],
        [5, 0, -3],
    ])

    # Run four iterations with debug output, feeding each result into the next.
    for iteration in range(4):
        print("\n\n---------------------------------iteration: {}".format(iteration))
        centroids = k_means_iter(data_points, centroids, debug=True)
        print("\n\nnew centroids: {}".format(centroids))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment