# Lukas0025/k-means.py

## k-means.py
##
# k-means clustering algorithm
# @author Lukáš Plevač <xpleva07@vutbr.cz>
# @date 5.5.2021
# CC0 license - No Rights Reserved.
#

import numpy as np
import os

##
# Calculate the Euclidean distance between two points
# @param  point1 - point coords as np.array or plain sequence (same shape as point2)
# @param  point2 - point coords as np.array or plain sequence (same shape as point1)
# @return distance as a float (np.float64)
def euclidean_distance(point1, point2):
    # Work in float64: plain lists do not support "-", and narrow or unsigned
    # integer arrays would silently overflow when subtracted or squared.
    difference = np.asarray(point1, dtype=float) - np.asarray(point2, dtype=float)
    return np.sqrt(np.sum(difference ** 2))

##
# Calculate new centroids as the per-coordinate mean of every cluster
# @param  clusters - sequence of clusters, each a sequence of points of equal length
# @param  fallback - optional sequence with one centroid per cluster; its entry is
#                    returned (converted to floats) for a cluster that has no points
# @return list of centroids (each a list of coordinates), in cluster order
# @throws IndexError when a cluster is empty and no fallback was given
def calc_centroids(clusters, fallback=None):
    centroids = []
    for index, cluster in enumerate(clusters):
        if len(cluster) == 0:
            # An empty cluster has no mean; without a fallback there is
            # nothing sensible to return, so fail with an explicit message.
            if fallback is None:
                raise IndexError(
                    "cluster {} is empty, cannot calculate its centroid".format(index)
                )
            centroids.append(list(np.asarray(fallback[index], dtype=float)))
            continue

        # Average all points of the cluster at once, coordinate by coordinate.
        centroids.append(list(np.mean(np.asarray(cluster), axis=0)))

    return centroids

##
# One iteration of k-means clustering
# @param  data_points - np.array of point coords to cluster
# @param  centroids   - np.array (or list) of central point coords (for the first iteration use random)
# @param  debug       - when True, print the distances, the clusters and the 1-based point indexes
# @return list of new centroids, one per input centroid; a centroid that attracted
#         no points is returned at its previous position (as floats)
def k_means_iter(data_points, centroids, debug=False):
    clusters = [[] for _ in centroids]
    debug_clusters = [[] for _ in centroids]

    for point_number, point in enumerate(data_points, start=1):
        # calc distances to all centroids
        distance = [euclidean_distance(point, centroid) for centroid in centroids]

        # assign to the nearest cluster (the first one wins on a tie)
        nearest = distance.index(min(distance))
        clusters[nearest].append(point)

        if debug:
            print("point distance: " + str(distance))
            debug_clusters[nearest].append(point_number)

    if debug:
        print("\n\nclusters: " + str(clusters))
        print("\n\nclusters indexes (from 1): " + str(debug_clusters))

    # Calc new centroids. A cluster without points has no mean, so only the
    # populated clusters are averaged and an empty one keeps its old centroid.
    populated = [cluster for cluster in clusters if len(cluster) > 0]
    recalculated = iter(calc_centroids(populated))
    return [
        next(recalculated) if len(cluster) > 0
        else [np.float64(coord) for coord in previous]
        for cluster, previous in zip(clusters, centroids)
    ]

##
# Example use: run four k-means iterations on a small 3D data set
if __name__ == "__main__":
    data_points = np.array([
        [ 0, -1, -2], [-3, -1, -3], [ 1, -3,  2], [-2, -2,  2],
        [ 1,  2, -4], [ 0, -4,  3], [ 1,  0, -3], [-3,  0,  0],
        [-2,  2, -4], [-2,  4,  3], [ 3, -2,  4], [ 2, -5, -4],
    ])
    # starting centroids, refined by every iteration below
    centroids = np.array([
        [-1, 1, -4],
        [-1, 6, -4],
        [ 5, 0, -3],
    ])

    iteration_banner = "\n\n---------------------------------iteration: {}"
    centroids_banner = "\n\nnew centroids: {}"

    for iteration in range(4):
        print(iteration_banner.format(iteration))
        centroids = k_means_iter(data_points, centroids, debug=True)
        print(centroids_banner.format(centroids))
	##
	# k-means clustering algorithm
	# @author Lukáš Plevač <xpleva07@vutbr.cz>
	# @date 5.5.2021
	# CC0 license - No Rights Reserved.
	#

	import numpy as np
	import os

	##
	# Calculate the Euclidean distance between two points
	# @param point1 - point coords as np.array or plain sequence (same shape as point2)
	# @param point2 - point coords as np.array or plain sequence (same shape as point1)
	# @return distance as a float (np.float64)
	def euclidean_distance(point1, point2):
	    # Work in float64: plain lists do not support "-", and narrow or unsigned
	    # integer arrays would silently overflow when subtracted or squared.
	    difference = np.asarray(point1, dtype=float) - np.asarray(point2, dtype=float)
	    return np.sqrt(np.sum(difference ** 2))

	##
	# Calculate new centroids as the per-coordinate mean of every cluster
	# @param clusters - sequence of clusters, each a sequence of points of equal length
	# @param fallback - optional sequence with one centroid per cluster; its entry is
	#                   returned (converted to floats) for a cluster that has no points
	# @return list of centroids (each a list of coordinates), in cluster order
	# @throws IndexError when a cluster is empty and no fallback was given
	def calc_centroids(clusters, fallback=None):
	    centroids = []
	    for index, cluster in enumerate(clusters):
	        if len(cluster) == 0:
	            # An empty cluster has no mean; without a fallback there is
	            # nothing sensible to return, so fail with an explicit message.
	            if fallback is None:
	                raise IndexError(
	                    "cluster {} is empty, cannot calculate its centroid".format(index)
	                )
	            centroids.append(list(np.asarray(fallback[index], dtype=float)))
	            continue

	        # Average all points of the cluster at once, coordinate by coordinate.
	        centroids.append(list(np.mean(np.asarray(cluster), axis=0)))

	    return centroids

	##
	# One iteration of k-means clustering
	# @param data_points - np.array of point coords to cluster
	# @param centroids - np.array (or list) of central point coords (for the first iteration use random)
	# @param debug - when True, print the distances, the clusters and the 1-based point indexes
	# @return list of new centroids, one per input centroid; a centroid that attracted
	#         no points is returned at its previous position (as floats)
	def k_means_iter(data_points, centroids, debug=False):
	    clusters = [[] for _ in centroids]
	    debug_clusters = [[] for _ in centroids]

	    for point_number, point in enumerate(data_points, start=1):
	        # calc distances to all centroids
	        distance = [euclidean_distance(point, centroid) for centroid in centroids]

	        # assign to the nearest cluster (the first one wins on a tie)
	        nearest = distance.index(min(distance))
	        clusters[nearest].append(point)

	        if debug:
	            print("point distance: " + str(distance))
	            debug_clusters[nearest].append(point_number)

	    if debug:
	        print("\n\nclusters: " + str(clusters))
	        print("\n\nclusters indexes (from 1): " + str(debug_clusters))

	    # Calc new centroids. A cluster without points has no mean, so only the
	    # populated clusters are averaged and an empty one keeps its old centroid.
	    populated = [cluster for cluster in clusters if len(cluster) > 0]
	    recalculated = iter(calc_centroids(populated))
	    return [
	        next(recalculated) if len(cluster) > 0
	        else [np.float64(coord) for coord in previous]
	        for cluster, previous in zip(clusters, centroids)
	    ]

	##
	# Example use: run four k-means iterations on a small 3D data set
	if __name__ == "__main__":
	    data_points = np.array([[ 0,-1,-2],[-3,-1,-3],[ 1,-3, 2],[-2,-2, 2],[ 1, 2,-4],[ 0,-4, 3],[ 1, 0,-3],[-3, 0, 0],[-2, 2,-4],[-2, 4, 3],[ 3,-2, 4],[ 2,-5,-4]])
	    # starting centroids, refined by every iteration below
	    centroids = np.array([[-1, 1, -4], [-1, 6, -4], [5, 0, -3]])

	    for iteration in range(4):
	        print("\n\n---------------------------------iteration: {}".format(iteration))
	        centroids = k_means_iter(data_points, centroids, debug=True)
	        print("\n\nnew centroids: {}".format(centroids))