vitasiku/KMeansClustering.py

## KMeansClustering.py
class Clustering:
    """
    An instance of the Clustering is a solution i.e. a particular partitioning of the (heterogeneous) data set into
    homogeneous subsets. For Centroid based clustering algorithms this involves looking at each pattern and assigning
    it to it's nearest centroid. This is done by calculating the distance between each pattern and every centroid and
    selecting the one with the smallest distance. Here we use are using fractional distance with the default parameters.

    :param d: dimensionality of the input patterns
    :param k: the pre-specified number of clusters & centroids
    :param z: the patterns in the data set
    :param min: the minimum distance (required to prevent division by zero)
    """

    def __init__(self, d, k, z, min):
        # print("Initializing solution ...")
        """
        Initializes a Clustering object with the specified parameters
        :param d: dimensionality of the input patterns
        :param k: the pre-specified number of clusters & centroids
        :param z: the patterns in the data set
        :param min: the minimum distance (required to prevent division by zero)
        """
        self.dimensionality = d
        self.num_clusters = k
        self.patterns = z
        self.solution = []
        for i in range(len(z)):
            self.solution.append(0)
        self.centroids = np.random.rand(k, d)
        self.e = min

    def re_init(self):
        """
        A method for reinitializing the solution
        """
        self.centroids = None
        self.centroids = np.random.rand(self.num_clusters, self.dimensionality)

    def assign_patterns(self):
        """
        This method iterates over all patterns and calculates the distances between that pattern and every centroid.
        These value are stored in [distances]. The assign_pattern method is then used to find the centroid with the
        smallest distance and update the 'label' i.e. centroid which the pattern is associated.
        """
        s = Similarity(self.e)
        # for each pattern
        for i in range(len(self.patterns)):
            # for each centroid
            distances = []
            for j in range(self.num_clusters):
                # calculate the distances
                distances.append(s.fractional_distance(self.centroids[j], self.patterns[i]))
            # assign the pattern to a cluster
            self.assign_pattern(distances, i)

    def assign_pattern(self, distances, index):
        """
        This method updates the label i.e. centroid index \in (0, k-1) to which pattern z(index) belongs
        :param distances: distances to each centroid
        :param index: the index of the pattern we are assigning in z
        """
        self.solution[index] = 0
        smallest = distances[self.solution[index]]
        for i in range(len(distances)):
            if distances[i] < smallest:
                smallest = distances[i]
                self.solution[index] = i

    def update_centroids(self, s=1.0):
        """
        This method implements the mean-shift heuristic used by the K-means clustering algorithm. This heuristic
        updates the value of each centroid with the average value at each dimension of the patterns assigned to it.
        :param s: this is the scaling factor i.e. how much we want to diminish the movement of the centroids by
        """
        # Step 1 - initialize a variable to store the sum at each dimension of the patterns assigned to each centroid
        centroids_sum = []
        for i in range(self.num_clusters):
            centroids_sum.append([])
            for j in range(self.dimensionality):
                centroids_sum[i].append(0.0)
        # Step 2 - initialize a variable to store the count of patterns assigned to each centroid
        centroids_count = []
        for i in range(self.num_clusters):
            centroids_count.append(0.0)
        # Step 3 - Update the value of centroids_sum and centroids_count for step 4
        for i in range(len(self.solution)):
            for j in range(self.dimensionality):
                centroids_sum[self.solution[i]][j] += self.patterns[i][j]
            centroids_count[self.solution[i]] += 1
        # Step 4 - compute the averages (total / count) for each dimension for each centroid
        centroids_average = []
        for i in range(self.num_clusters):
            centroids_average.append([])
            for j in range(self.dimensionality):
                if centroids_count[i] > 0:
                    centroids_average[i].append(centroids_sum[i][j] / max(1.0, centroids_count[i]))
                else:
                    centroids_average[i].append(random.random())
        # Step 5 - set  each dimension of each centroid to the average of it's clusters values at that dimension
        for i in range(self.num_clusters):
            if s == 1.0:
                self.centroids[i] = None
                self.centroids[i] = centroids_average[i]
            else:
                for j in range(len(self.centroids[i])):
                    self.centroids[i][j] += (centroids_average[i][j] - self.centroids[i][j]) * s

    def k_means_clustering(self, n, s=1.0):
        """
        This method performs the K-means clustering algorithm on the data for n iterations. This involves updating the
        centroids using the mean-shift heuristic n-times and reassigning the patterns to their closest centroids.
        :param n: number of iterations to complete
        :param s: the scaling factor to use when updating the centroids
        pick on which has a better solution (according to some measure of cluster quality)
        """

        for i in range(n):
            self.assign_patterns()
            self.update_centroids(s)

    def print_solution(self, labels):
        """
        Prints out the clustering i.e. which patterns are assigned to which centroids. This can be cross-referenced
        with the label on each pattern to determine which countries are clustered together in space.
        :param labels: pattern labels
        """
        for i in range(len(self.solution)):
            print(labels[i], ",", self.solution[i])
	class Clustering:
	"""
	An instance of the Clustering is a solution i.e. a particular partitioning of the (heterogeneous) data set into
	homogeneous subsets. For Centroid based clustering algorithms this involves looking at each pattern and assigning
	it to it's nearest centroid. This is done by calculating the distance between each pattern and every centroid and
	selecting the one with the smallest distance. Here we use are using fractional distance with the default parameters.

	:param d: dimensionality of the input patterns
	:param k: the pre-specified number of clusters & centroids
	:param z: the patterns in the data set
	:param min: the minimum distance (required to prevent division by zero)
	"""

	def __init__(self, d, k, z, min):
	# print("Initializing solution ...")
	"""
	Initializes a Clustering object with the specified parameters
	:param d: dimensionality of the input patterns
	:param k: the pre-specified number of clusters & centroids
	:param z: the patterns in the data set
	:param min: the minimum distance (required to prevent division by zero)
	"""
	self.dimensionality = d
	self.num_clusters = k
	self.patterns = z
	self.solution = []
	for i in range(len(z)):
	self.solution.append(0)
	self.centroids = np.random.rand(k, d)
	self.e = min

	def re_init(self):
	"""
	A method for reinitializing the solution
	"""
	self.centroids = None
	self.centroids = np.random.rand(self.num_clusters, self.dimensionality)

	def assign_patterns(self):
	"""
	This method iterates over all patterns and calculates the distances between that pattern and every centroid.
	These value are stored in [distances]. The assign_pattern method is then used to find the centroid with the
	smallest distance and update the 'label' i.e. centroid which the pattern is associated.
	"""
	s = Similarity(self.e)
	# for each pattern
	for i in range(len(self.patterns)):
	# for each centroid
	distances = []
	for j in range(self.num_clusters):
	# calculate the distances
	distances.append(s.fractional_distance(self.centroids[j], self.patterns[i]))
	# assign the pattern to a cluster
	self.assign_pattern(distances, i)

	def assign_pattern(self, distances, index):
	"""
	This method updates the label i.e. centroid index \in (0, k-1) to which pattern z(index) belongs
	:param distances: distances to each centroid
	:param index: the index of the pattern we are assigning in z
	"""
	self.solution[index] = 0
	smallest = distances[self.solution[index]]
	for i in range(len(distances)):
	if distances[i] < smallest:
	smallest = distances[i]
	self.solution[index] = i

	def update_centroids(self, s=1.0):
	"""
	This method implements the mean-shift heuristic used by the K-means clustering algorithm. This heuristic
	updates the value of each centroid with the average value at each dimension of the patterns assigned to it.
	:param s: this is the scaling factor i.e. how much we want to diminish the movement of the centroids by
	"""
	# Step 1 - initialize a variable to store the sum at each dimension of the patterns assigned to each centroid
	centroids_sum = []
	for i in range(self.num_clusters):
	centroids_sum.append([])
	for j in range(self.dimensionality):
	centroids_sum[i].append(0.0)
	# Step 2 - initialize a variable to store the count of patterns assigned to each centroid
	centroids_count = []
	for i in range(self.num_clusters):
	centroids_count.append(0.0)
	# Step 3 - Update the value of centroids_sum and centroids_count for step 4
	for i in range(len(self.solution)):
	for j in range(self.dimensionality):
	centroids_sum[self.solution[i]][j] += self.patterns[i][j]
	centroids_count[self.solution[i]] += 1
	# Step 4 - compute the averages (total / count) for each dimension for each centroid
	centroids_average = []
	for i in range(self.num_clusters):
	centroids_average.append([])
	for j in range(self.dimensionality):
	if centroids_count[i] > 0:
	centroids_average[i].append(centroids_sum[i][j] / max(1.0, centroids_count[i]))
	else:
	centroids_average[i].append(random.random())
	# Step 5 - set each dimension of each centroid to the average of it's clusters values at that dimension
	for i in range(self.num_clusters):
	if s == 1.0:
	self.centroids[i] = None
	self.centroids[i] = centroids_average[i]
	else:
	for j in range(len(self.centroids[i])):
	self.centroids[i][j] += (centroids_average[i][j] - self.centroids[i][j]) * s

	def k_means_clustering(self, n, s=1.0):
	"""
	This method performs the K-means clustering algorithm on the data for n iterations. This involves updating the
	centroids using the mean-shift heuristic n-times and reassigning the patterns to their closest centroids.
	:param n: number of iterations to complete
	:param s: the scaling factor to use when updating the centroids
	pick on which has a better solution (according to some measure of cluster quality)
	"""

	for i in range(n):
	self.assign_patterns()
	self.update_centroids(s)

	def print_solution(self, labels):
	"""
	Prints out the clustering i.e. which patterns are assigned to which centroids. This can be cross-referenced
	with the label on each pattern to determine which countries are clustered together in space.
	:param labels: pattern labels
	"""
	for i in range(len(self.solution)):
	print(labels[i], ",", self.solution[i])