Implementation of the K-Means Clustering Algorithm
class Clustering: | |
""" | |
An instance of the Clustering is a solution i.e. a particular partitioning of the (heterogeneous) data set into | |
homogeneous subsets. For Centroid based clustering algorithms this involves looking at each pattern and assigning | |
it to it's nearest centroid. This is done by calculating the distance between each pattern and every centroid and | |
selecting the one with the smallest distance. Here we use are using fractional distance with the default parameters. | |
:param d: dimensionality of the input patterns | |
:param k: the pre-specified number of clusters & centroids | |
:param z: the patterns in the data set | |
:param min: the minimum distance (required to prevent division by zero) | |
""" | |
def __init__(self, d, k, z, min): | |
# print("Initializing solution ...") | |
""" | |
Initializes a Clustering object with the specified parameters | |
:param d: dimensionality of the input patterns | |
:param k: the pre-specified number of clusters & centroids | |
:param z: the patterns in the data set | |
:param min: the minimum distance (required to prevent division by zero) | |
""" | |
self.dimensionality = d | |
self.num_clusters = k | |
self.patterns = z | |
self.solution = [] | |
for i in range(len(z)): | |
self.solution.append(0) | |
self.centroids = np.random.rand(k, d) | |
self.e = min | |
def re_init(self): | |
""" | |
A method for reinitializing the solution | |
""" | |
self.centroids = None | |
self.centroids = np.random.rand(self.num_clusters, self.dimensionality) | |
def assign_patterns(self): | |
""" | |
This method iterates over all patterns and calculates the distances between that pattern and every centroid. | |
These value are stored in [distances]. The assign_pattern method is then used to find the centroid with the | |
smallest distance and update the 'label' i.e. centroid which the pattern is associated. | |
""" | |
s = Similarity(self.e) | |
# for each pattern | |
for i in range(len(self.patterns)): | |
# for each centroid | |
distances = [] | |
for j in range(self.num_clusters): | |
# calculate the distances | |
distances.append(s.fractional_distance(self.centroids[j], self.patterns[i])) | |
# assign the pattern to a cluster | |
self.assign_pattern(distances, i) | |
def assign_pattern(self, distances, index): | |
""" | |
This method updates the label i.e. centroid index \in (0, k-1) to which pattern z(index) belongs | |
:param distances: distances to each centroid | |
:param index: the index of the pattern we are assigning in z | |
""" | |
self.solution[index] = 0 | |
smallest = distances[self.solution[index]] | |
for i in range(len(distances)): | |
if distances[i] < smallest: | |
smallest = distances[i] | |
self.solution[index] = i | |
def update_centroids(self, s=1.0): | |
""" | |
This method implements the mean-shift heuristic used by the K-means clustering algorithm. This heuristic | |
updates the value of each centroid with the average value at each dimension of the patterns assigned to it. | |
:param s: this is the scaling factor i.e. how much we want to diminish the movement of the centroids by | |
""" | |
# Step 1 - initialize a variable to store the sum at each dimension of the patterns assigned to each centroid | |
centroids_sum = [] | |
for i in range(self.num_clusters): | |
centroids_sum.append([]) | |
for j in range(self.dimensionality): | |
centroids_sum[i].append(0.0) | |
# Step 2 - initialize a variable to store the count of patterns assigned to each centroid | |
centroids_count = [] | |
for i in range(self.num_clusters): | |
centroids_count.append(0.0) | |
# Step 3 - Update the value of centroids_sum and centroids_count for step 4 | |
for i in range(len(self.solution)): | |
for j in range(self.dimensionality): | |
centroids_sum[self.solution[i]][j] += self.patterns[i][j] | |
centroids_count[self.solution[i]] += 1 | |
# Step 4 - compute the averages (total / count) for each dimension for each centroid | |
centroids_average = [] | |
for i in range(self.num_clusters): | |
centroids_average.append([]) | |
for j in range(self.dimensionality): | |
if centroids_count[i] > 0: | |
centroids_average[i].append(centroids_sum[i][j] / max(1.0, centroids_count[i])) | |
else: | |
centroids_average[i].append(random.random()) | |
# Step 5 - set each dimension of each centroid to the average of it's clusters values at that dimension | |
for i in range(self.num_clusters): | |
if s == 1.0: | |
self.centroids[i] = None | |
self.centroids[i] = centroids_average[i] | |
else: | |
for j in range(len(self.centroids[i])): | |
self.centroids[i][j] += (centroids_average[i][j] - self.centroids[i][j]) * s | |
def k_means_clustering(self, n, s=1.0): | |
""" | |
This method performs the K-means clustering algorithm on the data for n iterations. This involves updating the | |
centroids using the mean-shift heuristic n-times and reassigning the patterns to their closest centroids. | |
:param n: number of iterations to complete | |
:param s: the scaling factor to use when updating the centroids | |
pick on which has a better solution (according to some measure of cluster quality) | |
""" | |
for i in range(n): | |
self.assign_patterns() | |
self.update_centroids(s) | |
def print_solution(self, labels): | |
""" | |
Prints out the clustering i.e. which patterns are assigned to which centroids. This can be cross-referenced | |
with the label on each pattern to determine which countries are clustered together in space. | |
:param labels: pattern labels | |
""" | |
for i in range(len(self.solution)): | |
print(labels[i], ",", self.solution[i]) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment