Instantly share code, notes, and snippets.

Embed
What would you like to do?
Clustering Objective Functions
class ClusteringQuality:
    """
    Instances of this class implement the two measures of clustering quality discussed in the article, namely the
    Davies-Bouldin index and the silhouette index. It also implements a number of useful helper methods.

    :param solution: the clustering solution of type Clustering
    :param minimum: the minimum distance allowable
    """

    def __init__(self, solution, minimum):
        """
        Initializes a ClusteringQuality object with a given Clustering solution and a minimum distance
        :param solution: this is an object of type Clustering
        :param minimum: this is the minimum distance allowed between two points
        """
        assert isinstance(solution, Clustering)
        self.solution = solution
        # minimum allowed distance; also used as an epsilon to avoid division by zero
        self.e = minimum

    def cluster_totals(self):
        """
        This method calculates the total distance from every centroid to every pattern assigned to it. It also records
        the number of patterns in each cluster which are used to compute average distances in cluster_averages()
        :return: a two dimensional array of [total cluster distance, total patterns in cluster] for each centroid
        """
        s = Similarity(self.e)
        # 2d list storing [total internal cluster distance, pattern count] per centroid
        cluster_distances_counts = []
        for i in range(self.solution.num_clusters):
            ith_cluster_count = 0.0
            ith_cluster_distance = 0.0
            for z in range(len(self.solution.solution)):
                # accumulate count and distance for every pattern assigned to centroid i
                if self.solution.solution[z] == i:
                    ith_cluster_count += 1
                    ith_cluster_distance += s.fractional_distance(self.solution.patterns[z], self.solution.centroids[i])
            # clamp the count to >= 1 so an empty cluster cannot cause a division by zero downstream
            cluster_distances_counts.append([ith_cluster_distance, max(ith_cluster_count, 1.0)])
        return np.array(cluster_distances_counts)

    def cluster_averages(self):
        """
        Receives output from cluster_totals() and computes the average distance per centroid
        :return: array of average distances from each centroid to the patterns assigned to it
        """
        cluster_averages = []
        # total internal cluster distance and pattern count for each centroid
        cluster_distances_counts = self.cluster_totals()
        for i in range(len(cluster_distances_counts)):
            cluster_averages.append(cluster_distances_counts[i][0] / cluster_distances_counts[i][1])
        return np.array(cluster_averages)

    def davies_bouldin(self):
        """
        This method computes the Davies-Bouldin (db) value of a given clustering.

        NOTE(review): this returns the single worst-case ratio max_{i != j} R_ij rather than the
        canonical Davies-Bouldin index, which averages max_j R_ij over all clusters i. Kept as-is
        because callers may depend on this worst-case variant as their objective function.
        :return: the (worst-case) Davies-Bouldin value of the clustering
        """
        # average internal cluster distance per centroid
        cluster_averages = self.cluster_averages()
        davies_bouldin = 0.0
        s = Similarity(self.e)
        for i in range(self.solution.num_clusters):
            for j in range(self.solution.num_clusters):
                if j != i:
                    # distance between centroid i and centroid j
                    d_ij = s.fractional_distance(self.solution.centroids[i], self.solution.centroids[j])
                    # ratio of internal scatter of i and j to their separation
                    d_ij = (cluster_averages[i] + cluster_averages[j]) / d_ij
                    # keep the largest ratio seen so far
                    davies_bouldin = max(d_ij, davies_bouldin)
        return davies_bouldin

    def silhouette_index(self, index):
        """
        This method computes the silhouette index (si) for any given pattern, a value between -1 and 1
        :param index: the index of the pattern we are looking at now
        :return: the silhouette index for that pattern
        """
        # per-cluster total distance from pattern[index] and per-cluster pattern count
        silhouette_totals = [0.0] * self.solution.num_clusters
        silhouette_counts = [0.0] * self.solution.num_clusters
        s = Similarity(self.e)
        for i in range(len(self.solution.patterns)):
            # for every pattern other than the one we are calculating now
            if i != index:
                distance = s.fractional_distance(self.solution.patterns[i], self.solution.patterns[index])
                silhouette_totals[self.solution.solution[i]] += distance
                silhouette_counts[self.solution.solution[i]] += 1
        index_cluster = self.solution.solution[index]
        # b(index): smallest average distance from pattern[index] to any OTHER cluster.
        # BUG FIX: the original seeded this with cluster 0's average even when cluster 0
        # is pattern[index]'s own cluster, which corrupted the result in that case.
        smallest_silhouette = float("inf")
        for i in range(len(silhouette_totals)):
            if i == index_cluster:
                continue
            # average distance from pattern[index] to the patterns of cluster i
            silhouette = silhouette_totals[i] / max(1.0, silhouette_counts[i])
            if silhouette < smallest_silhouette:
                smallest_silhouette = silhouette
        # a(index): average distance to pattern[index]'s own cluster; self.e keeps the
        # denominator of the final ratio strictly positive
        index_silhouette = self.e + silhouette_totals[index_cluster] / max(1.0, silhouette_counts[index_cluster])
        if math.isinf(smallest_silhouette):
            # degenerate single-cluster case: fall back to the own-cluster average,
            # matching the original behaviour when num_clusters == 1
            smallest_silhouette = silhouette_totals[index_cluster] / max(1.0, silhouette_counts[index_cluster])
        # ratio of (nearest-other-cluster distance - own-cluster distance) to the larger of the two
        return (smallest_silhouette - index_silhouette) / max(smallest_silhouette, index_silhouette)

    def silhouette_index_zero_one(self, index):
        """
        Returns the silhouette index rescaled to [0, 1] and reversed so it becomes a minimization objective
        :param index: the index of the pattern we are looking at now
        :return: the rescaled silhouette index for that pattern
        """
        return 1 - ((1 + self.silhouette_index(index)) / 2.0)

    def average_silhouette_index(self, scaled_zero_one=True):
        """
        This method computes the average silhouette index value over every pattern in the data set.
        :param scaled_zero_one: if True, use the [0, 1] minimization rescaling of the silhouette index
        :return: the average silhouette index of the given clustering
        """
        silhouette_sum = 0.0
        for i in range(len(self.solution.patterns)):
            if scaled_zero_one:
                silhouette_sum += self.silhouette_index_zero_one(i)
            else:
                silhouette_sum += self.silhouette_index(i)
        return silhouette_sum / len(self.solution.patterns)

    def quantization_error(self):
        """
        This method calculates the quantization error of the given clustering: the mean squared
        distance from each pattern to the centroid it is assigned to.
        :return: the quantization error
        """
        total_distance = 0.0
        s = Similarity(self.e)
        for i in range(len(self.solution.patterns)):
            total_distance += math.pow(s.fractional_distance(self.solution.patterns[i],
                                                             self.solution.centroids[self.solution.solution[i]]), 2.0)
        return total_distance / len(self.solution.patterns)
@ihsansatriawan

This comment has been minimized.

ihsansatriawan commented May 9, 2016

Thanks for the code 👍, but I think it would be awesome if you could give an example :)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment