Instantly share code, notes, and snippets.

Embed
What would you like to do?
Clustering Objective Functions
class ClusteringQuality:
    """
    Instances of this class implement the two measures of clustering quality discussed in the article, namely the
    Davies-Bouldin index and the silhouette index. It also implements a number of useful helper methods.

    :param solution: the clustering solution of type Clustering
    :param minimum: the minimum distance allowable
    """

    def __init__(self, solution, minimum):
        """
        Initializes a ClusteringQuality object with a given Clustering solution and a minimum distance
        :param solution: this is an object of type Clustering
        :param minimum: this is the minimum distance allowed between two points
        """
        assert isinstance(solution, Clustering)
        self.solution = solution
        # minimum allowed distance; also used as an epsilon to avoid division by zero
        self.e = minimum

    def cluster_totals(self):
        """
        This method calculates the total distance from every centroid to every pattern assigned to it. It also records
        the number of patterns in each cluster which are used to compute average distances in cluster_averages()
        :return: a two dimensional array of [total cluster distance, total patterns in cluster] for each centroid
        """
        s = Similarity(self.e)
        # 2d list storing [total internal cluster distance, pattern count] per centroid
        cluster_distances_counts = []
        for i in range(self.solution.num_clusters):
            ith_cluster_count = 0.0
            ith_cluster_distance = 0.0
            for z in range(len(self.solution.solution)):
                # accumulate count and distance for every pattern assigned to centroid i
                if self.solution.solution[z] == i:
                    ith_cluster_count += 1
                    ith_cluster_distance += s.fractional_distance(self.solution.patterns[z], self.solution.centroids[i])
            # clamp the count to >= 1 so an empty cluster cannot cause a division by zero downstream
            cluster_distances_counts.append([ith_cluster_distance, max(ith_cluster_count, 1.0)])
        return np.array(cluster_distances_counts)

    def cluster_averages(self):
        """
        Receives output from cluster_totals() and computes the average distance per centroid
        :return: array of average distances from each centroid to the patterns assigned to it
        """
        cluster_averages = []
        # total internal cluster distance and pattern count for each centroid
        cluster_distances_counts = self.cluster_totals()
        for i in range(len(cluster_distances_counts)):
            cluster_averages.append(cluster_distances_counts[i][0] / cluster_distances_counts[i][1])
        return np.array(cluster_averages)

    def davies_bouldin(self):
        """
        This method computes the Davies-Bouldin (db) value of a given clustering.

        NOTE(review): this returns the single worst-case ratio max_{i != j} R_ij rather than the
        canonical Davies-Bouldin index, which averages max_j R_ij over all clusters i. Kept as-is
        because callers may depend on this worst-case variant as their objective function.
        :return: the (worst-case) Davies-Bouldin value of the clustering
        """
        # average internal cluster distance per centroid
        cluster_averages = self.cluster_averages()
        davies_bouldin = 0.0
        s = Similarity(self.e)
        for i in range(self.solution.num_clusters):
            for j in range(self.solution.num_clusters):
                if j != i:
                    # distance between centroid i and centroid j
                    d_ij = s.fractional_distance(self.solution.centroids[i], self.solution.centroids[j])
                    # ratio of internal scatter of i and j to their separation
                    d_ij = (cluster_averages[i] + cluster_averages[j]) / d_ij
                    # keep the largest ratio seen so far
                    davies_bouldin = max(d_ij, davies_bouldin)
        return davies_bouldin

    def silhouette_index(self, index):
        """
        This method computes the silhouette index (si) for any given pattern, a value between -1 and 1
        :param index: the index of the pattern we are looking at now
        :return: the silhouette index for that pattern
        """
        # per-cluster total distance from pattern[index] and per-cluster pattern count
        silhouette_totals = [0.0] * self.solution.num_clusters
        silhouette_counts = [0.0] * self.solution.num_clusters
        s = Similarity(self.e)
        for i in range(len(self.solution.patterns)):
            # for every pattern other than the one we are calculating now
            if i != index:
                distance = s.fractional_distance(self.solution.patterns[i], self.solution.patterns[index])
                silhouette_totals[self.solution.solution[i]] += distance
                silhouette_counts[self.solution.solution[i]] += 1
        index_cluster = self.solution.solution[index]
        # b(index): smallest average distance from pattern[index] to any OTHER cluster.
        # BUG FIX: the original seeded this with cluster 0's average even when cluster 0
        # is pattern[index]'s own cluster, which corrupted the result in that case.
        smallest_silhouette = float("inf")
        for i in range(len(silhouette_totals)):
            if i == index_cluster:
                continue
            # average distance from pattern[index] to the patterns of cluster i
            silhouette = silhouette_totals[i] / max(1.0, silhouette_counts[i])
            if silhouette < smallest_silhouette:
                smallest_silhouette = silhouette
        # a(index): average distance to pattern[index]'s own cluster; self.e keeps the
        # denominator of the final ratio strictly positive
        index_silhouette = self.e + silhouette_totals[index_cluster] / max(1.0, silhouette_counts[index_cluster])
        if math.isinf(smallest_silhouette):
            # degenerate single-cluster case: fall back to the own-cluster average,
            # matching the original behaviour when num_clusters == 1
            smallest_silhouette = silhouette_totals[index_cluster] / max(1.0, silhouette_counts[index_cluster])
        # ratio of (nearest-other-cluster distance - own-cluster distance) to the larger of the two
        return (smallest_silhouette - index_silhouette) / max(smallest_silhouette, index_silhouette)

    def silhouette_index_zero_one(self, index):
        """
        Returns the silhouette index rescaled to [0, 1] and reversed so it becomes a minimization objective
        :param index: the index of the pattern we are looking at now
        :return: the rescaled silhouette index for that pattern
        """
        return 1 - ((1 + self.silhouette_index(index)) / 2.0)

    def average_silhouette_index(self, scaled_zero_one=True):
        """
        This method computes the average silhouette index value over every pattern in the data set.
        :param scaled_zero_one: if True, use the [0, 1] minimization rescaling of the silhouette index
        :return: the average silhouette index of the given clustering
        """
        silhouette_sum = 0.0
        for i in range(len(self.solution.patterns)):
            if scaled_zero_one:
                silhouette_sum += self.silhouette_index_zero_one(i)
            else:
                silhouette_sum += self.silhouette_index(i)
        return silhouette_sum / len(self.solution.patterns)

    def quantization_error(self):
        """
        This method calculates the quantization error of the given clustering: the mean squared
        distance from each pattern to the centroid it is assigned to.
        :return: the quantization error
        """
        total_distance = 0.0
        s = Similarity(self.e)
        for i in range(len(self.solution.patterns)):
            total_distance += math.pow(s.fractional_distance(self.solution.patterns[i],
                                                             self.solution.centroids[self.solution.solution[i]]), 2.0)
        return total_distance / len(self.solution.patterns)
@ihsansatriawan

This comment has been minimized.

ihsansatriawan commented May 9, 2016

Thanks for the code 👍, but I think it would be awesome if you could give an example :)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment