IshankGulati/GA_clustering.py

## GA_clustering.py
"""Clustering using Genetic Algorithm"""

# Author: Ishank Gulati <gulati.ishank@gmail.com>

from __future__ import print_function
from __future__ import division

import numpy as np
from scipy.spatial.distance import euclidean
import scipy.misc as spm
from matplotlib import pyplot
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.utils import check_random_state
from sklearn.utils import check_array


class GAclustering(BaseEstimator, ClusterMixin):
    """
    Genetic Algorithm based Clustering

    Reference
    ---------
    Genetic algorithm-based clustering technique.
    Ujjwal Maulik, Sanghamitra Bandyoypadhyay.
    2000.
    """

    def __init__(self, n_clusters=3, population_size=10, max_iter=100,
                 prob_crossover=0.8, prob_mutation=0.001, tol=1e-3,
                 verbose=0, random_state=None):

        self.n_clusters = n_clusters
        self.population_size = population_size
        self.max_iter = max_iter
        self.prob_crossover = prob_crossover
        self.prob_mutation = prob_mutation
        self.tol = tol
        self.verbose = verbose
        self.random_state = random_state

    def _check_fit_data(self, X):
        X = check_array(X, dtype=np.float64)

        n_samples, n_features = X.shape
        if n_samples < self.n_clusters:
            raise ValueError("Number of samples="+str(n_samples)+" should be\
                greater than number of clusters="+str(self.n_clusters))
        return X

    def _check_test_data(self, X):
        X = check_array(X, dtype=np.float64)

        n_samples, n_features = X.shape
        expected_n_features = self.centers_.shape[1]
        if not expected_n_features == n_features:
            raise ValueError("Incorrect number of features")
        return X

    def _compute_fitness(self, X, population, pop_labels, dist):
        n_samples, n_features = X.shape
        metric = np.zeros((self.population_size, n_features))
        for crm in xrange(self.population_size):
            dist.fill(0)
            centers = np.resize(population[crm], (self.n_clusters, n_features))
            for i in xrange(n_samples):
                for j in xrange(self.n_clusters):
                    cost = euclidean(X[i], centers[j])
                    dist[i, j] = cost

            pop_labels[:, crm] = dist.argmin(axis=1)
            for i in xrange(self.n_clusters):
                mask = pop_labels[:, crm] == i
                if np.sum(mask) == 0:
                    raise ValueError("Empty Cluster")
                temp = X[mask]
                count = np.shape(temp)[0]
                centers[i] = np.sum(temp, axis=0)/count

                metric_temp = np.zeros((count, n_features))
                for j in xrange(count):
                    metric_temp[j] = euclidean(temp[j], centers[i])
                metric[crm] += np.sum(metric_temp, axis=0)
            centers = np.reshape(centers, (1, self.n_clusters*n_features))
            population[crm] = centers
        self.fitness_[:, 0] = np.sum(1/metric, axis=1)

    def _selection(self, population, pop_labels, n_remove):
        for i in xrange(n_remove):
            min_idx = np.argmin(self.fitness_, axis=0)
            population = np.delete(population, min_idx, axis=0)
            pop_labels = np.delete(pop_labels, min_idx, axis=1)
            self.fitness_ = np.delete(self.fitness_, min_idx, axis=0)
        self.population_size -= n_remove

    def _crossover(self, population, n_features):
        n_crossovers = int(self.prob_crossover*self.population_size//2)
        children = np.zeros((n_crossovers*2, self.n_clusters*n_features))
        for cross in xrange(n_crossovers):
            rs = check_random_state(self.random_state)
            parents_idx = rs.randint(self.population_size, size=2)
            children[cross*2:cross*2 + 2, :] = population[parents_idx]
            cross_point = n_features * rs.randint(1, self.n_clusters)
            temp = children[cross*2, cross_point:]
            children[cross*2, cross_point:]=children[cross*2 + 1, cross_point:]
            children[cross*2 + 1, cross_point:] = temp
        return children

    def _mutation(self, cross_children, n_features):
        n_cross_child = cross_children.shape[0]
        n_mutations = int(self.prob_mutation * n_cross_child * self.n_clusters
                          * n_features)
        children = np.zeros((n_mutations, self.n_clusters*n_features))
        for mut in xrange(n_mutations):
            rs = check_random_state(self.random_state)
            idx = rs.randint(n_cross_child)
            delta = rs.uniform(0, 1)
            gene = rs.randint(self.n_clusters*n_features)
            sign = rs.randint(2)
            children[mut] = cross_children[idx]
            if sign == 0:
                if children[mut, gene] != 0:
                    children[mut, gene] += 2*delta*children[mut, gene]
                else:
                    children[mut, gene] = 2*delta
            else:
                if children[mut, gene] != 0:
                    children[mut, gene] -= 2*delta*children[mut, gene]
                else:
                    children[mut, gene] = -2*delta
        return children

    def fit(self, X, y=None):
        X = self._check_fit_data(X)

        if self.max_iter <= 0:
            raise ValueError("Maximum number of iterations must be greater \
                than zero")

        n_samples, n_features = X.shape
        if self.population_size*self.n_clusters > n_samples:
            raise ValueError("Population size is very large")

        rs = check_random_state(self.random_state)
        self.labels_ = np.zeros((n_samples))
        pop_labels = np.zeros((n_samples, self.population_size))
        population = np.zeros((self.population_size, self.n_clusters *
                               n_features))

        for crm in xrange(self.population_size):
            population_idx = rs.randint(n_samples, size=self.n_clusters)
            population[crm, :] = X[population_idx].ravel()

        self.centers_ = np.zeros((self.n_clusters, n_features))
        dist = np.zeros((n_samples, self.n_clusters))
        self.fitness_ = np.zeros((self.population_size, 1))
        old_population_size = self.population_size

        for itr in xrange(self.max_iter):
            self._compute_fitness(X, population, pop_labels, dist)
            fittest = np.argmax(self.fitness_, axis=0)
            self.centers_ = np.reshape(population[fittest, :],
                                       (self.n_clusters, n_features))
            labels_old = self.labels_
            self.labels_ = pop_labels[:, fittest].ravel()
            n_same = np.sum(labels_old == self.labels_)
            if 1-n_same/n_samples < self.tol:
                if self.verbose:
                    print("Converged at iteration " + str(itr+1))
                break

            n_remove = self.population_size-old_population_size
            self._selection(population, pop_labels, n_remove)

            self.population_size = old_population_size
            cross_children = self._crossover(population, n_features)
            mut_children = self._mutation(cross_children, n_features)

            np.append(population, cross_children, axis=0)
            np.append(population, mut_children, axis=0)

            old_population_size = self.population_size
            self.population_size = population.shape[0]

            new_labels = rs.randint(self.n_clusters, size=(n_samples,
                                    self.population_size-old_population_size))
            np.append(pop_labels, new_labels, axis=1)

        self.X_fit_ = X
        return self

    def predict(self, X):
        X = self._check_test_data(X)
        n_samples = X.shape[0]
        dist = np.zeros((n_samples, self.n_clusters))
        for i in xrange(n_samples):
            for j in xrange(self.n_clusters):
                cost = euclidean(X[i], self.centers_[j])
                dist[i, j] = cost
        return dist.argmin(axis=1)


def segmentImage():
    img = spm.lena()
    #img = spm.imread("orig.png")
    img = spm.imresize(img, (300, 300))
    height, width = img.shape
    clf = GAclustering(n_clusters=11, max_iter=100, population_size=5,
                       random_state=23, verbose=1)

    img_list = np.reshape(img, (height*width, 1))
    clf.fit(img_list)
    index = np.copy(clf.labels_)
    index = np.reshape(index, (height, width))
    axes = pyplot.gca()
    axes.imshow(index)
    pyplot.show(block=True)


if __name__ == '__main__':
    segmentImage()
	"""Clustering using Genetic Algorithm"""

	# Author: Ishank Gulati <gulati.ishank@gmail.com>

	from __future__ import print_function
	from __future__ import division

	import numpy as np
	from scipy.spatial.distance import euclidean
	import scipy.misc as spm
	from matplotlib import pyplot
	from sklearn.base import BaseEstimator, ClusterMixin
	from sklearn.utils import check_random_state
	from sklearn.utils import check_array


	class GAclustering(BaseEstimator, ClusterMixin):
	"""
	Genetic Algorithm based Clustering

	Reference
	---------
	Genetic algorithm-based clustering technique.
	Ujjwal Maulik, Sanghamitra Bandyoypadhyay.
	2000.
	"""

	def __init__(self, n_clusters=3, population_size=10, max_iter=100,
	prob_crossover=0.8, prob_mutation=0.001, tol=1e-3,
	verbose=0, random_state=None):

	self.n_clusters = n_clusters
	self.population_size = population_size
	self.max_iter = max_iter
	self.prob_crossover = prob_crossover
	self.prob_mutation = prob_mutation
	self.tol = tol
	self.verbose = verbose
	self.random_state = random_state

	def _check_fit_data(self, X):
	X = check_array(X, dtype=np.float64)

	n_samples, n_features = X.shape
	if n_samples < self.n_clusters:
	raise ValueError("Number of samples="+str(n_samples)+" should be\
	greater than number of clusters="+str(self.n_clusters))
	return X

	def _check_test_data(self, X):
	X = check_array(X, dtype=np.float64)

	n_samples, n_features = X.shape
	expected_n_features = self.centers_.shape[1]
	if not expected_n_features == n_features:
	raise ValueError("Incorrect number of features")
	return X

	def _compute_fitness(self, X, population, pop_labels, dist):
	n_samples, n_features = X.shape
	metric = np.zeros((self.population_size, n_features))
	for crm in xrange(self.population_size):
	dist.fill(0)
	centers = np.resize(population[crm], (self.n_clusters, n_features))
	for i in xrange(n_samples):
	for j in xrange(self.n_clusters):
	cost = euclidean(X[i], centers[j])
	dist[i, j] = cost

	pop_labels[:, crm] = dist.argmin(axis=1)
	for i in xrange(self.n_clusters):
	mask = pop_labels[:, crm] == i
	if np.sum(mask) == 0:
	raise ValueError("Empty Cluster")
	temp = X[mask]
	count = np.shape(temp)[0]
	centers[i] = np.sum(temp, axis=0)/count

	metric_temp = np.zeros((count, n_features))
	for j in xrange(count):
	metric_temp[j] = euclidean(temp[j], centers[i])
	metric[crm] += np.sum(metric_temp, axis=0)
	centers = np.reshape(centers, (1, self.n_clusters*n_features))
	population[crm] = centers
	self.fitness_[:, 0] = np.sum(1/metric, axis=1)

	def _selection(self, population, pop_labels, n_remove):
	for i in xrange(n_remove):
	min_idx = np.argmin(self.fitness_, axis=0)
	population = np.delete(population, min_idx, axis=0)
	pop_labels = np.delete(pop_labels, min_idx, axis=1)
	self.fitness_ = np.delete(self.fitness_, min_idx, axis=0)
	self.population_size -= n_remove

	def _crossover(self, population, n_features):
	n_crossovers = int(self.prob_crossover*self.population_size//2)
	children = np.zeros((n_crossovers2, self.n_clustersn_features))
	for cross in xrange(n_crossovers):
	rs = check_random_state(self.random_state)
	parents_idx = rs.randint(self.population_size, size=2)
	children[cross2:cross2 + 2, :] = population[parents_idx]
	cross_point = n_features * rs.randint(1, self.n_clusters)
	temp = children[cross*2, cross_point:]
	children[cross2, cross_point:]=children[cross2 + 1, cross_point:]
	children[cross*2 + 1, cross_point:] = temp
	return children

	def _mutation(self, cross_children, n_features):
	n_cross_child = cross_children.shape[0]
	n_mutations = int(self.prob_mutation * n_cross_child * self.n_clusters
	* n_features)
	children = np.zeros((n_mutations, self.n_clusters*n_features))
	for mut in xrange(n_mutations):
	rs = check_random_state(self.random_state)
	idx = rs.randint(n_cross_child)
	delta = rs.uniform(0, 1)
	gene = rs.randint(self.n_clusters*n_features)
	sign = rs.randint(2)
	children[mut] = cross_children[idx]
	if sign == 0:
	if children[mut, gene] != 0:
	children[mut, gene] += 2deltachildren[mut, gene]
	else:
	children[mut, gene] = 2*delta
	else:
	if children[mut, gene] != 0:
	children[mut, gene] -= 2deltachildren[mut, gene]
	else:
	children[mut, gene] = -2*delta
	return children

	def fit(self, X, y=None):
	X = self._check_fit_data(X)

	if self.max_iter <= 0:
	raise ValueError("Maximum number of iterations must be greater \
	than zero")

	n_samples, n_features = X.shape
	if self.population_size*self.n_clusters > n_samples:
	raise ValueError("Population size is very large")

	rs = check_random_state(self.random_state)
	self.labels_ = np.zeros((n_samples))
	pop_labels = np.zeros((n_samples, self.population_size))
	population = np.zeros((self.population_size, self.n_clusters *
	n_features))

	for crm in xrange(self.population_size):
	population_idx = rs.randint(n_samples, size=self.n_clusters)
	population[crm, :] = X[population_idx].ravel()

	self.centers_ = np.zeros((self.n_clusters, n_features))
	dist = np.zeros((n_samples, self.n_clusters))
	self.fitness_ = np.zeros((self.population_size, 1))
	old_population_size = self.population_size

	for itr in xrange(self.max_iter):
	self._compute_fitness(X, population, pop_labels, dist)
	fittest = np.argmax(self.fitness_, axis=0)
	self.centers_ = np.reshape(population[fittest, :],
	(self.n_clusters, n_features))
	labels_old = self.labels_
	self.labels_ = pop_labels[:, fittest].ravel()
	n_same = np.sum(labels_old == self.labels_)
	if 1-n_same/n_samples < self.tol:
	if self.verbose:
	print("Converged at iteration " + str(itr+1))
	break

	n_remove = self.population_size-old_population_size
	self._selection(population, pop_labels, n_remove)

	self.population_size = old_population_size
	cross_children = self._crossover(population, n_features)
	mut_children = self._mutation(cross_children, n_features)

	np.append(population, cross_children, axis=0)
	np.append(population, mut_children, axis=0)

	old_population_size = self.population_size
	self.population_size = population.shape[0]

	new_labels = rs.randint(self.n_clusters, size=(n_samples,
	self.population_size-old_population_size))
	np.append(pop_labels, new_labels, axis=1)

	self.X_fit_ = X
	return self

	def predict(self, X):
	X = self._check_test_data(X)
	n_samples = X.shape[0]
	dist = np.zeros((n_samples, self.n_clusters))
	for i in xrange(n_samples):
	for j in xrange(self.n_clusters):
	cost = euclidean(X[i], self.centers_[j])
	dist[i, j] = cost
	return dist.argmin(axis=1)


	def segmentImage():
	img = spm.lena()
	#img = spm.imread("orig.png")
	img = spm.imresize(img, (300, 300))
	height, width = img.shape
	clf = GAclustering(n_clusters=11, max_iter=100, population_size=5,
	random_state=23, verbose=1)

	img_list = np.reshape(img, (height*width, 1))
	clf.fit(img_list)
	index = np.copy(clf.labels_)
	index = np.reshape(index, (height, width))
	axes = pyplot.gca()
	axes.imshow(index)
	pyplot.show(block=True)


	if __name__ == '__main__':
	segmentImage()