Skip to content

Instantly share code, notes, and snippets.

@zarch
Created July 9, 2014 15:43
Show Gist options
  • Save zarch/f849df450d622f9eadee to your computer and use it in GitHub Desktop.
make cluster using vector position with GRASS GIS and sklearn
# -*- coding: utf-8 -*-
"""
Created on Tue Jul 8 15:31:10 2014
@author: pietro
"""
import numpy as np
from sklearn.cluster import (MeanShift, MiniBatchKMeans, Ward, KMeans,
SpectralClustering, DBSCAN, AffinityPropagation)
from sklearn.cluster import estimate_bandwidth
from sklearn.neighbors import kneighbors_graph
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
from grass.pygrass.vector import VectorTopo
from grass.pygrass.vector.geometry import Point
# Attribute-table columns for every output vector map: the GRASS category
# key plus the integer cluster label assigned to each point.
COLS = [("cat", "INTEGER PRIMARY KEY"),
("label", "INTEGER")]
def get_data_clusters(X, n_clusters=3, quantile=0.3, n_neighnors=10,
                      with_mean=True, with_std=True,
                      meanshift=None, minibatchkmeans=None, ward=None,
                      spectral=None, dbscan=None, affinity=None, kmeans=None,
                      n_neighbors=None):
    """Scale the input points and build a dictionary of clustering estimators.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Point coordinates (or features) to cluster.
    n_clusters : int
        Number of clusters for the estimators that require it.
    quantile : float
        Quantile passed to ``estimate_bandwidth`` for MeanShift.
    n_neighnors : int
        Neighbours used to build the Ward connectivity graph.  The name is
        misspelled but kept for backward compatibility; prefer
        ``n_neighbors``, which takes precedence when given.
    with_mean, with_std : bool
        Forwarded to ``StandardScaler``.
    meanshift, minibatchkmeans, ward, spectral, dbscan, affinity, kmeans :
        dict or None
        Extra keyword arguments for the corresponding estimator.
    n_neighbors : int or None
        Correctly-spelled alias for ``n_neighnors``.

    Returns
    -------
    (X_scaled, clusters) : (ndarray, dict)
        The scaled data and a dict mapping a short name to an *unfitted*
        scikit-learn clustering estimator.
    """
    # correctly-spelled alias wins over the legacy misspelled parameter
    if n_neighbors is None:
        n_neighbors = n_neighnors
    # replace None with empty option dicts (avoids mutable default args)
    meanshift = meanshift or {}
    minibatchkmeans = minibatchkmeans or {}
    ward = ward or {}
    spectral = spectral or {}
    dbscan = dbscan or {}
    affinity = affinity or {}
    kmeans = kmeans or {}
    # normalize dataset for easier parameter selection
    scaler = StandardScaler(with_mean=with_mean, with_std=with_std)
    X = scaler.fit_transform(X)
    # estimate bandwidth for mean shift
    bandwidth = estimate_bandwidth(X, quantile=quantile)
    # connectivity matrix for structured Ward
    connectivity = kneighbors_graph(X, n_neighbors=n_neighbors)
    # make connectivity symmetric
    connectivity = 0.5 * (connectivity + connectivity.T)
    # create clustering estimators (note: the "kmean" key spelling is part
    # of the public output — it becomes the output vector-map name)
    clusters = {"kmean": KMeans(n_clusters=n_clusters, **kmeans),
                "meanshift": MeanShift(bandwidth=bandwidth, **meanshift),
                "minibatchkmeans": MiniBatchKMeans(n_clusters=n_clusters,
                                                   **minibatchkmeans),
                "ward": Ward(n_clusters=n_clusters, connectivity=connectivity,
                             **ward),
                "spectral": SpectralClustering(n_clusters=n_clusters,
                                               **spectral),
                "dbscan": DBSCAN(**dbscan),
                "affinity": AffinityPropagation(**affinity)}
    return X, clusters
def mkcluster(vname, mapset='', overwrite=False,
              linestyle='', marker='o', markersize=5,
              palette='husl', context='paper',
              figname='%s.svg', save=None, **options):
    """Cluster the points of a GRASS vector map, write and plot the result.

    For every clustering estimator returned by ``get_data_clusters`` this
    fits the estimator on the (scaled) point coordinates, writes a new
    vector map — named after the estimator — whose attribute table holds
    the cluster label of each point, and draws a scatter plot coloured by
    label.

    Parameters
    ----------
    vname : str
        Name of the input vector point map.
    mapset : str
        GRASS mapset of the input map ('' means the current mapset).
    overwrite : bool
        Overwrite existing output vector maps.
    linestyle, marker, markersize : matplotlib point-style options.
    palette, context : seaborn palette name and plotting context.
    figname : str
        ``%``-format template for the saved figure file name; it receives
        the estimator name.
    save : dict or None
        When given, the figure is saved and ``save`` is forwarded to
        ``fig.savefig``.
    **options
        Forwarded to ``get_data_clusters``.
    """
    # open the input vector map and collect the point coordinates
    print("Reading %s vector points" %
          ("%s@%s" % (vname, mapset) if mapset else vname))
    with VectorTopo(vname, mapset, mode="r") as ivct:
        coords = np.array([pnt.coords() for pnt in ivct])
    # keep the original (unscaled) coordinates for writing/plotting
    xcoords, ycoords = coords.T
    scaled, clusters = get_data_clusters(coords, **options)
    for clust in sorted(clusters.keys()):
        # open the output vector map
        with VectorTopo(clust, mode='w', tab_cols=COLS,
                        overwrite=overwrite) as ovct:
            print("Computing the cluster using: %s" % clust)
            # builtin int: the np.int alias was deprecated in NumPy 1.20
            # and removed in 1.24
            labels = clusters[clust].fit(scaled).labels_.astype(int)
            # write the points to the vector map
            print("Writing vector map: %s" % clust)
            for x, y, l in zip(xcoords, ycoords, labels):
                ovct.write(Point(x, y), attrs=(l, ))
        labs = sorted(set(labels))
        fig, ax = plt.subplots()
        ax.set_title(repr(clusters[clust]))
        sns.despine()
        sns.set_context(context)
        # guard against an empty label set before peeking at labs[0]
        if labs and labs[0] == -1:
            # points without label (e.g. DBSCAN noise) => smaller, in black
            indx = labels == -1
            ax.plot(xcoords[indx], ycoords[indx], linestyle='',
                    marker=marker, markersize=int(markersize*0.7),
                    color=(0, 0, 0), label=str(-1))
            labs.pop(0)
        colors = sns.color_palette(palette, len(labs))
        for label, color in zip(labs, colors):
            indx = labels == label
            ax.plot(xcoords[indx], ycoords[indx], linestyle=linestyle,
                    marker=marker, markersize=markersize,
                    color=color, label=str(label))
        ax.legend(*ax.get_legend_handles_labels(), loc='best')
        if save:
            fig.savefig(figname % clust, **save)
# Per-estimator parameters forwarded to get_data_clusters().
# NOTE(review): the "n_neighnors" key is misspelled on purpose — it must
# match the (equally misspelled) keyword of get_data_clusters above.
options = {"n_clusters": 5,
"quantile": 0.1,
"n_neighnors":30,
"meanshift": dict(bin_seeding=True),
"minibatchkmeans": dict(),
"ward": dict(),
"spectral": dict(eigen_solver='arpack',
affinity="nearest_neighbors"),
"dbscan": dict(eps=0.2, min_samples=5),
"affinity": dict(damping=.9, preference=-200),
"kmeans": dict(n_init=1)}
# Run all the clusterings on the "points_of_interest" map of the PERMANENT
# mapset, overwriting existing outputs and saving one SVG per estimator.
mkcluster("points_of_interest", "PERMANENT", overwrite=True,
save=dict(dpi=300, format='svg', bbox_inches='tight'), **options)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment