@gdbassett
Created December 12, 2014 21:59
Efficient Python implementation of canopy clustering. (A method for efficiently generating centroids and clusters, most commonly used as input to a more robust clustering algorithm.)
from sklearn.metrics.pairwise import pairwise_distances
import numpy as np

# X should be a numpy matrix, very likely a sparse matrix: http://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.sparse.csr_matrix.html#scipy.sparse.csr_matrix
# T1 = distance to centroid point to not include in other clusters
# T2 = distance to centroid point to include in cluster
# T1 > T2 for overlapping clusters
# T1 < T2 will have points which reside in no clusters
# T1 == T2 will cause all points to reside in mutually exclusive clusters
# The distance metric can be any from here: http://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.pairwise_distances.html
# filemap may be a list of point names in their order in X. If included, row numbers from X will be replaced with names from filemap.
def canopy(X, T1, T2, distance_metric='euclidean', filemap=None):
    canopies = dict()
    X1_dist = pairwise_distances(X, metric=distance_metric)
    canopy_points = set(range(X.shape[0]))
    while canopy_points:
        point = canopy_points.pop()
        i = len(canopies)
        # Points within T2 of the chosen point form the canopy.
        canopies[i] = {"c": point, "points": list(np.where(X1_dist[point] < T2)[0])}
        # Points within T1 are no longer eligible to seed new canopies.
        canopy_points = canopy_points.difference(set(np.where(X1_dist[point] < T1)[0]))
    if filemap:
        # Replace row indices with the supplied point names.
        for canopy_id in list(canopies.keys()):  # list() so the dict can be modified while iterating
            canopy = canopies.pop(canopy_id)
            canopy2 = {"c": filemap[canopy['c']], "points": list()}
            for point in canopy['points']:
                canopy2["points"].append(filemap[point])
            canopies[canopy_id] = canopy2
    return canopies
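
A minimal usage sketch of the function above; the toy data and point names are invented purely for illustration:

# Usage sketch (made-up data): two tight groups of 2-D points.
import numpy as np

X = np.array([[0.0, 0.0],
              [0.1, 0.0],
              [0.2, 0.1],
              [5.0, 5.0],
              [5.1, 4.9]])
names = ["a", "b", "c", "d", "e"]

# T1=1.0, T2=0.5: points within 0.5 of a chosen point join its canopy,
# points within 1.0 are no longer eligible to seed new canopies.
result = canopy(X, T1=1.0, T2=0.5, distance_metric='euclidean', filemap=names)
print(result)  # dict mapping canopy id -> {"c": centroid name, "points": [member names]}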
@danielgarr

Hello gdbassett.

Thanks for the implementation. It is efficient indeed. However, in "standard" canopy, points inside T2 are not eligible as centers or as points for new canopies.

Because this implementation always looks up canopy members in X1_dist and only removes points from canopy_points, it adds extra points to new canopies.

You can try it with T1 = T2 (which makes the algorithm useless :) ). In that scenario each point is supposed to belong to one and only one canopy...
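
For illustration, a minimal sketch of the "standard" behaviour described above, using the textbook convention where T1 > T2, points within T1 join the canopy, and points within T2 are dropped from the candidate pool entirely. The function name canopy_strict is made up and this is not part of the original gist:

# Hedged sketch: points within T2 of a chosen center are removed from the pool,
# so they can neither seed new canopies nor be assigned to one again.
from sklearn.metrics.pairwise import pairwise_distances
import numpy as np

def canopy_strict(X, T1, T2, distance_metric='euclidean'):
    canopies = dict()
    X1_dist = pairwise_distances(X, metric=distance_metric)
    candidates = set(range(X.shape[0]))
    while candidates:
        point = candidates.pop()
        # Still-unassigned candidates within the loose threshold T1 join this canopy.
        members = [p for p in candidates if X1_dist[point][p] < T1]
        canopies[len(canopies)] = {"c": point, "points": [point] + members}
        # Candidates within the tight threshold T2 are dropped for good;
        # members between T2 and T1 remain available to other canopies.
        candidates -= {p for p in candidates if X1_dist[point][p] < T2}
    return canopies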

@kowc

kowc commented Jan 28, 2017

Hi gdbassett,
I am new to Python; can you please tell me what data is stored in the numpy matrix X?
I came to know that canopy clustering works on a single attribute, so what is the matrix X about?
Thank you in advance.

@mohammadreza-gim

Is the clustering result influenced by the values of T1 and T2?

@wesleymsmith

wesleymsmith commented May 24, 2021

One potential problem here is that this method relies on computing the distance matrix directly. This is essentially a brute-force approach, so the time and memory requirements will be at best O(n^2). That may be fine for smallish data sets, since computing the pairwise distances may be quite fast. However, for application to large datasets, you might want to consider using something like a ball tree or KD tree. That is, instead of computing the distance matrix directly, build an sklearn.neighbors.BallTree, then use its query_radius function to obtain a 'sparse' distance matrix. This can backfire if T1 and T2 are too large, but if T1 and T2 are small relative to the volume / density of your clustering, this could yield considerable savings.
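
For illustration, a minimal sketch of the BallTree approach suggested above, mirroring the gist's T1/T2 roles. It assumes a dense X and a metric supported by BallTree (e.g. euclidean); the function name canopy_balltree is made up:

# Hedged sketch: query neighborhoods on demand instead of materializing the
# full n x n distance matrix.
from sklearn.neighbors import BallTree
import numpy as np

def canopy_balltree(X, T1, T2, metric='euclidean'):
    tree = BallTree(X, metric=metric)
    canopies = dict()
    canopy_points = set(range(X.shape[0]))
    while canopy_points:
        point = canopy_points.pop()
        # Indices of all points within T2 / T1 of the chosen center.
        inside_T2 = tree.query_radius(X[point:point + 1], r=T2)[0]
        inside_T1 = tree.query_radius(X[point:point + 1], r=T1)[0]
        canopies[len(canopies)] = {"c": point, "points": list(inside_T2)}
        # Points within T1 are no longer eligible to seed new canopies.
        canopy_points -= set(inside_T1)
    return canopies

Memory drops from O(n^2) to roughly the size of the returned neighborhoods, at the cost of two radius queries per canopy.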

@MitraMitraMitra

I think that in lines 5, 8 and 9 the signs should be reversed.

@EmersonHernly

I agree with MitraMitraMitra; those lines seem backwards. Also, if you are using this on larger datasets, you can use pairwise_distances_chunked to avoid the need to compute the distance matrix all at once.
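
For illustration, a minimal sketch of the chunked approach suggested above, using sklearn.metrics.pairwise_distances_chunked with a reduce_func so the full distance matrix never has to be held in memory at once. The helper name neighbor_lists is made up:

# Hedged sketch: collect, for each point, the indices of neighbors within T1 and T2
# by streaming row-blocks of the distance matrix.
import numpy as np
from sklearn.metrics import pairwise_distances_chunked

def neighbor_lists(X, T1, T2, metric='euclidean'):
    within_T1, within_T2 = [], []

    def reduce_func(D_chunk, start):
        # For each row of this chunk, keep only the column indices inside each radius.
        t1 = [np.flatnonzero(row < T1) for row in D_chunk]
        t2 = [np.flatnonzero(row < T2) for row in D_chunk]
        return t1, t2

    for t1, t2 in pairwise_distances_chunked(X, reduce_func=reduce_func, metric=metric):
        within_T1.extend(t1)
        within_T2.extend(t2)
    return within_T1, within_T2

# These per-point neighbor lists could then drive the same canopy loop as the gist,
# replacing the X1_dist lookups.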
