hershkoy/KNN ( K-nearest neighbours) Clustering.txt

## KNN ( K-nearest neighbours) Clustering.txt
Start with one random point in first cluster.
"Grow" the cluster to near neighbors (below threshold).
Repeat until all cluster is discovered.
Then start with new cluster.
Agglomerate small clusters at the end

Benefits:
Unsupervised clustering (not need to supply k like in kmeans).

Drawbacks:
running time is proportional to number of points (so doesn't scale well). But good enough for few thousands points

## knn_clustering.py
threshold = 0.2
min_cluster_ratio = 0.07 # percent of number of points

all_points = set(np.arange(X_pca.shape[0]))
remaining_points = all_points
clustered_points = set()
clusters = []
outliers = []
cluster = set([0])
points2chk = set([0])
idx2cluster = (np.ones(X_pca.shape[0])*-1).astype('int32')

min_cluster_size = round(X_pca.shape[0]*min_cluster_ratio)
dist = cdist(X_pca,X_pca,'euclidean')

while True:
    #print("cluster:",len(cluster))
    pos = points2chk.pop()
    new_points = indices[pos,:]
    new_points = set(new_points)-clustered_points
    points2chk.update(new_points-cluster)
    cluster.update(new_points)

    if len(points2chk)==0:
        #try to add more points to current cluster that are within threshold distance from cluster points
        candidates = dist[list(cluster),:]
        candidates = set(np.transpose((candidates<threshold).nonzero())[:,1])
        points2chk.update(candidates-cluster-clustered_points)

    if len(points2chk)==0:
        #try to start a new cluster
        print("remaining_points:",len(remaining_points),"cluster:",len(cluster))
        idx2cluster[list(cluster)]=len(clusters)
        clusters.append(cluster)
        clustered_points = clustered_points.union(cluster)
        remaining_points= remaining_points-cluster
        if len(remaining_points)==0:
            break

        pos = np.argmin(distances[list(remaining_points),1])
        pos = list(remaining_points)[pos]
        cluster = set([pos])
        points2chk = set([pos])

#agglomerate small clusters
num_points_in_cluster = [len(c) for c in clusters]

distance_to_near_cluster=[]
closest_cluster = (np.array([])).astype('int32')
avg_within_cluster_distance = []
for i in range(len(clusters)):
    c= clusters[i]
    avg_within_cluster_distance.append(np.mean(distances[list(c),1:]))
    non_cluster_points = list(all_points-c)
    non_cluster_distance_matrix = dist[np.ix_(list(c),non_cluster_points)]
    distance_to_near_cluster.append(np.min(non_cluster_distance_matrix))
    pos = np.argmin(np.min(non_cluster_distance_matrix,axis=0))
    pos = non_cluster_points[pos]
    closest_cluster = np.append(closest_cluster,idx2cluster[pos])

for i in range(len(clusters)):
    if num_points_in_cluster[i]>min_cluster_size:
        continue
    clusters[closest_cluster[i]].update(clusters[i])
    closest_cluster[closest_cluster==i] = closest_cluster[i]
    clusters[i] = set()

clusters = [c for c in clusters if len(c)>0]
	Start with one random point in first cluster.
	"Grow" the cluster to near neighbors (below threshold).
	Repeat until all cluster is discovered.
	Then start with new cluster.
	Agglomerate small clusters at the end

	Benefits:
	Unsupervised clustering (not need to supply k like in kmeans).

	Drawbacks:
	running time is proportional to number of points (so doesn't scale well). But good enough for few thousands points
	threshold = 0.2
	min_cluster_ratio = 0.07 # percent of number of points

	all_points = set(np.arange(X_pca.shape[0]))
	remaining_points = all_points
	clustered_points = set()
	clusters = []
	outliers = []
	cluster = set([0])
	points2chk = set([0])
	idx2cluster = (np.ones(X_pca.shape[0])*-1).astype('int32')

	min_cluster_size = round(X_pca.shape[0]*min_cluster_ratio)
	dist = cdist(X_pca,X_pca,'euclidean')

	while True:
	#print("cluster:",len(cluster))
	pos = points2chk.pop()
	new_points = indices[pos,:]
	new_points = set(new_points)-clustered_points
	points2chk.update(new_points-cluster)
	cluster.update(new_points)

	if len(points2chk)==0:
	#try to add more points to current cluster that are within threshold distance from cluster points
	candidates = dist[list(cluster),:]
	candidates = set(np.transpose((candidates<threshold).nonzero())[:,1])
	points2chk.update(candidates-cluster-clustered_points)

	if len(points2chk)==0:
	#try to start a new cluster
	print("remaining_points:",len(remaining_points),"cluster:",len(cluster))
	idx2cluster[list(cluster)]=len(clusters)
	clusters.append(cluster)
	clustered_points = clustered_points.union(cluster)
	remaining_points= remaining_points-cluster
	if len(remaining_points)==0:
	break

	pos = np.argmin(distances[list(remaining_points),1])
	pos = list(remaining_points)[pos]
	cluster = set([pos])
	points2chk = set([pos])

	#agglomerate small clusters
	num_points_in_cluster = [len(c) for c in clusters]

	distance_to_near_cluster=[]
	closest_cluster = (np.array([])).astype('int32')
	avg_within_cluster_distance = []
	for i in range(len(clusters)):
	c= clusters[i]
	avg_within_cluster_distance.append(np.mean(distances[list(c),1:]))
	non_cluster_points = list(all_points-c)
	non_cluster_distance_matrix = dist[np.ix_(list(c),non_cluster_points)]
	distance_to_near_cluster.append(np.min(non_cluster_distance_matrix))
	pos = np.argmin(np.min(non_cluster_distance_matrix,axis=0))
	pos = non_cluster_points[pos]
	closest_cluster = np.append(closest_cluster,idx2cluster[pos])

	for i in range(len(clusters)):
	if num_points_in_cluster[i]>min_cluster_size:
	continue
	clusters[closest_cluster[i]].update(clusters[i])
	closest_cluster[closest_cluster==i] = closest_cluster[i]
	clusters[i] = set()

	clusters = [c for c in clusters if len(c)>0]