Skip to content

Instantly share code, notes, and snippets.

@hershkoy
Last active April 28, 2020 18:12
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save hershkoy/d1bcc85da46bb650b95be0b9ced3bbce to your computer and use it in GitHub Desktop.
Save hershkoy/d1bcc85da46bb650b95be0b9ced3bbce to your computer and use it in GitHub Desktop.
KNN clustering
Start with one random point in first cluster.
"Grow" the cluster to near neighbors (below threshold).
Repeat until all cluster is discovered.
Then start with new cluster.
Agglomerate small clusters at the end
Benefits:
Unsupervised clustering (not need to supply k like in kmeans).
Drawbacks:
running time is proportional to number of points (so doesn't scale well). But good enough for few thousands points
threshold = 0.2
min_cluster_ratio = 0.07 # percent of number of points
all_points = set(np.arange(X_pca.shape[0]))
remaining_points = all_points
clustered_points = set()
clusters = []
outliers = []
cluster = set([0])
points2chk = set([0])
idx2cluster = (np.ones(X_pca.shape[0])*-1).astype('int32')
min_cluster_size = round(X_pca.shape[0]*min_cluster_ratio)
dist = cdist(X_pca,X_pca,'euclidean')
while True:
#print("cluster:",len(cluster))
pos = points2chk.pop()
new_points = indices[pos,:]
new_points = set(new_points)-clustered_points
points2chk.update(new_points-cluster)
cluster.update(new_points)
if len(points2chk)==0:
#try to add more points to current cluster that are within threshold distance from cluster points
candidates = dist[list(cluster),:]
candidates = set(np.transpose((candidates<threshold).nonzero())[:,1])
points2chk.update(candidates-cluster-clustered_points)
if len(points2chk)==0:
#try to start a new cluster
print("remaining_points:",len(remaining_points),"cluster:",len(cluster))
idx2cluster[list(cluster)]=len(clusters)
clusters.append(cluster)
clustered_points = clustered_points.union(cluster)
remaining_points= remaining_points-cluster
if len(remaining_points)==0:
break
pos = np.argmin(distances[list(remaining_points),1])
pos = list(remaining_points)[pos]
cluster = set([pos])
points2chk = set([pos])
#agglomerate small clusters
num_points_in_cluster = [len(c) for c in clusters]
distance_to_near_cluster=[]
closest_cluster = (np.array([])).astype('int32')
avg_within_cluster_distance = []
for i in range(len(clusters)):
c= clusters[i]
avg_within_cluster_distance.append(np.mean(distances[list(c),1:]))
non_cluster_points = list(all_points-c)
non_cluster_distance_matrix = dist[np.ix_(list(c),non_cluster_points)]
distance_to_near_cluster.append(np.min(non_cluster_distance_matrix))
pos = np.argmin(np.min(non_cluster_distance_matrix,axis=0))
pos = non_cluster_points[pos]
closest_cluster = np.append(closest_cluster,idx2cluster[pos])
for i in range(len(clusters)):
if num_points_in_cluster[i]>min_cluster_size:
continue
clusters[closest_cluster[i]].update(clusters[i])
closest_cluster[closest_cluster==i] = closest_cluster[i]
clusters[i] = set()
clusters = [c for c in clusters if len(c)>0]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment