public
Last active

Python implementation of SCAN: A Structural Clustering Algorithm for Networks

  • Download Gist
Scan.py
Python
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104
# -*- coding: utf-8 -*-
 
"""
SCAN: A Structural Clustering Algorithm for Networks
As described in http://ualr.edu/nxyuruk/publications/kdd07.pdf
"""
 
from collections import deque
import numpy as np
from scipy.sparse import csr_matrix
 
def struct_similarity(vcols, wcols):
""" Compute the similartiy normalized on geometric mean of vertices"""
# count the similar rows for unioning edges
count = [index for index in wcols if (index in vcols)]
# geomean
#need to account for vertex itself, add 2(1 for each vertex)
ans = (len(count) +2) / (((vcols.size+1)*(wcols.size+1)) ** .5)
return ans
 
def neighborhood(G, vertex_v, eps):
""" Returns the neighbors, as well as all the connected vertices """
N = deque()
vcols = vertex_v.tocoo().col
#check the similarity for each connected vertex
for index in vcols:
wcols = G[index,:].tocoo().col
if struct_similarity(vcols, wcols)> eps:
N.append(index)
return N, vcols
 
def scan(G, eps =0.7, mu=2):
"""
Vertex Structure = sum of row + itself(1)
Structural Similarity is the geometric mean of the 2Vertex size of structure
"""
c = 0
v = G.shape[0]
# All vertices are labeled as unclassified(-1)
vertex_labels = -np.ones(v)
# start with a neg core(every new core we incr by 1)
cluster_id = -1
for vertex in xrange(v):
N ,vcols = neighborhood(G, G[vertex,:],eps)
# must include vertex itself
N.appendleft(vertex)
if len(N) >= mu:
#print "we have a cluster at: %d ,with length %d " % (vertex, len(N))
# gen a new cluster id (0 indexed)
cluster_id +=1
while N:
y = N.pop()
R , ycols = neighborhood(G, G[y,:], eps)
# include itself
R.appendleft(y)
# (struct reachable) check core and if y is connected to vertex
if len(R) >= mu and y in vcols:
#print "we have a structure Reachable at: %d ,with length %d " % (y, len(R))
while R:
r = R.pop()
label = vertex_labels[r]
# if unclassified or non-member
if (label == -1) or (label==0):
vertex_labels[r] = cluster_id
# unclassified ??
if label == -1:
N.appendleft(r)
else:
vertex_labels[vertex] = 0
#classify non-members
for index in np.where(vertex_labels ==0)[0]:
ncols= G[index,:].tocoo().col
if len(ncols) >=2:
## mark as a hub
vertex_labels[index] = -2
continue
else:
## mark as outlier
vertex_labels[index] = -3
continue
 
return vertex_labels
 
if __name__=='__main__':
 
# Based on Example from paper
rows = [0,0,0,0,1,1,1,2,2,2,3,3,3,3,4,4,4,4,
5,5,5,5,5,6,6,6,6,6,6,7,7,7,7,8,8,8,
9,9,9,9,10,10,10,10,11,11,11,11,
12,12,12,12,12,13]
cols = [1,4,5,6,0,5,2,1,5,3,2,4,5,6,0,3,5,6,
0,1,2,3,4,4,0,3,7,11,10,6,11,12,8,7,
12,9,8,12,10,13,9,12,11,6,7,12,10,6,
7,8,9,10,11,9]
data = np.ones(len(rows))
G =csr_matrix((data,(rows,cols)),shape=(14,14))
 
 
#print G.todense()
#print neighborhood(G, G[0,:],.4 )
print scan(G,.7, 2)

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.