Skip to content

Instantly share code, notes, and snippets.

@willtownes
Created August 22, 2018 15:53
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save willtownes/81c92018b32d16a6bca9f651a00182ae to your computer and use it in GitHub Desktop.
Save willtownes/81c92018b32d16a6bca9f651a00182ae to your computer and use it in GitHub Desktop.
Fuzzy Clustering Jaccard and Rand Indices
#Fuzzy version of Jaccard and Rand Indices
#based on Suleman: "Assessing a Fuzzy Extension of Rand Index and Related Measures"
#Simulated example data; the objects defined here are the inputs to the checks
#that follow the definition of cluster_similarity.
L<-4 #number of clusters
N<-50 #number of objects
X<-gtools::rdirichlet(N,rep(.01,L)) #NxL soft random clustering (Dirichlet draws, concentration .01 per cluster)
y<-apply(X,1,which.max) #hard labels: column index of the largest membership in each row of X
table(y) #tabulate the sizes of the hard clusters
Y<-model.matrix(~factor(y)-1) #0/1 indicator-matrix version of y (one column per label that occurs in y)
Z<-matrix(1/L,nrow=N,ncol=L) #perfect uncertainty soft clustering: every membership equals 1/L
W<-t(rmultinom(N,1,rep(1/L,L))) #a different random hard clustering: N one-hot rows, clusters equally likely
w<-apply(W,1,which.max) #hard labels corresponding to the one-hot rows of W
cluster_similarity<-function(P,Pstar,index=c("jaccard","rand")){
  #Fuzzy Jaccard / Rand similarity between two clusterings of the same objects.
  #
  #Arguments:
  #  P, Pstar: fuzzy clusterings given as matrices (objects in rows, clusters in
  #            cols). Either may instead be a vector or factor of hard cluster
  #            labels, one per object; it is expanded to a 0/1 membership matrix.
  #  index:    "jaccard" (default) or "rand".
  #Value:
  #  A single number. Jaccard returns 0 when the "together in both" mass is 0
  #  (e.g. all clusters are singletons in both clusterings).
  #Errors:
  #  if hard labels contain NA, or if P and Pstar describe different numbers of
  #  objects (the pairwise quantities below would otherwise be silently recycled).
  index<-match.arg(index)

  #Expand hard labels into an N x K 0/1 membership matrix. Built by direct
  #indexing rather than model.matrix(~factor(x)-1), which fails when there is
  #only one cluster ("contrasts can be applied only to factors with 2 or more
  #levels").
  as_membership<-function(labels){
    f<-factor(labels)
    if(anyNA(f)){ stop("hard cluster labels must not contain NA",call.=FALSE) }
    m<-matrix(0,nrow=length(f),ncol=nlevels(f))
    m[cbind(seq_along(f),as.integer(f))]<-1
    m
  }
  #anything without a dim attribute (plain vector, factor, ...) is treated as
  #hard labels; is.vector() would wrongly reject factors and named attributes
  if(is.null(dim(P))){ P<-as_membership(P) }
  if(is.null(dim(Pstar))){ Pstar<-as_membership(Pstar) }
  if(nrow(P)!=nrow(Pstar)){
    stop("P and Pstar must describe the same number of objects",call.=FALSE)
  }

  #pairwise co-membership degree for every pair of objects:
  #1 - (half the Manhattan distance between their membership rows)
  u<-1-as.vector(dist(Pstar,method="manhattan"))/2
  v<-1-as.vector(dist(P,method="manhattan"))/2
  agree<-1-abs(u-v) #how closely the two clusterings agree on each pair

  both_together<-sum(agree*u*v)   #"a": pairs together in both clusterings
  only_pstar<-sum(pmax(u-v,0))    #"b": excess co-membership in Pstar over P
  only_p<-sum(pmax(v-u,0))        #"c": excess co-membership in P over Pstar

  if(index=="jaccard"){
    if(both_together==0){ return(0) } #edge case where all clusters are singleton
    return(both_together/(both_together+only_pstar+only_p))
  } else if(index=="rand"){
    both_apart<-sum(agree*(1-u*v)) #"d": pairs apart in both clusterings
    return((both_together+both_apart)/
             (both_together+only_pstar+only_p+both_apart))
  }
}
#Sanity checks for cluster_similarity, using the simulated objects defined
#before the function (X, Y, Z, W, w, y).
#Exact-looking checks use isTRUE(all.equal(...)) rather than ==: the results
#are floating-point sums, and e.g. the symmetry check adds the same terms in a
#different order, so == can report FALSE for a difference of one rounding error.

#check agreement with existing packages for hard clusterings
#(each pair of printed values should match)
clusteval::cluster_similarity(w,y,"jaccard")
cluster_similarity(W,Y,"jaccard")
clusteval::cluster_similarity(w,y,"rand")
cluster_similarity(W,Y,"rand")
#check that similarity=1 when comparing something to itself
isTRUE(all.equal(cluster_similarity(X,X),1))
isTRUE(all.equal(cluster_similarity(X,X,"rand"),1))
#check symmetry
isTRUE(all.equal(cluster_similarity(X,Y),cluster_similarity(Y,X)))
#jaccard=0 when all singleton clusters
isTRUE(all.equal(cluster_similarity(c(1,2,3),c(1,2,3)),0))
#check high value for a hard clustering and the very similar soft clustering
#it was derived from
cluster_similarity(X,Y)
cluster_similarity(X,Y,"rand")
#check low value for a random soft clustering vs. the perfectly uncertain
#(uniform) soft clustering
cluster_similarity(X,Z)
cluster_similarity(X,Z,"rand")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment