willtownes/fuzzy_clustering_eval.R

## fuzzy_clustering_eval.R
#Fuzzy version of Jaccard and Rand Indices
#based on Suleman: "Assessing a Fuzzy Extension of Rand Index and Related Measures"

L<-4 #number of clusters
N<-50 #number of objects
X<-gtools::rdirichlet(N,rep(.01,L)) #NxL soft random clustering
y<-apply(X,1,which.max) #hard cluster version of X
table(y)
Y<-model.matrix(~factor(y)-1) #hard cluster version of X
Z<-matrix(1/L,nrow=N,ncol=L) #perfect uncertainty soft clustering
W<-t(rmultinom(N,1,rep(1/L,L))) #a different random hard clustering
w<-apply(W,1,which.max)

cluster_similarity<-function(P,Pstar,index=c("jaccard","rand")){
  #P,Pstar are fuzzy clusterings with objects in rows, clusters in cols (matrices)
  #alternatively, P or Pstar can be vectors indicating hard clustering assignments
  #if(is.vector(P) && is.vector(Pstar)){
    #both are hard clusterings
  #  return(clusteval::cluster_similarity(P,Pstar,similarity=index))
  #}
  index<-match.arg(index)
  if(is.vector(P)){ P<-model.matrix(~factor(P)-1) }
  if(is.vector(Pstar)){ Pstar<-model.matrix(~factor(Pstar)-1) }
  u<-1-dist(Pstar,method="manhattan")/2
  v<-1-dist(P,method="manhattan")/2
  a<-sum((1-abs(u-v))*u*v)
  b<-sum(pmax(u-v,0))
  c<-sum(pmax(v-u,0))
  if(index=="jaccard"){
    if(a==0){ return(0) } #edge case where all clusters are singleton
    return(a/(a+b+c))
  } else if(index=="rand"){
    d<-sum((1-abs(u-v))*(1-u*v))
    return((a+d)/(a+b+c+d))
  }
}

#check agreement with existing packages for hard clusterings
clusteval::cluster_similarity(w,y,"jaccard")
cluster_similarity(W,Y,"jaccard")
clusteval::cluster_similarity(w,y,"rand")
cluster_similarity(W,Y,"rand")
#check that similarity=1 when comparing something to itself
cluster_similarity(X,X)==1
cluster_similarity(X,X,"rand")==1
#check symmetry
cluster_similarity(X,Y)==cluster_similarity(Y,X)
#jaccard=0 when all singleton clusters
cluster_similarity(c(1,2,3),c(1,2,3))==0
#check high value for hard clustering and very similar soft cl
cluster_similarity(X,Y)
cluster_similarity(X,Y,"rand")
#check low value for two random soft clusterings
cluster_similarity(X,Z)
cluster_similarity(X,Z,"rand")
	#Fuzzy version of Jaccard and Rand Indices
	#based on Suleman: "Assessing a Fuzzy Extension of Rand Index and Related Measures"

	L<-4 #number of clusters
	N<-50 #number of objects
	X<-gtools::rdirichlet(N,rep(.01,L)) #NxL soft random clustering
	y<-apply(X,1,which.max) #hard cluster version of X
	table(y)
	Y<-model.matrix(~factor(y)-1) #hard cluster version of X
	Z<-matrix(1/L,nrow=N,ncol=L) #perfect uncertainty soft clustering
	W<-t(rmultinom(N,1,rep(1/L,L))) #a different random hard clustering
	w<-apply(W,1,which.max)

	cluster_similarity<-function(P,Pstar,index=c("jaccard","rand")){
	#P,Pstar are fuzzy clusterings with objects in rows, clusters in cols (matrices)
	#alternatively, P or Pstar can be vectors indicating hard clustering assignments
	#if(is.vector(P) && is.vector(Pstar)){
	#both are hard clusterings
	# return(clusteval::cluster_similarity(P,Pstar,similarity=index))
	#}
	index<-match.arg(index)
	if(is.vector(P)){ P<-model.matrix(~factor(P)-1) }
	if(is.vector(Pstar)){ Pstar<-model.matrix(~factor(Pstar)-1) }
	u<-1-dist(Pstar,method="manhattan")/2
	v<-1-dist(P,method="manhattan")/2
	a<-sum((1-abs(u-v))uv)
	b<-sum(pmax(u-v,0))
	c<-sum(pmax(v-u,0))
	if(index=="jaccard"){
	if(a==0){ return(0) } #edge case where all clusters are singleton
	return(a/(a+b+c))
	} else if(index=="rand"){
	d<-sum((1-abs(u-v))(1-uv))
	return((a+d)/(a+b+c+d))
	}
	}

	#check agreement with existing packages for hard clusterings
	clusteval::cluster_similarity(w,y,"jaccard")
	cluster_similarity(W,Y,"jaccard")
	clusteval::cluster_similarity(w,y,"rand")
	cluster_similarity(W,Y,"rand")
	#check that similarity=1 when comparing something to itself
	cluster_similarity(X,X)==1
	cluster_similarity(X,X,"rand")==1
	#check symmetry
	cluster_similarity(X,Y)==cluster_similarity(Y,X)
	#jaccard=0 when all singleton clusters
	cluster_similarity(c(1,2,3),c(1,2,3))==0
	#check high value for hard clustering and very similar soft cl
	cluster_similarity(X,Y)
	cluster_similarity(X,Y,"rand")
	#check low value for two random soft clusterings
	cluster_similarity(X,Z)
	cluster_similarity(X,Z,"rand")