Skip to content

Instantly share code, notes, and snippets.

@wikii122
Last active May 22, 2016 14:37
Show Gist options
  • Save wikii122/49e0b62a58c4ff74b10ec1bceedba0df to your computer and use it in GitHub Desktop.
Save wikii122/49e0b62a58c4ff74b10ec1bceedba0df to your computer and use it in GitHub Desktop.
Clustering
############ Laboratory task ###################
#' Accuracy of a clustering, computed from a class-by-cluster table.
#'
#' Every selected column contributes its largest cell; the sum of those
#' column maxima is divided by the total of the whole table (all columns,
#' including any before `startCol`).
#'
#' @param confTbl Two-dimensional contingency table or numeric matrix. In
#'   this script rows are the known classes and columns the cluster ids.
#' @param startCol Index of the first column to include. Must be a single
#'   value between 1 and `ncol(confTbl)`.
#' @return A single number: sum of the per-column maxima over columns
#'   `startCol..ncol(confTbl)` divided by `sum(confTbl)`.
accuracyCalc <- function(confTbl, startCol) {
  if (length(dim(confTbl)) != 2L) {
    stop("confTbl must be a two-dimensional table or matrix", call. = FALSE)
  }
  lastCol <- ncol(confTbl)
  # `startCol:lastCol` counts downwards for an out-of-range start (and
  # startCol = 0 used to yield -Inf with only a warning), so reject it early.
  if (length(startCol) != 1L || is.na(startCol) ||
      startCol < 1 || startCol > lastCol) {
    stop("startCol must be a single column index between 1 and ncol(confTbl)",
         call. = FALSE)
  }
  cols <- seq(startCol, lastCol)
  # drop = FALSE keeps the selection two-dimensional even for one column.
  colBest <- apply(confTbl[, cols, drop = FALSE], 2, max)
  sum(colBest) / sum(confTbl)
}
# Input data for the laboratory task: the UCI Cardiotocography data set,
# see http://archive.ics.uci.edu/ml/datasets/Cardiotocography
# Each CSV is downloaded into the working directory and then read with its
# first column used as row names.

# Presumably the feature-only variant (file name says "noClass") - confirm.
download.file(
  url = "http://staff.ii.pw.edu.pl/~gprotazi/dydaktyka/dane/cardioto_noClass_corr.csv",
  destfile = "cardioto_noClass_corr.csv"
)
ctg_noClass <- read.csv(file = "cardioto_noClass_corr.csv", row.names = 1)

# Full variant; the experiments below read its CLASS column as ground truth.
download.file(
  url = "http://staff.ii.pw.edu.pl/~gprotazi/dydaktyka/dane/cardioto_all_corr.csv",
  destfile = "cardioto_all_corr.csv"
)
ctg_all <- read.csv(file = "cardioto_all_corr.csv", row.names = 1)
# Simplified example: a single k-means run with 10 clusters, scored against
# the known classes.
# NOTE(review): kmeans() documents its input as a data matrix, but here it
# receives the dist object - confirm that clustering on pairwise distances
# (rather than on ctg_noClass itself) is what the task intends.
distC <- dist(ctg_noClass)
card.kmeans <- kmeans(x = distC, centers = 10)

# Known classes in rows, assigned cluster ids in columns.
res3 <- table(ctg_all$CLASS, card.kmeans$cluster)
res3
accuracyCalc(confTbl = res3, startCol = 1)
library(fpc)
library(cluster)
# Pairwise Euclidean distances between all observations; shared by every
# clustering experiment from here on.
distance <- dist(ctg_noClass, method = "euclidean")

# Run one k-means experiment on `distance` and score it against the known
# classes. All arguments are forwarded unchanged to kmeans() (centers,
# iter.max, nstart, algorithm). Returns the accuracyCalc() value of the
# class-by-cluster table.
# NOTE(review): kmeans() documents its input as a data matrix, but it is
# given the dist object here, as in the lab's simplified example - confirm
# that clustering on pairwise distances is intended.
kmeansAccuracy <- function(...) {
  fit <- kmeans(distance, ...)
  accuracyCalc(table(ctg_all$CLASS, fit$cluster), 1)
}

# kmeans() picks its starting centres at random, so without a seed every run
# of the script prints different numbers. Seed once so the comparison below
# is repeatable. The trailing figures are the accuracies (%) noted during the
# original, unseeded runs and will not match exactly.
set.seed(42)

# ---- kmeans() default algorithm ----
kmeansAccuracy(centers = 10) #40
kmeansAccuracy(centers = 12) #41
kmeansAccuracy(centers = 8) #39
kmeansAccuracy(centers = 14) #43
kmeansAccuracy(centers = 14, iter.max = 50) #41
kmeansAccuracy(centers = 14, iter.max = 500) #43.8
kmeansAccuracy(centers = 14, iter.max = 500, nstart = 3) #44.5
kmeansAccuracy(centers = 14, iter.max = 500, nstart = 5) #42.8

#########################
# ---- Forgy algorithm ----
kmeansAccuracy(centers = 10, algorithm = "Forgy") #40
kmeansAccuracy(centers = 12, algorithm = "Forgy") #43
kmeansAccuracy(centers = 8, algorithm = "Forgy") #38
kmeansAccuracy(centers = 14, algorithm = "Forgy") #41
kmeansAccuracy(centers = 14, iter.max = 50, algorithm = "Forgy") #43
kmeansAccuracy(centers = 14, iter.max = 500, algorithm = "Forgy") #43.3
kmeansAccuracy(centers = 14, iter.max = 500, nstart = 3, algorithm = "Forgy") #43.1
kmeansAccuracy(centers = 14, iter.max = 500, nstart = 5, algorithm = "Forgy") #44.2
kmeansAccuracy(centers = 14, iter.max = 1500, algorithm = "Forgy") #44.6
########################
# Hierarchical clustering of the shared distance matrix, cut into 10
# clusters and scored against the known classes.
clusterTree <- hclust(distance)
clusters <- cutree(clusterTree, k = 10)
res <- table(ctg_all$CLASS, clusters)
accuracyCalc(res, 1) #36

# Earlier variants of this experiment all reported the same 36 because none
# of them actually changed the clustering:
#   * hclust(distance, hang = 0.5): `hang` is an argument of the plot method
#     for hclust trees, not of hclust() itself, so the call stopped with an
#     "unused argument" error and the previous tree was scored again.
#   * cutree(clusterTree, k = 10, h = 1) and h = 10: when both k and h are
#     supplied, cutree() uses k and ignores h.
# Those repeats were removed; one run is enough, since hclust() uses no
# random start and gives the same tree every time (unlike k-means).
###############
# Partitioning around medoids via fpc::pamk() on the shared distance matrix.
# Without a cluster range pamk() chooses the number of clusters itself;
# dd$nc is the number it selected.
dd <- pamk(distance)
dd$nc # == 2

# Fixed cluster counts. pamk() has no `k` argument: the original `k = ...`
# only worked through partial matching of `krange`, so it is spelled out.
dd <- pamk(distance, krange = 10)
res <- table(ctg_all$CLASS, dd$pamobject$clustering)
accuracyCalc(res, 1) #41

# NOTE(review): scaling = TRUE is documented as scaling the *variables*;
# `distance` is a dist object, so confirm this does what was intended.
dd <- pamk(distance, krange = 15, scaling = TRUE)
res <- table(ctg_all$CLASS, dd$pamobject$clustering)
accuracyCalc(res, 1) #46

# Calinski-Harabasz criterion instead of the default.
dd <- pamk(distance, krange = 15, criterion = "ch")
res <- table(ctg_all$CLASS, dd$pamobject$clustering)
accuracyCalc(res, 1) #30

# NOTE(review): usepam = FALSE switches to clara(), which the fpc manual says
# cannot be used with dissimilarity matrices - confirm this run is meaningful.
dd <- pamk(distance, krange = 15, usepam = FALSE)
res <- table(ctg_all$CLASS, dd$pamobject$clustering)
accuracyCalc(res, 1) #43

dd <- pamk(distance, krange = 15, scaling = TRUE, alpha = 0.01)
res <- table(ctg_all$CLASS, dd$pamobject$clustering)
accuracyCalc(res, 1) #45
# Best result so far: pamk with 15 clusters and scaling = TRUE (46).
###########################
# Density-based clustering with fpc::dbscan().
# `distance` is a dist object, so method = "dist" must be given; with the
# default method the input is interpreted as raw data and Euclidean
# distances are computed on it a second time.
# NOTE(review): eps = 0.4 is kept from the original run; it may be very small
# relative to unscaled Euclidean distances - re-tune now that the input is
# interpreted correctly.
test <- dbscan(distance, eps = 0.4, method = "dist")
# Per the fpc manual, cluster id 0 marks noise points; it appears as its own
# column in the table and is counted by accuracyCalc() like any cluster.
res <- table(ctg_all$CLASS, test$cluster)
accuracyCalc(res, 1) # 27 (recorded before method = "dist" was added)
# Waste of time? Tried other params, no change
###########################
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment