# wikii122/edami7.r

## edami7.r
############ Laboratory task ###################
# Clustering accuracy (purity) computed from a class-by-cluster contingency
# table.
#
# Every column from `startCol` to the last one is credited with its largest
# cell count; the sum of those counts is divided by the total of the whole
# table (columns before `startCol` still count in the total).
#
# confTbl:  two-dimensional table/matrix of counts; the callers in this script
#           put classes in rows and clusters in columns.
# startCol: index of the first column to score, between 1 and ncol(confTbl).
# Returns a single number; NaN when the table sums to zero.
accuracyCalc <- function(confTbl, startCol) {
  nCols <- ncol(confTbl)
  if (is.null(nCols) || is.na(nCols) || nCols < 1 || nrow(confTbl) < 1) {
    stop("confTbl must be a two-dimensional table with at least one row and one column",
         call. = FALSE)
  }
  # Without this guard startCol = 0 silently produces -Inf, and a start column
  # past the table makes `startCol:nCols` count backwards.
  if (!is.numeric(startCol) || length(startCol) != 1 || is.na(startCol) ||
      startCol < 1 || startCol > nCols) {
    stop("startCol must be a single column index between 1 and ncol(confTbl)",
         call. = FALSE)
  }
  # drop = FALSE keeps a matrix even when only one column is scored.
  scored <- confTbl[, startCol:nCols, drop = FALSE]
  corr <- sum(apply(scored, 2, max))
  corr / sum(confTbl)
}

# Data set for the laboratory task (Cardiotocography):
# http://archive.ics.uci.edu/ml/datasets/Cardiotocography

# Download the first CSV into the working directory and read it, taking the
# row names from the first column. Going by the file name this is presumably
# the variant without the class labels.
download.file(
  url = "http://staff.ii.pw.edu.pl/~gprotazi/dydaktyka/dane/cardioto_noClass_corr.csv",
  destfile = "cardioto_noClass_corr.csv"
)
ctg_noClass <- read.csv(file = "cardioto_noClass_corr.csv", row.names = 1)

# Same for the second CSV; its CLASS column is used below as the reference
# labelling when scoring the clusterings.
download.file(
  url = "http://staff.ii.pw.edu.pl/~gprotazi/dydaktyka/dane/cardioto_all_corr.csv",
  destfile = "cardioto_all_corr.csv"
)
ctg_all <- read.csv(file = "cardioto_all_corr.csv", row.names = 1)


# Simplified example: k-means with 10 centers, cross-tabulated against the
# known classes and scored with accuracyCalc().
# NOTE(review): kmeans() is handed a dist object here; presumably it is
# coerced to the full square distance matrix and its rows are clustered
# rather than the raw observations -- confirm this is intended.
distC <- dist(ctg_noClass)
card.kmeans <- kmeans(x = distC, centers = 10)
res3 <- table(ctg_all$CLASS, card.kmeans$cluster)
res3
accuracyCalc(res3, 1)

library(fpc)
library(cluster)

# k-means with the default algorithm, varying the number of centers, the
# iteration cap and the number of random starts. The trailing comments are the
# accuracies the author recorded (presumably in percent); no RNG seed is set
# in this section, so repeated runs can give different values.
# NOTE(review): kmeans() is handed a dist object; presumably it is coerced to
# the full square distance matrix and clustered row-wise -- confirm intent.
distance <- dist(ctg_noClass, method = "euclidean")

# Number of centers: 10, 12, 8, 14.
fit <- kmeans(x = distance, centers = 10)
res <- table(ctg_all$CLASS, fit$cluster)
accuracyCalc(res, 1) # 40

fit <- kmeans(x = distance, centers = 12)
res <- table(ctg_all$CLASS, fit$cluster)
accuracyCalc(res, 1) # 41

fit <- kmeans(x = distance, centers = 8)
res <- table(ctg_all$CLASS, fit$cluster)
accuracyCalc(res, 1) # 39

fit <- kmeans(x = distance, centers = 14)
res <- table(ctg_all$CLASS, fit$cluster)
accuracyCalc(res, 1) # 43

# 14 centers with a larger iteration cap.
fit <- kmeans(x = distance, centers = 14, iter.max = 50)
res <- table(ctg_all$CLASS, fit$cluster)
accuracyCalc(res, 1) # 41

fit <- kmeans(x = distance, centers = 14, iter.max = 500)
res <- table(ctg_all$CLASS, fit$cluster)
accuracyCalc(res, 1) # 43.8

# 14 centers, 500 iterations, several random starts.
fit <- kmeans(x = distance, centers = 14, iter.max = 500, nstart = 3)
res <- table(ctg_all$CLASS, fit$cluster)
accuracyCalc(res, 1) # 44.5

fit <- kmeans(x = distance, centers = 14, iter.max = 500, nstart = 5)
res <- table(ctg_all$CLASS, fit$cluster)
accuracyCalc(res, 1) # 42.8

#########################

# k-means with algorithm = "Forgy", varying the number of centers, the
# iteration cap and the number of random starts. The trailing comments are the
# accuracies the author recorded (presumably in percent); no RNG seed is set
# here, so repeated runs can give different values.

# Number of centers: 10, 12, 8, 14.
fit <- kmeans(x = distance, centers = 10, algorithm = "Forgy")
res <- table(ctg_all$CLASS, fit$cluster)
accuracyCalc(res, 1) # 40

fit <- kmeans(x = distance, centers = 12, algorithm = "Forgy")
res <- table(ctg_all$CLASS, fit$cluster)
accuracyCalc(res, 1) # 43

fit <- kmeans(x = distance, centers = 8, algorithm = "Forgy")
res <- table(ctg_all$CLASS, fit$cluster)
accuracyCalc(res, 1) # 38

fit <- kmeans(x = distance, centers = 14, algorithm = "Forgy")
res <- table(ctg_all$CLASS, fit$cluster)
accuracyCalc(res, 1) # 41

# 14 centers with a larger iteration cap.
fit <- kmeans(x = distance, centers = 14, iter.max = 50, algorithm = "Forgy")
res <- table(ctg_all$CLASS, fit$cluster)
accuracyCalc(res, 1) # 43

fit <- kmeans(x = distance, centers = 14, iter.max = 500, algorithm = "Forgy")
res <- table(ctg_all$CLASS, fit$cluster)
accuracyCalc(res, 1) # 43.3

# 14 centers, 500 iterations, several random starts.
fit <- kmeans(x = distance, centers = 14, iter.max = 500, nstart = 3,
              algorithm = "Forgy")
res <- table(ctg_all$CLASS, fit$cluster)
accuracyCalc(res, 1) # 43.1

fit <- kmeans(x = distance, centers = 14, iter.max = 500, nstart = 5,
              algorithm = "Forgy")
res <- table(ctg_all$CLASS, fit$cluster)
accuracyCalc(res, 1) # 44.2

# 14 centers, single start, very large iteration cap.
fit <- kmeans(x = distance, centers = 14, iter.max = 1500, algorithm = "Forgy")
res <- table(ctg_all$CLASS, fit$cluster)
accuracyCalc(res, 1) # 44.6

########################

# Agglomerative hierarchical clustering with hclust()'s default linkage, cut
# into 10 clusters and scored against the known classes.
#
# This section used to repeat the run four times with "different" parameters
# that never changed the clustering:
#   * hclust(distance, hang = 0.5): `hang` is an argument of plot.hclust(),
#     not of hclust(), so the call stopped with "unused argument" (aborting a
#     source() run) and the previously built tree was scored again.
#   * cutree(clusterTree, k = 10, h = 1) and h = 10: cutree() lets `k`
#     override `h` when both are supplied, so `h` had no effect.
# All four runs therefore reported the same accuracy; only the configuration
# that actually ran is kept. To explore real alternatives, vary hclust()'s
# `method` (linkage) or cut by `h` alone.
clusterTree <- hclust(distance)
clusters <- cutree(clusterTree, k = 10)
res <- table(ctg_all$CLASS, clusters)
accuracyCalc(res, 1) # 36

# Unlike the k-means runs, this result is deterministic.

###############
# Partitioning around medoids via pamk() (presumably the one from fpc, loaded
# above). The trailing comments are the accuracies the author recorded.
# NOTE(review): `k` is not one of pamk()'s documented arguments (the
# documented cluster-count argument is `krange`); presumably it is forwarded
# through `...` to the underlying clustering call -- confirm it does what is
# intended.

# Let pamk() choose the number of clusters itself.
dd <- pamk(distance)
dd$nc # == 2

dd <- pamk(distance, k = 10)
res <- table(ctg_all$CLASS, dd$pamobject$clustering)
accuracyCalc(res, 1) # 41

dd <- pamk(distance, k = 15, scaling = TRUE)
res <- table(ctg_all$CLASS, dd$pamobject$clustering)
accuracyCalc(res, 1) # 46

dd <- pamk(distance, k = 15, criterion = "ch")
res <- table(ctg_all$CLASS, dd$pamobject$clustering)
accuracyCalc(res, 1) # 30

dd <- pamk(distance, k = 15, usepam = FALSE)
res <- table(ctg_all$CLASS, dd$pamobject$clustering)
accuracyCalc(res, 1) # 43

dd <- pamk(distance, k = 15, scaling = TRUE, alpha = 0.01)
res <- table(ctg_all$CLASS, dd$pamobject$clustering)
accuracyCalc(res, 1) # 45
# The scaled pamk runs (45-46) are the best accuracies recorded so far.


###########################
# Density-based clustering with dbscan() (fpc is loaded above).
# `distance` is a dist object, so method = "dist" is passed to have it
# interpreted as dissimilarities; fpc's default method ("hybrid") treats the
# input as raw data, which is not what a dist object holds.
# NOTE(review): fpc's dbscan labels noise points as cluster 0; when present
# that is the first table column and is scored like a cluster here -- confirm
# whether that is wanted.
test <- dbscan(distance, eps = 0.4, method = "dist")
res <- table(ctg_all$CLASS, test$cluster)
accuracyCalc(res, 1) # 27 was recorded before method = "dist" was set; re-run
# Author's note: waste of time? Other parameter values were tried without any
# change in the result.

###########################
	############ Laboratory task ###################
	# Clustering accuracy (purity) computed from a class-by-cluster contingency
	# table.
	# NOTE(review): accuracyCalc is also defined earlier in this file; this
	# definition replaces it when the script is run top to bottom.
	#
	# Every column from `startCol` to the last one is credited with its largest
	# cell count; the sum of those counts is divided by the total of the whole
	# table (columns before `startCol` still count in the total).
	#
	# confTbl:  two-dimensional table/matrix of counts; the callers in this
	#           script put classes in rows and clusters in columns.
	# startCol: index of the first column to score, between 1 and
	#           ncol(confTbl).
	# Returns a single number; NaN when the table sums to zero.
	accuracyCalc <- function(confTbl, startCol) {
	  nCols <- ncol(confTbl)
	  if (is.null(nCols) || is.na(nCols) || nCols < 1 || nrow(confTbl) < 1) {
	    stop("confTbl must be a two-dimensional table with at least one row and one column",
	         call. = FALSE)
	  }
	  # Without this guard startCol = 0 silently produces -Inf, and a start
	  # column past the table makes `startCol:nCols` count backwards.
	  if (!is.numeric(startCol) || length(startCol) != 1 || is.na(startCol) ||
	      startCol < 1 || startCol > nCols) {
	    stop("startCol must be a single column index between 1 and ncol(confTbl)",
	         call. = FALSE)
	  }
	  # drop = FALSE keeps a matrix even when only one column is scored.
	  scored <- confTbl[, startCol:nCols, drop = FALSE]
	  corr <- sum(apply(scored, 2, max))
	  corr / sum(confTbl)
	}

	# Data set for the laboratory task (Cardiotocography):
	# http://archive.ics.uci.edu/ml/datasets/Cardiotocography

	# Download the first CSV into the working directory and read it, taking the
	# row names from the first column. Going by the file name this is presumably
	# the variant without the class labels.
	download.file(
	  url = "http://staff.ii.pw.edu.pl/~gprotazi/dydaktyka/dane/cardioto_noClass_corr.csv",
	  destfile = "cardioto_noClass_corr.csv"
	)
	ctg_noClass <- read.csv(file = "cardioto_noClass_corr.csv", row.names = 1)

	# Same for the second CSV; its CLASS column is used below as the reference
	# labelling when scoring the clusterings.
	download.file(
	  url = "http://staff.ii.pw.edu.pl/~gprotazi/dydaktyka/dane/cardioto_all_corr.csv",
	  destfile = "cardioto_all_corr.csv"
	)
	ctg_all <- read.csv(file = "cardioto_all_corr.csv", row.names = 1)


	# Simplified example: k-means with 10 centers, cross-tabulated against the
	# known classes and scored with accuracyCalc().
	# NOTE(review): kmeans() is handed a dist object here; presumably it is
	# coerced to the full square distance matrix and its rows are clustered
	# rather than the raw observations -- confirm this is intended.
	distC <- dist(ctg_noClass)
	card.kmeans <- kmeans(x = distC, centers = 10)
	res3 <- table(ctg_all$CLASS, card.kmeans$cluster)
	res3
	accuracyCalc(res3, 1)

	library(fpc)
	library(cluster)

	# k-means with the default algorithm, varying the number of centers, the
	# iteration cap and the number of random starts. The trailing comments are
	# the accuracies the author recorded (presumably in percent); no RNG seed is
	# set in this section, so repeated runs can give different values.
	# NOTE(review): kmeans() is handed a dist object; presumably it is coerced
	# to the full square distance matrix and clustered row-wise -- confirm.
	distance <- dist(ctg_noClass, method = "euclidean")

	# Number of centers: 10, 12, 8, 14.
	fit <- kmeans(x = distance, centers = 10)
	res <- table(ctg_all$CLASS, fit$cluster)
	accuracyCalc(res, 1) # 40

	fit <- kmeans(x = distance, centers = 12)
	res <- table(ctg_all$CLASS, fit$cluster)
	accuracyCalc(res, 1) # 41

	fit <- kmeans(x = distance, centers = 8)
	res <- table(ctg_all$CLASS, fit$cluster)
	accuracyCalc(res, 1) # 39

	fit <- kmeans(x = distance, centers = 14)
	res <- table(ctg_all$CLASS, fit$cluster)
	accuracyCalc(res, 1) # 43

	# 14 centers with a larger iteration cap.
	fit <- kmeans(x = distance, centers = 14, iter.max = 50)
	res <- table(ctg_all$CLASS, fit$cluster)
	accuracyCalc(res, 1) # 41

	fit <- kmeans(x = distance, centers = 14, iter.max = 500)
	res <- table(ctg_all$CLASS, fit$cluster)
	accuracyCalc(res, 1) # 43.8

	# 14 centers, 500 iterations, several random starts.
	fit <- kmeans(x = distance, centers = 14, iter.max = 500, nstart = 3)
	res <- table(ctg_all$CLASS, fit$cluster)
	accuracyCalc(res, 1) # 44.5

	fit <- kmeans(x = distance, centers = 14, iter.max = 500, nstart = 5)
	res <- table(ctg_all$CLASS, fit$cluster)
	accuracyCalc(res, 1) # 42.8

	#########################

	# k-means with algorithm = "Forgy", varying the number of centers, the
	# iteration cap and the number of random starts. The trailing comments are
	# the accuracies the author recorded (presumably in percent); no RNG seed is
	# set here, so repeated runs can give different values.

	# Number of centers: 10, 12, 8, 14.
	fit <- kmeans(x = distance, centers = 10, algorithm = "Forgy")
	res <- table(ctg_all$CLASS, fit$cluster)
	accuracyCalc(res, 1) # 40

	fit <- kmeans(x = distance, centers = 12, algorithm = "Forgy")
	res <- table(ctg_all$CLASS, fit$cluster)
	accuracyCalc(res, 1) # 43

	fit <- kmeans(x = distance, centers = 8, algorithm = "Forgy")
	res <- table(ctg_all$CLASS, fit$cluster)
	accuracyCalc(res, 1) # 38

	fit <- kmeans(x = distance, centers = 14, algorithm = "Forgy")
	res <- table(ctg_all$CLASS, fit$cluster)
	accuracyCalc(res, 1) # 41

	# 14 centers with a larger iteration cap.
	fit <- kmeans(x = distance, centers = 14, iter.max = 50, algorithm = "Forgy")
	res <- table(ctg_all$CLASS, fit$cluster)
	accuracyCalc(res, 1) # 43

	fit <- kmeans(x = distance, centers = 14, iter.max = 500, algorithm = "Forgy")
	res <- table(ctg_all$CLASS, fit$cluster)
	accuracyCalc(res, 1) # 43.3

	# 14 centers, 500 iterations, several random starts.
	fit <- kmeans(x = distance, centers = 14, iter.max = 500, nstart = 3,
	              algorithm = "Forgy")
	res <- table(ctg_all$CLASS, fit$cluster)
	accuracyCalc(res, 1) # 43.1

	fit <- kmeans(x = distance, centers = 14, iter.max = 500, nstart = 5,
	              algorithm = "Forgy")
	res <- table(ctg_all$CLASS, fit$cluster)
	accuracyCalc(res, 1) # 44.2

	# 14 centers, single start, very large iteration cap.
	fit <- kmeans(x = distance, centers = 14, iter.max = 1500, algorithm = "Forgy")
	res <- table(ctg_all$CLASS, fit$cluster)
	accuracyCalc(res, 1) # 44.6

	########################

	# Agglomerative hierarchical clustering with hclust()'s default linkage, cut
	# into 10 clusters and scored against the known classes.
	#
	# This section used to repeat the run four times with "different" parameters
	# that never changed the clustering:
	#   * hclust(distance, hang = 0.5): `hang` is an argument of plot.hclust(),
	#     not of hclust(), so the call stopped with "unused argument" (aborting
	#     a source() run) and the previously built tree was scored again.
	#   * cutree(clusterTree, k = 10, h = 1) and h = 10: cutree() lets `k`
	#     override `h` when both are supplied, so `h` had no effect.
	# All four runs therefore reported the same accuracy; only the configuration
	# that actually ran is kept. To explore real alternatives, vary hclust()'s
	# `method` (linkage) or cut by `h` alone.
	clusterTree <- hclust(distance)
	clusters <- cutree(clusterTree, k = 10)
	res <- table(ctg_all$CLASS, clusters)
	accuracyCalc(res, 1) # 36

	# Unlike the k-means runs, this result is deterministic.

	###############
	# Partitioning around medoids via pamk() (presumably the one from fpc,
	# loaded above). The trailing comments are the accuracies the author
	# recorded.
	# NOTE(review): `k` is not one of pamk()'s documented arguments (the
	# documented cluster-count argument is `krange`); presumably it is forwarded
	# through `...` to the underlying clustering call -- confirm it does what is
	# intended.

	# Let pamk() choose the number of clusters itself.
	dd <- pamk(distance)
	dd$nc # == 2

	dd <- pamk(distance, k = 10)
	res <- table(ctg_all$CLASS, dd$pamobject$clustering)
	accuracyCalc(res, 1) # 41

	dd <- pamk(distance, k = 15, scaling = TRUE)
	res <- table(ctg_all$CLASS, dd$pamobject$clustering)
	accuracyCalc(res, 1) # 46

	dd <- pamk(distance, k = 15, criterion = "ch")
	res <- table(ctg_all$CLASS, dd$pamobject$clustering)
	accuracyCalc(res, 1) # 30

	dd <- pamk(distance, k = 15, usepam = FALSE)
	res <- table(ctg_all$CLASS, dd$pamobject$clustering)
	accuracyCalc(res, 1) # 43

	dd <- pamk(distance, k = 15, scaling = TRUE, alpha = 0.01)
	res <- table(ctg_all$CLASS, dd$pamobject$clustering)
	accuracyCalc(res, 1) # 45
	# The scaled pamk runs (45-46) are the best accuracies recorded so far.


	###########################
	# Density-based clustering with dbscan() (fpc is loaded above).
	# `distance` is a dist object, so method = "dist" is passed to have it
	# interpreted as dissimilarities; fpc's default method ("hybrid") treats the
	# input as raw data, which is not what a dist object holds.
	# NOTE(review): fpc's dbscan labels noise points as cluster 0; when present
	# that is the first table column and is scored like a cluster here --
	# confirm whether that is wanted.
	test <- dbscan(distance, eps = 0.4, method = "dist")
	res <- table(ctg_all$CLASS, test$cluster)
	accuracyCalc(res, 1) # 27 was recorded before method = "dist" was set; re-run
	# Author's note: waste of time? Other parameter values were tried without
	# any change in the result.

	###########################