# Calculate K-medoids using the uncentered correlation metric
# (cosine of the angle) as a measure of distance
# 0) load data
data(mtcars)
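# note (a hedged aside): the cosine distance below is sensitive to the relative
# scale of the columns (disp and hp dominate mtcars on the raw scale); one
# option is to standardize first, e.g. mtcars_std <- as.data.frame(scale(mtcars)),
# though the rest of this script uses the raw data as-is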
# 1) create a distance matrix using the "cosine of the angle" method (aka, uncentered correlation)
# a) using the designdist() function in the vegan package
# install.packages("vegan", dep = TRUE)
library(vegan)
distMat1 <- designdist(mtcars, method = "1-J/sqrt(A*B)", terms = "quadratic",
name = "cosine.complement")
# b) using the distancematrix() function in the hopach package (Bioconductor)
# if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager")
# BiocManager::install("hopach")
library(hopach)
distMat2 <- as.matrix( distancematrix(mtcars, d = "cosangle", na.rm = TRUE) )
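# the two implementations should agree up to numerical precision, assuming
# hopach's "cosangle" uses the same 1 - cosine definition (a systematic offset
# here would indicate a rescaled variant)
range(as.matrix(distMat1) - distMat2)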
# 2) implement partitioning clustering using the distance matrix
# install.packages("cluster", dep = TRUE)
library(cluster)
clusterObject1 <- pam(distMat1, k = 5, diss = TRUE, keep.diss = TRUE)
clusterObject2 <- pam(distMat2, k = 5, diss = TRUE, keep.diss = TRUE)
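# k = 5 is assumed throughout; a common way to choose k is the average
# silhouette width (a sketch, not part of the original workflow)
sil_width <- sapply(2:10, function(k)
    pam(distMat1, k = k, diss = TRUE)$silinfo$avg.width)
plot(2:10, sil_width, type = "b", xlab = "k", ylab = "average silhouette width")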
# 3) assign cluster IDs to data
mtcars$clusters1 <- clusterObject1$clustering
mtcars$clusters2 <- clusterObject2$clustering
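# agreement check: the cross-tabulation of the two partitions should be close
# to a permutation matrix (the cluster labels themselves are arbitrary)
table(clusters1 = mtcars$clusters1, clusters2 = mtcars$clusters2)
clusterObject1$medoids # the cars chosen as cluster representatives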
# 4) view cluster IDs
options(width = 160)
mtcars
# 5) visualize clusters
# install.packages("fpc", dep = TRUE)
library(fpc)
clusplot(clusterObject1, color = TRUE, shade = TRUE, labels = 2, lines = 0, cex = 0.7,
main = "Principal Components Analysis of k-medoids Partitions")
# 6) visualize distance matrix
# a) dimension reduction, via principal components analysis
pca <- prcomp(distMat1)
pca$x[duplicated(pca$x[, 1]), 1] # show duplicate values
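# proportion of variance captured by the leading components, to gauge how
# faithful the 2-d Voronoi plots below will be
summary(pca)$importance[, 1:3]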
# b) dimension reduction, via isometric feature mapping ordination
library(vegan)
iso <- isomap(distMat1, k = 10) # may need to adjust the value of k if data are fragmented
# c) dimension reduction, via metric multidimensional scaling
mds <- cmdscale(d = distMat1, k = 2)
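# cmdscale() can also report a goodness-of-fit statistic for the 2-d embedding
# (values near 1 mean little distance information was lost)
mds_fit <- cmdscale(d = distMat1, k = 2, eig = TRUE)
mds_fit$GOF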
# d) plot the Voronoi tessellation
# install.packages("tripack", dep = TRUE)
library(tripack)
# pca
plot( voronoi.mosaic(pca$x[, 1], pca$x[, 2], duplicate = "remove") )
points(pca$x, pch = 13, col = "red")
text(pca$x, labels = rownames(pca$x), pos = 3, offset = 0.5, cex = 0.7, col = "red")
# isomap
plot( voronoi.mosaic(iso$points[, 1], iso$points[, 2], duplicate = "remove") )
points(iso$points, pch = 13, col = "red")
text(iso$points, labels = rownames(iso$points), pos = 3, offset = 0.5, cex = 0.7, col = "red")
# mds
plot( voronoi.mosaic(mds[, 1], mds[, 2], duplicate = "remove") )
points(mds, pch = 13, col = "red")
text(mds, labels = rownames(mds), pos = 3, offset = 0.5, cex = 0.7, col = "red")
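# optional: overlay the k-medoids assignment on one of the ordinations
# (a sketch; the same idea applies to the pca and isomap coordinates)
plot( voronoi.mosaic(mds[, 1], mds[, 2], duplicate = "remove") )
points(mds, pch = 19, col = clusterObject1$clustering)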
# 7) cluster using Euclidean and Mahalanobis distances
# a) Euclidean distance
# first drop the cluster ID columns appended in step 3, so they
# don't leak into the new distance calculations
carVars <- mtcars[, setdiff(names(mtcars), c("clusters1", "clusters2"))]
# either directly:
k_Euclid <- pam(carVars, 5, metric = "euclidean")
# or by computing the distance matrix separately and passing it to pam()
Euclid_mat <- daisy(carVars, metric = "euclidean")
k_Euclid2 <- pam(Euclid_mat, 5, diss = TRUE)
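# both routes should yield identical partitions, since daisy() with
# metric = "euclidean" reproduces pam()'s internal Euclidean distance
all.equal(k_Euclid$clustering, k_Euclid2$clustering, check.attributes = FALSE) # expect TRUE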
# b) Mahalanobis distance
# install.packages("HDMD", dep = TRUE)
library(HDMD)
Mahal <- pairwise.mahalanobis(carVars, grouping = rownames(carVars), cov(carVars)) # one group per row
library(cluster)
k_Mahal <- pam(Mahal$distance, k = 5, diss = TRUE, keep.diss = TRUE)
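# equivalent view (a sketch, with assumptions): with one group per row, the
# pairwise Mahalanobis distance is Euclidean distance after whitening by the
# inverse covariance; if Mahal$distance holds squared distances (worth
# verifying against the HDMD docs), pam() on the whitened data is a close
# relative of k_Mahal above, though the monotone squaring can shift medoids
W <- chol(solve(cov(carVars))) # so that t(W) %*% W = solve(cov(carVars))
k_Mahal2 <- pam(dist(as.matrix(carVars) %*% t(W)), k = 5, diss = TRUE)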