sjackman/stat540-seminar9.R

## stat540-seminar9.R
# STAT 540 Seminar 9
# Shaun Jackman

# Data Preparation
library('RColorBrewer')
library('car')
library('lattice')
library('limma')
library('pvclust')
library('xtable')
prDat <- read.table("../data/photoRec/GSE4051_data.txt")  # the whole enchilada
load("../data/photoRec/GSE4051_design.robj")  # load exp design as 'prDes'
sprDat <- t(scale(t(prDat)))

# Sample Clustering
# Hierarchical clustering for photoRec data
# compute pairwise distances
pr.dis = dist(t(sprDat), method = "euclidean")
pr.nms = with(prDes, paste(gType, devStage, sep = "_"))
# compute hierarchical clustering using different linkage types
pr.hc.s <- hclust(pr.dis, method = "single")
pr.hc.c <- hclust(pr.dis, method = "complete")
pr.hc.a <- hclust(pr.dis, method = "average")
pr.hc.w <- hclust(pr.dis, method = "ward")
# plot them
par(mar = c(0, 4, 4, 2))
par(mfrow = c(2, 2))
plot(pr.hc.s, labels = FALSE, main = "Single", xlab = "")
plot(pr.hc.c, labels = FALSE, main = "Complete", xlab = "")
plot(pr.hc.a, labels = FALSE, main = "Average", xlab = "")
plot(pr.hc.w, labels = FALSE, main = "Ward", xlab = "")
# identify 8 clusters
par(mfrow = c(1, 1))
plot(pr.hc.w, labels = pr.nms, cex = 0.6, main = "Ward showing 10 clusters")
rect.hclust(pr.hc.w, k = 10)
# Exercise: Play with the options of the heatmap function and compare the different heatmaps. Note that one can also use the original data prDat and set the option `scale="row". You will get the same heatmaps although the columns may be ordered diffently (use Colv=NA to suppress reordering).
heatmap(as.matrix(sprDat), Rowv = NA, Colv = NULL,
	hclustfun = function(x) hclust(x, method = "ward"),
	distfun = function(x) dist(x, method = "euclidean"),
	scale = "none", labCol = pr.nms, labRow = NA, margin = c(8, 1),
	ColSideColor = brewer.pal(11, "RdGy")[c(4, 7)][unclass(prDes$gType)])

# Partitioning methods for photoRec data
# K-means clustering
# Objects in columns
set.seed(31)
samplecenters <- 5
pr.km <- kmeans(t(sprDat), centers = samplecenters, nstart = 50)
# We can look at the within sum of squares of each cluster
pr.km$withinss
# We can look at the composition of each cluster
pr.kmTable = data.frame(devStage = prDes$devStage, cluster = pr.km$cluster)
prTable = xtable(with(pr.kmTable, table(devStage, cluster)), caption = "Number of samples from each develomental stage within each cluster")
align(prTable) <- "lccccc"
print(prTable, type = "html", caption.placement = "top")
# Repeate the analysis using a different seed and check if you get the same clusters.

# PAM algorithm
pr.pam <- pam(pr.dis, k = samplecenters)
split(as.vector(unlist(prDes$devStage)), pr.pam$clustering)
summary(pr.pam)
par(mar = c(5, 1, 4, 4))
plot(pr.pam, main = "Silhouette Plot for 5 clusters")
# Excersise: draw a plot with number of clusters in the x-axis and the average silhouette widths in the y-axis. Use the information obtained to determine if 5 was the best choice for the number of clusters.
averageSilhouetteWidth <- function(k) {
	pr.km <- kmeans(t(sprDat), centers = k, nstart = 50)
	pr.pam <- pam(pr.dis, k = k)
	return(pr.pam$silinfo$avg.width)
}
k <- 2:10
avg.width <- unlist(apply(as.matrix(k), 1, averageSilhouetteWidth))
xyplot(avg.width ~ k, data.frame(k=k, avg.width=avg.width),
	main='Average silhouette width vs number of clusters')
cat('The optimal number of clusters is', k[which.min(avg.width)])

# Gene clustering
# A smaller dataset
# We start by using different clustering algorithms to cluster the top 972 genes that showed differential expression across the different develomental stage (BH adjusted pvalue < 10-5).
mm <- model.matrix(~devStage, prDes)
fit <- eBayes(lmFit(prDat, mm))
tt <- topTable(fit, n=Inf, sort.by='none',
	coef = grep('devStage', colnames(mm)))
cat('Number of probes with an adjusted p-value < 1e-5:',
	sum(tt$adj.P.Val < 1e-5))
topDat <- sprDat[tt$adj.P.Val < 1e-5,]

# Hierarchical:
geneC.dis = dist(topDat, method = "euclidean")
geneC.hc.a <- hclust(geneC.dis, method = "average")
plot(geneC.hc.a, labels = FALSE, main = "Hierarchical with Average Linkage",
	xlab = "")

# Partitioning
set.seed(1234)
genecenters <- 5
kmeans.genes <- kmeans(topDat, centers = genecenters)
clusterNum <- 1
plot(kmeans.genes$centers[clusterNum, ], ylim = c(-4, 4), type = "n", xlab = "Samples",
	ylab = "Relative expression")
matlines(y = t(topDat[kmeans.genes$cluster == clusterNum, ]), col = "grey")
points(kmeans.genes$centers[clusterNum, ], type = "l")
points(kmeans.genes$centers[clusterNum, ], col = prDes$devStage, pch = 20)

# Or, probably more commonly used, we can see both dendograms using heatmaps (through hierarchical clustering):
heatmap(as.matrix(topDat), Rowv = NULL, Colv = NULL, hclustfun = function(x) hclust(x,
	method = "average"), distfun = function(x) dist(x, method = "euclidean"),
	labCol = pr.nms, labRow = NA, margin = c(8, 1), scale = "none", ColSideColor = brewer.pal(11,
		"RdGy")[c(2, 4, 7, 9, 11)][unclass(prDes$devStage)])

# Redefining the attributes
oDat <- with(prDes, data.frame(sample, devStage, gType, probeset = factor(rep(rownames(topDat),
	each = ncol(topDat))), geneExp = as.vector(unlist(t(topDat)))))
devStageAvg = with(oDat, tapply(geneExp, list(probeset, devStage), mean))
heatmap(as.matrix(devStageAvg), Rowv = NULL, Colv = NA, hclustfun = function(x) hclust(x,
	method = "average"), distfun = function(x) dist(x, method = "euclidean"),
	labCol = colnames(devStageAvg), labRow = NA, margin = c(8, 1))
geneDS.km <- kmeans(devStageAvg, centers = 4, nstart = 50)
# Look at all clusters
par(mfrow = c(2, 2))
for (clusterNum in 1:4) {
    # Set up the axes without plotting; ylim set based on trial run.
    plot(geneDS.km$centers[clusterNum, ], ylim = c(-4, 4), type = "n", xlab = "Develomental Stage",
        ylab = "Relative expression", axes = F, main = paste("Cluster", clusterNum,
            sep = " "))
    axis(2)
    axis(1, 1:5, c(levels(prDes$devStage)[1:4], "4W"), cex.axis = 0.9)
    # Plot the expression of all the genes in the selected cluster in grey.
    matlines(y = t(devStageAvg[geneDS.km$cluster == clusterNum, ]), col = "grey")
    # Add the cluster center. This is last so it isn't underneath the members
    points(geneDS.km$centers[clusterNum, ], type = "l")
    # Optional: points to show development stages.
    points(geneDS.km$centers[clusterNum, ], pch = 20)
}
par(mfrow = c(1, 1))
plot(geneDS.km$centers[clusterNum, ], ylim = c(-4, 4), type = "n", xlab = "Develomental Stage",
    ylab = "Average expression", axes = F, main = "Clusters centers")
axis(2)
axis(1, 1:5, c(levels(prDes$devStage)[1:4], "4W"), cex.axis = 0.9)
for (clusterNum in 1:4) {
    points(geneDS.km$centers[clusterNum, ], type = "l", col = clusterNum, lwd = 2)
    points(geneDS.km$centers[clusterNum, ], col = clusterNum, pch = 20)
}
cloud(devStageAvg[, "E16"] ~ devStageAvg[, "P6"] * devStageAvg[, "4_weeks"],
	col = geneDS.km$clust, xlab = "E16", ylab = "P6", zlab = "4_weeks")

# Statistical measures to evaluate clusters
pvc <- pvclust(topDat, nboot = 100)
plot(pvc, labels = pr.nms, cex = 0.6)
pvrect(pvc, alpha = 0.95)

# PCA (principal components analysis)
pcs <- prcomp(sprDat, center = F, scale = F)
plot(pcs)
# append the rotations for the first 10 PCs to the phenodata
prDes <- cbind(prDes, pcs$rotation[prDes$sample, 1:10])
# scatter plot showing us how the first few PCs relate to covariates
plot(prDes[, 1:6], pch = 19, cex = 0.8)
# plot data on first two PCs, colored by development stage
plot(prDes[, c("PC1", "PC2")], bg = prDes$devStage, pch = 21, cex = 1.5)
legend(list(x = 0.2, y = 0.3), as.character(levels(prDes$devStage)), pch = 21,
	pt.bg = c(1, 2, 3, 4, 5))
	# STAT 540 Seminar 9
	# Shaun Jackman

	# Data Preparation
	library('RColorBrewer')
	library('car')
	library('lattice')
	library('limma')
	library('pvclust')
	library('xtable')
	prDat <- read.table("../data/photoRec/GSE4051_data.txt") # the whole enchilada
	load("../data/photoRec/GSE4051_design.robj") # load exp design as 'prDes'
	sprDat <- t(scale(t(prDat)))

	# Sample Clustering
	# Hierarchical clustering for photoRec data
	# compute pairwise distances
	pr.dis = dist(t(sprDat), method = "euclidean")
	pr.nms = with(prDes, paste(gType, devStage, sep = "_"))
	# compute hierarchical clustering using different linkage types
	pr.hc.s <- hclust(pr.dis, method = "single")
	pr.hc.c <- hclust(pr.dis, method = "complete")
	pr.hc.a <- hclust(pr.dis, method = "average")
	pr.hc.w <- hclust(pr.dis, method = "ward")
	# plot them
	par(mar = c(0, 4, 4, 2))
	par(mfrow = c(2, 2))
	plot(pr.hc.s, labels = FALSE, main = "Single", xlab = "")
	plot(pr.hc.c, labels = FALSE, main = "Complete", xlab = "")
	plot(pr.hc.a, labels = FALSE, main = "Average", xlab = "")
	plot(pr.hc.w, labels = FALSE, main = "Ward", xlab = "")
	# identify 8 clusters
	par(mfrow = c(1, 1))
	plot(pr.hc.w, labels = pr.nms, cex = 0.6, main = "Ward showing 10 clusters")
	rect.hclust(pr.hc.w, k = 10)
	# Exercise: Play with the options of the heatmap function and compare the different heatmaps. Note that one can also use the original data prDat and set the option `scale="row". You will get the same heatmaps although the columns may be ordered diffently (use Colv=NA to suppress reordering).
	heatmap(as.matrix(sprDat), Rowv = NA, Colv = NULL,
	hclustfun = function(x) hclust(x, method = "ward"),
	distfun = function(x) dist(x, method = "euclidean"),
	scale = "none", labCol = pr.nms, labRow = NA, margin = c(8, 1),
	ColSideColor = brewer.pal(11, "RdGy")[c(4, 7)][unclass(prDes$gType)])

	# Partitioning methods for photoRec data
	# K-means clustering
	# Objects in columns
	set.seed(31)
	samplecenters <- 5
	pr.km <- kmeans(t(sprDat), centers = samplecenters, nstart = 50)
	# We can look at the within sum of squares of each cluster
	pr.km$withinss
	# We can look at the composition of each cluster
	pr.kmTable = data.frame(devStage = prDes$devStage, cluster = pr.km$cluster)
	prTable = xtable(with(pr.kmTable, table(devStage, cluster)), caption = "Number of samples from each develomental stage within each cluster")
	align(prTable) <- "lccccc"
	print(prTable, type = "html", caption.placement = "top")
	# Repeate the analysis using a different seed and check if you get the same clusters.

	# PAM algorithm
	pr.pam <- pam(pr.dis, k = samplecenters)
	split(as.vector(unlist(prDes$devStage)), pr.pam$clustering)
	summary(pr.pam)
	par(mar = c(5, 1, 4, 4))
	plot(pr.pam, main = "Silhouette Plot for 5 clusters")
	# Excersise: draw a plot with number of clusters in the x-axis and the average silhouette widths in the y-axis. Use the information obtained to determine if 5 was the best choice for the number of clusters.
	averageSilhouetteWidth <- function(k) {
	pr.km <- kmeans(t(sprDat), centers = k, nstart = 50)
	pr.pam <- pam(pr.dis, k = k)
	return(pr.pam$silinfo$avg.width)
	}
	k <- 2:10
	avg.width <- unlist(apply(as.matrix(k), 1, averageSilhouetteWidth))
	xyplot(avg.width ~ k, data.frame(k=k, avg.width=avg.width),
	main='Average silhouette width vs number of clusters')
	cat('The optimal number of clusters is', k[which.min(avg.width)])

	# Gene clustering
	# A smaller dataset
	# We start by using different clustering algorithms to cluster the top 972 genes that showed differential expression across the different develomental stage (BH adjusted pvalue < 10-5).
	mm <- model.matrix(~devStage, prDes)
	fit <- eBayes(lmFit(prDat, mm))
	tt <- topTable(fit, n=Inf, sort.by='none',
	coef = grep('devStage', colnames(mm)))
	cat('Number of probes with an adjusted p-value < 1e-5:',
	sum(tt$adj.P.Val < 1e-5))
	topDat <- sprDat[tt$adj.P.Val < 1e-5,]

	# Hierarchical:
	geneC.dis = dist(topDat, method = "euclidean")
	geneC.hc.a <- hclust(geneC.dis, method = "average")
	plot(geneC.hc.a, labels = FALSE, main = "Hierarchical with Average Linkage",
	xlab = "")

	# Partitioning
	set.seed(1234)
	genecenters <- 5
	kmeans.genes <- kmeans(topDat, centers = genecenters)
	clusterNum <- 1
	plot(kmeans.genes$centers[clusterNum, ], ylim = c(-4, 4), type = "n", xlab = "Samples",
	ylab = "Relative expression")
	matlines(y = t(topDat[kmeans.genes$cluster == clusterNum, ]), col = "grey")
	points(kmeans.genes$centers[clusterNum, ], type = "l")
	points(kmeans.genes$centers[clusterNum, ], col = prDes$devStage, pch = 20)

	# Or, probably more commonly used, we can see both dendograms using heatmaps (through hierarchical clustering):
	heatmap(as.matrix(topDat), Rowv = NULL, Colv = NULL, hclustfun = function(x) hclust(x,
	method = "average"), distfun = function(x) dist(x, method = "euclidean"),
	labCol = pr.nms, labRow = NA, margin = c(8, 1), scale = "none", ColSideColor = brewer.pal(11,
	"RdGy")[c(2, 4, 7, 9, 11)][unclass(prDes$devStage)])

	# Redefining the attributes
	oDat <- with(prDes, data.frame(sample, devStage, gType, probeset = factor(rep(rownames(topDat),
	each = ncol(topDat))), geneExp = as.vector(unlist(t(topDat)))))
	devStageAvg = with(oDat, tapply(geneExp, list(probeset, devStage), mean))
	heatmap(as.matrix(devStageAvg), Rowv = NULL, Colv = NA, hclustfun = function(x) hclust(x,
	method = "average"), distfun = function(x) dist(x, method = "euclidean"),
	labCol = colnames(devStageAvg), labRow = NA, margin = c(8, 1))
	geneDS.km <- kmeans(devStageAvg, centers = 4, nstart = 50)
	# Look at all clusters
	par(mfrow = c(2, 2))
	for (clusterNum in 1:4) {
	# Set up the axes without plotting; ylim set based on trial run.
	plot(geneDS.km$centers[clusterNum, ], ylim = c(-4, 4), type = "n", xlab = "Develomental Stage",
	ylab = "Relative expression", axes = F, main = paste("Cluster", clusterNum,
	sep = " "))
	axis(2)
	axis(1, 1:5, c(levels(prDes$devStage)[1:4], "4W"), cex.axis = 0.9)
	# Plot the expression of all the genes in the selected cluster in grey.
	matlines(y = t(devStageAvg[geneDS.km$cluster == clusterNum, ]), col = "grey")
	# Add the cluster center. This is last so it isn't underneath the members
	points(geneDS.km$centers[clusterNum, ], type = "l")
	# Optional: points to show development stages.
	points(geneDS.km$centers[clusterNum, ], pch = 20)
	}
	par(mfrow = c(1, 1))
	plot(geneDS.km$centers[clusterNum, ], ylim = c(-4, 4), type = "n", xlab = "Develomental Stage",
	ylab = "Average expression", axes = F, main = "Clusters centers")
	axis(2)
	axis(1, 1:5, c(levels(prDes$devStage)[1:4], "4W"), cex.axis = 0.9)
	for (clusterNum in 1:4) {
	points(geneDS.km$centers[clusterNum, ], type = "l", col = clusterNum, lwd = 2)
	points(geneDS.km$centers[clusterNum, ], col = clusterNum, pch = 20)
	}
	cloud(devStageAvg[, "E16"] ~ devStageAvg[, "P6"] * devStageAvg[, "4_weeks"],
	col = geneDS.km$clust, xlab = "E16", ylab = "P6", zlab = "4_weeks")

	# Statistical measures to evaluate clusters
	pvc <- pvclust(topDat, nboot = 100)
	plot(pvc, labels = pr.nms, cex = 0.6)
	pvrect(pvc, alpha = 0.95)

	# PCA (principal components analysis)
	pcs <- prcomp(sprDat, center = F, scale = F)
	plot(pcs)
	# append the rotations for the first 10 PCs to the phenodata
	prDes <- cbind(prDes, pcs$rotation[prDes$sample, 1:10])
	# scatter plot showing us how the first few PCs relate to covariates
	plot(prDes[, 1:6], pch = 19, cex = 0.8)
	# plot data on first two PCs, colored by development stage
	plot(prDes[, c("PC1", "PC2")], bg = prDes$devStage, pch = 21, cex = 1.5)
	legend(list(x = 0.2, y = 0.3), as.character(levels(prDes$devStage)), pch = 21,
	pt.bg = c(1, 2, 3, 4, 5))