Last active
December 14, 2015 14:39
-
-
Save sjackman/5102584 to your computer and use it in GitHub Desktop.
STAT 540 Seminar 9
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# STAT 540 Seminar 9 | |
# Shaun Jackman | |
# Data Preparation | |
library('RColorBrewer') | |
library('car') | |
library('lattice') | |
library('limma') | |
library('pvclust') | |
library('xtable') | |
prDat <- read.table("../data/photoRec/GSE4051_data.txt") # the whole enchilada | |
load("../data/photoRec/GSE4051_design.robj") # load exp design as 'prDes' | |
sprDat <- t(scale(t(prDat))) | |
# Sample Clustering | |
# Hierarchical clustering for photoRec data | |
# compute pairwise distances | |
pr.dis = dist(t(sprDat), method = "euclidean") | |
pr.nms = with(prDes, paste(gType, devStage, sep = "_")) | |
# compute hierarchical clustering using different linkage types | |
pr.hc.s <- hclust(pr.dis, method = "single") | |
pr.hc.c <- hclust(pr.dis, method = "complete") | |
pr.hc.a <- hclust(pr.dis, method = "average") | |
pr.hc.w <- hclust(pr.dis, method = "ward") | |
# plot them | |
par(mar = c(0, 4, 4, 2)) | |
par(mfrow = c(2, 2)) | |
plot(pr.hc.s, labels = FALSE, main = "Single", xlab = "") | |
plot(pr.hc.c, labels = FALSE, main = "Complete", xlab = "") | |
plot(pr.hc.a, labels = FALSE, main = "Average", xlab = "") | |
plot(pr.hc.w, labels = FALSE, main = "Ward", xlab = "") | |
# identify 8 clusters | |
par(mfrow = c(1, 1)) | |
plot(pr.hc.w, labels = pr.nms, cex = 0.6, main = "Ward showing 10 clusters") | |
rect.hclust(pr.hc.w, k = 10) | |
# Exercise: Play with the options of the heatmap function and compare the different heatmaps. Note that one can also use the original data prDat and set the option `scale="row". You will get the same heatmaps although the columns may be ordered diffently (use Colv=NA to suppress reordering). | |
heatmap(as.matrix(sprDat), Rowv = NA, Colv = NULL, | |
hclustfun = function(x) hclust(x, method = "ward"), | |
distfun = function(x) dist(x, method = "euclidean"), | |
scale = "none", labCol = pr.nms, labRow = NA, margin = c(8, 1), | |
ColSideColor = brewer.pal(11, "RdGy")[c(4, 7)][unclass(prDes$gType)]) | |
# Partitioning methods for photoRec data | |
# K-means clustering | |
# Objects in columns | |
set.seed(31) | |
samplecenters <- 5 | |
pr.km <- kmeans(t(sprDat), centers = samplecenters, nstart = 50) | |
# We can look at the within sum of squares of each cluster | |
pr.km$withinss | |
# We can look at the composition of each cluster | |
pr.kmTable = data.frame(devStage = prDes$devStage, cluster = pr.km$cluster) | |
prTable = xtable(with(pr.kmTable, table(devStage, cluster)), caption = "Number of samples from each develomental stage within each cluster") | |
align(prTable) <- "lccccc" | |
print(prTable, type = "html", caption.placement = "top") | |
# Repeate the analysis using a different seed and check if you get the same clusters. | |
# PAM algorithm | |
pr.pam <- pam(pr.dis, k = samplecenters) | |
split(as.vector(unlist(prDes$devStage)), pr.pam$clustering) | |
summary(pr.pam) | |
par(mar = c(5, 1, 4, 4)) | |
plot(pr.pam, main = "Silhouette Plot for 5 clusters") | |
# Excersise: draw a plot with number of clusters in the x-axis and the average silhouette widths in the y-axis. Use the information obtained to determine if 5 was the best choice for the number of clusters. | |
averageSilhouetteWidth <- function(k) { | |
pr.km <- kmeans(t(sprDat), centers = k, nstart = 50) | |
pr.pam <- pam(pr.dis, k = k) | |
return(pr.pam$silinfo$avg.width) | |
} | |
k <- 2:10 | |
avg.width <- unlist(apply(as.matrix(k), 1, averageSilhouetteWidth)) | |
xyplot(avg.width ~ k, data.frame(k=k, avg.width=avg.width), | |
main='Average silhouette width vs number of clusters') | |
cat('The optimal number of clusters is', k[which.min(avg.width)]) | |
# Gene clustering | |
# A smaller dataset | |
# We start by using different clustering algorithms to cluster the top 972 genes that showed differential expression across the different develomental stage (BH adjusted pvalue < 10-5). | |
mm <- model.matrix(~devStage, prDes) | |
fit <- eBayes(lmFit(prDat, mm)) | |
tt <- topTable(fit, n=Inf, sort.by='none', | |
coef = grep('devStage', colnames(mm))) | |
cat('Number of probes with an adjusted p-value < 1e-5:', | |
sum(tt$adj.P.Val < 1e-5)) | |
topDat <- sprDat[tt$adj.P.Val < 1e-5,] | |
# Hierarchical: | |
geneC.dis = dist(topDat, method = "euclidean") | |
geneC.hc.a <- hclust(geneC.dis, method = "average") | |
plot(geneC.hc.a, labels = FALSE, main = "Hierarchical with Average Linkage", | |
xlab = "") | |
# Partitioning | |
set.seed(1234) | |
genecenters <- 5 | |
kmeans.genes <- kmeans(topDat, centers = genecenters) | |
clusterNum <- 1 | |
plot(kmeans.genes$centers[clusterNum, ], ylim = c(-4, 4), type = "n", xlab = "Samples", | |
ylab = "Relative expression") | |
matlines(y = t(topDat[kmeans.genes$cluster == clusterNum, ]), col = "grey") | |
points(kmeans.genes$centers[clusterNum, ], type = "l") | |
points(kmeans.genes$centers[clusterNum, ], col = prDes$devStage, pch = 20) | |
# Or, probably more commonly used, we can see both dendograms using heatmaps (through hierarchical clustering): | |
heatmap(as.matrix(topDat), Rowv = NULL, Colv = NULL, hclustfun = function(x) hclust(x, | |
method = "average"), distfun = function(x) dist(x, method = "euclidean"), | |
labCol = pr.nms, labRow = NA, margin = c(8, 1), scale = "none", ColSideColor = brewer.pal(11, | |
"RdGy")[c(2, 4, 7, 9, 11)][unclass(prDes$devStage)]) | |
# Redefining the attributes | |
oDat <- with(prDes, data.frame(sample, devStage, gType, probeset = factor(rep(rownames(topDat), | |
each = ncol(topDat))), geneExp = as.vector(unlist(t(topDat))))) | |
devStageAvg = with(oDat, tapply(geneExp, list(probeset, devStage), mean)) | |
heatmap(as.matrix(devStageAvg), Rowv = NULL, Colv = NA, hclustfun = function(x) hclust(x, | |
method = "average"), distfun = function(x) dist(x, method = "euclidean"), | |
labCol = colnames(devStageAvg), labRow = NA, margin = c(8, 1)) | |
geneDS.km <- kmeans(devStageAvg, centers = 4, nstart = 50) | |
# Look at all clusters | |
par(mfrow = c(2, 2)) | |
for (clusterNum in 1:4) { | |
# Set up the axes without plotting; ylim set based on trial run. | |
plot(geneDS.km$centers[clusterNum, ], ylim = c(-4, 4), type = "n", xlab = "Develomental Stage", | |
ylab = "Relative expression", axes = F, main = paste("Cluster", clusterNum, | |
sep = " ")) | |
axis(2) | |
axis(1, 1:5, c(levels(prDes$devStage)[1:4], "4W"), cex.axis = 0.9) | |
# Plot the expression of all the genes in the selected cluster in grey. | |
matlines(y = t(devStageAvg[geneDS.km$cluster == clusterNum, ]), col = "grey") | |
# Add the cluster center. This is last so it isn't underneath the members | |
points(geneDS.km$centers[clusterNum, ], type = "l") | |
# Optional: points to show development stages. | |
points(geneDS.km$centers[clusterNum, ], pch = 20) | |
} | |
par(mfrow = c(1, 1)) | |
plot(geneDS.km$centers[clusterNum, ], ylim = c(-4, 4), type = "n", xlab = "Develomental Stage", | |
ylab = "Average expression", axes = F, main = "Clusters centers") | |
axis(2) | |
axis(1, 1:5, c(levels(prDes$devStage)[1:4], "4W"), cex.axis = 0.9) | |
for (clusterNum in 1:4) { | |
points(geneDS.km$centers[clusterNum, ], type = "l", col = clusterNum, lwd = 2) | |
points(geneDS.km$centers[clusterNum, ], col = clusterNum, pch = 20) | |
} | |
cloud(devStageAvg[, "E16"] ~ devStageAvg[, "P6"] * devStageAvg[, "4_weeks"], | |
col = geneDS.km$clust, xlab = "E16", ylab = "P6", zlab = "4_weeks") | |
# Statistical measures to evaluate clusters | |
pvc <- pvclust(topDat, nboot = 100) | |
plot(pvc, labels = pr.nms, cex = 0.6) | |
pvrect(pvc, alpha = 0.95) | |
# PCA (principal components analysis) | |
pcs <- prcomp(sprDat, center = F, scale = F) | |
plot(pcs) | |
# append the rotations for the first 10 PCs to the phenodata | |
prDes <- cbind(prDes, pcs$rotation[prDes$sample, 1:10]) | |
# scatter plot showing us how the first few PCs relate to covariates | |
plot(prDes[, 1:6], pch = 19, cex = 0.8) | |
# plot data on first two PCs, colored by development stage | |
plot(prDes[, c("PC1", "PC2")], bg = prDes$devStage, pch = 21, cex = 1.5) | |
legend(list(x = 0.2, y = 0.3), as.character(levels(prDes$devStage)), pch = 21, | |
pt.bg = c(1, 2, 3, 4, 5)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment