Skip to content

Instantly share code, notes, and snippets.

@sjackman
Last active December 14, 2015 14:39
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save sjackman/5102584 to your computer and use it in GitHub Desktop.
Save sjackman/5102584 to your computer and use it in GitHub Desktop.
STAT 540 Seminar 9
# STAT 540 Seminar 9
# Shaun Jackman
# Data Preparation
library('RColorBrewer')
library('car')
library('lattice')
library('limma')
library('pvclust')
library('xtable')
prDat <- read.table("../data/photoRec/GSE4051_data.txt") # the whole enchilada
load("../data/photoRec/GSE4051_design.robj") # load exp design as 'prDes'
sprDat <- t(scale(t(prDat)))
# Sample Clustering
# Hierarchical clustering for photoRec data
# compute pairwise distances
pr.dis = dist(t(sprDat), method = "euclidean")
pr.nms = with(prDes, paste(gType, devStage, sep = "_"))
# compute hierarchical clustering using different linkage types
pr.hc.s <- hclust(pr.dis, method = "single")
pr.hc.c <- hclust(pr.dis, method = "complete")
pr.hc.a <- hclust(pr.dis, method = "average")
pr.hc.w <- hclust(pr.dis, method = "ward")
# plot them
par(mar = c(0, 4, 4, 2))
par(mfrow = c(2, 2))
plot(pr.hc.s, labels = FALSE, main = "Single", xlab = "")
plot(pr.hc.c, labels = FALSE, main = "Complete", xlab = "")
plot(pr.hc.a, labels = FALSE, main = "Average", xlab = "")
plot(pr.hc.w, labels = FALSE, main = "Ward", xlab = "")
# identify 8 clusters
par(mfrow = c(1, 1))
plot(pr.hc.w, labels = pr.nms, cex = 0.6, main = "Ward showing 10 clusters")
rect.hclust(pr.hc.w, k = 10)
# Exercise: Play with the options of the heatmap function and compare the different heatmaps. Note that one can also use the original data prDat and set the option `scale="row". You will get the same heatmaps although the columns may be ordered diffently (use Colv=NA to suppress reordering).
heatmap(as.matrix(sprDat), Rowv = NA, Colv = NULL,
hclustfun = function(x) hclust(x, method = "ward"),
distfun = function(x) dist(x, method = "euclidean"),
scale = "none", labCol = pr.nms, labRow = NA, margin = c(8, 1),
ColSideColor = brewer.pal(11, "RdGy")[c(4, 7)][unclass(prDes$gType)])
# Partitioning methods for photoRec data
# K-means clustering
# Objects in columns
set.seed(31)
samplecenters <- 5
pr.km <- kmeans(t(sprDat), centers = samplecenters, nstart = 50)
# We can look at the within sum of squares of each cluster
pr.km$withinss
# We can look at the composition of each cluster
pr.kmTable = data.frame(devStage = prDes$devStage, cluster = pr.km$cluster)
prTable = xtable(with(pr.kmTable, table(devStage, cluster)), caption = "Number of samples from each develomental stage within each cluster")
align(prTable) <- "lccccc"
print(prTable, type = "html", caption.placement = "top")
# Repeate the analysis using a different seed and check if you get the same clusters.
# PAM algorithm
pr.pam <- pam(pr.dis, k = samplecenters)
split(as.vector(unlist(prDes$devStage)), pr.pam$clustering)
summary(pr.pam)
par(mar = c(5, 1, 4, 4))
plot(pr.pam, main = "Silhouette Plot for 5 clusters")
# Excersise: draw a plot with number of clusters in the x-axis and the average silhouette widths in the y-axis. Use the information obtained to determine if 5 was the best choice for the number of clusters.
averageSilhouetteWidth <- function(k) {
pr.km <- kmeans(t(sprDat), centers = k, nstart = 50)
pr.pam <- pam(pr.dis, k = k)
return(pr.pam$silinfo$avg.width)
}
k <- 2:10
avg.width <- unlist(apply(as.matrix(k), 1, averageSilhouetteWidth))
xyplot(avg.width ~ k, data.frame(k=k, avg.width=avg.width),
main='Average silhouette width vs number of clusters')
cat('The optimal number of clusters is', k[which.min(avg.width)])
# Gene clustering
# A smaller dataset
# We start by using different clustering algorithms to cluster the top 972 genes that showed differential expression across the different develomental stage (BH adjusted pvalue < 10-5).
mm <- model.matrix(~devStage, prDes)
fit <- eBayes(lmFit(prDat, mm))
tt <- topTable(fit, n=Inf, sort.by='none',
coef = grep('devStage', colnames(mm)))
cat('Number of probes with an adjusted p-value < 1e-5:',
sum(tt$adj.P.Val < 1e-5))
topDat <- sprDat[tt$adj.P.Val < 1e-5,]
# Hierarchical:
geneC.dis = dist(topDat, method = "euclidean")
geneC.hc.a <- hclust(geneC.dis, method = "average")
plot(geneC.hc.a, labels = FALSE, main = "Hierarchical with Average Linkage",
xlab = "")
# Partitioning
set.seed(1234)
genecenters <- 5
kmeans.genes <- kmeans(topDat, centers = genecenters)
clusterNum <- 1
plot(kmeans.genes$centers[clusterNum, ], ylim = c(-4, 4), type = "n", xlab = "Samples",
ylab = "Relative expression")
matlines(y = t(topDat[kmeans.genes$cluster == clusterNum, ]), col = "grey")
points(kmeans.genes$centers[clusterNum, ], type = "l")
points(kmeans.genes$centers[clusterNum, ], col = prDes$devStage, pch = 20)
# Or, probably more commonly used, we can see both dendograms using heatmaps (through hierarchical clustering):
heatmap(as.matrix(topDat), Rowv = NULL, Colv = NULL, hclustfun = function(x) hclust(x,
method = "average"), distfun = function(x) dist(x, method = "euclidean"),
labCol = pr.nms, labRow = NA, margin = c(8, 1), scale = "none", ColSideColor = brewer.pal(11,
"RdGy")[c(2, 4, 7, 9, 11)][unclass(prDes$devStage)])
# Redefining the attributes
oDat <- with(prDes, data.frame(sample, devStage, gType, probeset = factor(rep(rownames(topDat),
each = ncol(topDat))), geneExp = as.vector(unlist(t(topDat)))))
devStageAvg = with(oDat, tapply(geneExp, list(probeset, devStage), mean))
heatmap(as.matrix(devStageAvg), Rowv = NULL, Colv = NA, hclustfun = function(x) hclust(x,
method = "average"), distfun = function(x) dist(x, method = "euclidean"),
labCol = colnames(devStageAvg), labRow = NA, margin = c(8, 1))
geneDS.km <- kmeans(devStageAvg, centers = 4, nstart = 50)
# Look at all clusters
par(mfrow = c(2, 2))
for (clusterNum in 1:4) {
# Set up the axes without plotting; ylim set based on trial run.
plot(geneDS.km$centers[clusterNum, ], ylim = c(-4, 4), type = "n", xlab = "Develomental Stage",
ylab = "Relative expression", axes = F, main = paste("Cluster", clusterNum,
sep = " "))
axis(2)
axis(1, 1:5, c(levels(prDes$devStage)[1:4], "4W"), cex.axis = 0.9)
# Plot the expression of all the genes in the selected cluster in grey.
matlines(y = t(devStageAvg[geneDS.km$cluster == clusterNum, ]), col = "grey")
# Add the cluster center. This is last so it isn't underneath the members
points(geneDS.km$centers[clusterNum, ], type = "l")
# Optional: points to show development stages.
points(geneDS.km$centers[clusterNum, ], pch = 20)
}
par(mfrow = c(1, 1))
plot(geneDS.km$centers[clusterNum, ], ylim = c(-4, 4), type = "n", xlab = "Develomental Stage",
ylab = "Average expression", axes = F, main = "Clusters centers")
axis(2)
axis(1, 1:5, c(levels(prDes$devStage)[1:4], "4W"), cex.axis = 0.9)
for (clusterNum in 1:4) {
points(geneDS.km$centers[clusterNum, ], type = "l", col = clusterNum, lwd = 2)
points(geneDS.km$centers[clusterNum, ], col = clusterNum, pch = 20)
}
cloud(devStageAvg[, "E16"] ~ devStageAvg[, "P6"] * devStageAvg[, "4_weeks"],
col = geneDS.km$clust, xlab = "E16", ylab = "P6", zlab = "4_weeks")
# Statistical measures to evaluate clusters
pvc <- pvclust(topDat, nboot = 100)
plot(pvc, labels = pr.nms, cex = 0.6)
pvrect(pvc, alpha = 0.95)
# PCA (principal components analysis)
pcs <- prcomp(sprDat, center = F, scale = F)
plot(pcs)
# append the rotations for the first 10 PCs to the phenodata
prDes <- cbind(prDes, pcs$rotation[prDes$sample, 1:10])
# scatter plot showing us how the first few PCs relate to covariates
plot(prDes[, 1:6], pch = 19, cex = 0.8)
# plot data on first two PCs, colored by development stage
plot(prDes[, c("PC1", "PC2")], bg = prDes$devStage, pch = 21, cex = 1.5)
legend(list(x = 0.2, y = 0.3), as.character(levels(prDes$devStage)), pch = 21,
pt.bg = c(1, 2, 3, 4, 5))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment