Skip to content

Instantly share code, notes, and snippets.

@j-andrews7
Created August 24, 2018 16:37
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save j-andrews7/f387e83e6e2f84bbdb71589f95521607 to your computer and use it in GitHub Desktop.
Save j-andrews7/f387e83e6e2f84bbdb71589f95521607 to your computer and use it in GitHub Desktop.
Making heatmaps
make.heatmaps <- function(dds, res, rld, level, g1, g2, plot_annos=NULL) {
# plot_annos should be a vector of at least two columns from the sample sheet to use for labeling plots.
# g1 and g2 should be strings for the groups to be compared for the level (eg. "Progression", "Response")
# dds must be a DESeqExperiment object.
# rld is the log-transformed dds object.
## Should really make another generic function for the actual heatmap plots and feed it in the pval/lfc values.
# Set color breaks and palatte.
breaks <- c(seq(-3,-1.251,length=250),seq(-1.25,-0.1001,length=250),seq(-0.1,0.1,length=1),seq(0.1001,1.25,length=250),seq(1.251,3,length=250))
colors <- colorRampPalette(c("#67001f","#b2182b", "#f5f5f5", "#2166ac","#053061"))(n = 1000)
# Get the DEGs.
resSig <- subset(res, padj <= 0.1)
resSig <- resSig[order(resSig$padj),]
res100 <- row.names(head(resSig, 100))
x <- assay(rld)
matrix <- x[res100,]
# Set which columns we want to use for annotating samples.
annotation_data <- as.data.frame(colData(rld)[plot_annos])
# Plot the top 100.
pheatmap(matrix, annotation_col=annotation_data, col=colors, scale="row", breaks=breaks, show_rownames = T,
main="Top 100 DE Genes - padj <= 0.1 - RLD", fontsize_row=4, fontsize_col=5)
pheatmap(matrix, annotation_col=annotation_data, col=colors, scale="row", breaks=breaks, show_rownames = T,
main="Top 100 DE Genes - padj <= 0.1 - RLD", cluster_cols=F, fontsize_row=4, fontsize_col=5)
# Now just get the normalized counts and change the color breaks.
x <- assay(dds)
matrix <- x[res100,]
pheatmap(matrix, annotation_col=annotation_data, col=colors, scale="row", breaks=breaks, show_rownames = T,
main="Top 100 DE Genes - padj <= 0.1", fontsize_row=4, fontsize_col=5)
pheatmap(matrix, annotation_col=annotation_data, col=colors, scale="row", breaks=breaks, show_rownames = T,
main="Top 100 DE Genes - padj <= 0.1", cluster_cols=F, fontsize_row=4, fontsize_col=5)
# Now heatmaps for all DEGs with the variance-stabilized counts.
resSig <- subset(res, padj < 0.1)
resDEG <- row.names(resSig)
x <- assay(rld)
matrix <- x[resDEG,]
pheatmap(matrix, annotation_col=annotation_data, col=colors, scale="row", breaks=breaks, show_rownames = F,
main=paste("ALL DE Genes - padj <= 0.1 - RLD", sep=''), fontsize_row=3, fontsize_col=5)
pheatmap(matrix, annotation_col=annotation_data, col=colors, scale="row", breaks=breaks, show_rownames = F,
main=paste("ALL DE Genes - padj <= 0.1 - RLD", sep=''), cluster_cols=F, fontsize_row=3, fontsize_col=5)
# Now the normalized counts for all DEGs.
x <- assay(dds)
matrix <- x[resDEG,]
pheatmap(matrix, annotation_col=annotation_data, col=colors, scale="row", breaks=breaks, show_rownames = F,
main=paste("ALL DE Genes - padj <= 0.1", sep=''), fontsize_row=3, fontsize_col=5)
pheatmap(matrix, annotation_col=annotation_data, col=colors, scale="row", breaks=breaks, show_rownames = F,
main=paste("ALL DE Genes - padj <= 0.1", sep=''), cluster_cols=F, fontsize_row=3, fontsize_col=5)
# Now the same thing but only with the wanted samples.
x <- assay(rld)
x.sub <- x[, colData(rld)[,level] %in% c(g1, g2)]
if (!(ncol(x) == ncol(x.sub))) {
matrix <- x.sub[res100,]
# Set which columns we want to use for annotating samples.
annotation_data <- as.data.frame(colData(rld)[plot_annos])
# Plot the top 100.
pheatmap(matrix, annotation_col=annotation_data, col=colors, scale="row", breaks=breaks, show_rownames = T,
main=paste("Top 100 DE Genes - padj <= 0.1 - RLD", sep=''), fontsize_row=4, fontsize_col=5)
pheatmap(matrix, annotation_col=annotation_data, col=colors, scale="row", breaks=breaks, show_rownames = T,
main=paste("Top 100 DE Genes - padj <= 0.1 - RLD", sep=''), cluster_cols=F, fontsize_row=4, fontsize_col=5)
# Now just get the normalized counts and change the color breaks.
x <- assay(dds)
x.sub <- x[, colData(rld)[,level] %in% c(g1, g2)]
matrix <- x.sub[res100,]
pheatmap(matrix, annotation_col=annotation_data, col=colors, scale="row", breaks=breaks, show_rownames = T,
main=paste("Top 100 DE Genes - padj <= 0.1", sep=''), fontsize_row=4, fontsize_col=5)
pheatmap(matrix, annotation_col=annotation_data, col=colors, scale="row", breaks=breaks, show_rownames = T,
main=paste("Top 100 DE Genes - padj <= 0.1", sep=''), cluster_cols=F, fontsize_row=4, fontsize_col=5)
# Now heatmaps for all DEGs with the variance-stabilized counts.
resSig <- subset(res, padj < 0.1)
resDEG <- row.names(resSig)
x <- assay(rld)
x.sub <- x[, colData(rld)[,level] %in% c(g1, g2)]
matrix <- x.sub[resDEG,]
pheatmap(matrix, annotation_col=annotation_data, col=colors, scale="row", breaks=breaks, show_rownames = F,
main=paste("ALL DE Genes - padj <= 0.1 - RLD", sep=''), fontsize_row=3, fontsize_col=5)
pheatmap(matrix, annotation_col=annotation_data, col=colors, scale="row", breaks=breaks, show_rownames = F,
main=paste("ALL DE Genes - padj <= 0.1 - RLD", sep=''), cluster_cols=F, fontsize_row=3, fontsize_col=5)
# Now the normalized counts for all DEGs.
x <- assay(dds)
x.sub <- x[, colData(rld)[,level] %in% c(g1, g2)]
matrix <- x.sub[resDEG,]
pheatmap(matrix, annotation_col=annotation_data, col=colors, scale="row", breaks=breaks, show_rownames = F,
main=paste("ALL DE Genes - padj <= 0.1", sep=''), fontsize_row=3, fontsize_col=5)
pheatmap(matrix, annotation_col=annotation_data, col=colors, scale="row", breaks=breaks, show_rownames = F,
main=paste("ALL DE Genes - padj <= 0.1", sep=''), cluster_cols=F, fontsize_row=3, fontsize_col=5)
}
### Now let's throw in a LFC magnitude threshold.
resSig <- subset(res, padj <= 0.1)
resSig <- subset(resSig, log2FoldChange >= 2 | log2FoldChange <= -2)
resSig <- resSig[order(resSig$padj),]
res100 <- row.names(head(resSig, 100))
x <- assay(rld)
matrix <- x[res100,]
# Set which columns we want to use for annotating samples.
annotation_data <- as.data.frame(colData(rld)[plot_annos])
# Plot the top 100.
pheatmap(matrix, annotation_col=annotation_data, col=colors, scale="row", breaks=breaks, show_rownames = T,
main="Top 100 DE Genes - padj <= 0.1 & LFC >|< 2 - RLD", fontsize_row=4, fontsize_col=5)
pheatmap(matrix, annotation_col=annotation_data, col=colors, scale="row", breaks=breaks, show_rownames = T,
main="Top 100 DE Genes - padj <= 0.1 & LFC >|< 2 - RLD", cluster_cols=F, fontsize_row=4, fontsize_col=5)
# Now just get the normalized counts and change the color breaks.
x <- assay(dds)
matrix <- x[res100,]
pheatmap(matrix, annotation_col=annotation_data, col=colors, scale="row", breaks=breaks, show_rownames = T,
main="Top 100 DE Genes - padj <= 0.1 & LFC >|< 2", fontsize_row=4, fontsize_col=5)
pheatmap(matrix, annotation_col=annotation_data, col=colors, scale="row", breaks=breaks, show_rownames = T,
main="Top 100 DE Genes - padj <= 0.1 & LFC >|< 2", cluster_cols=F, fontsize_row=4, fontsize_col=5)
# Now heatmaps for all DEGs with the variance-stabilized counts.
resDEG <- row.names(resSig)
x <- assay(rld)
matrix <- x[resDEG,]
pheatmap(matrix, annotation_col=annotation_data, col=colors, scale="row", breaks=breaks, show_rownames = F,
main=paste("ALL DE Genes - padj <= 0.1 & LFC >|< 2 - RLD", sep=''), fontsize_row=3, fontsize_col=5)
pheatmap(matrix, annotation_col=annotation_data, col=colors, scale="row", breaks=breaks, show_rownames = F,
main=paste("ALL DE Genes - padj <= 0.1 & LFC >|< 2 - RLD", sep=''), cluster_cols=F, fontsize_row=3, fontsize_col=5)
# Now the normalized counts for all DEGs.
x <- assay(dds)
matrix <- x[resDEG,]
pheatmap(matrix, annotation_col=annotation_data, col=colors, scale="row", breaks=breaks, show_rownames = F,
main=paste("ALL DE Genes - padj <= 0.1 & LFC >|< 2", sep=''), fontsize_row=3, fontsize_col=5)
pheatmap(matrix, annotation_col=annotation_data, col=colors, scale="row", breaks=breaks, show_rownames = F,
main=paste("ALL DE Genes - padj <= 0.1 & LFC >|< 2", sep=''), cluster_cols=F, fontsize_row=3, fontsize_col=5)
# Now the same thing but only with the wanted samples.
x <- assay(rld)
x.sub <- x[, colData(rld)[,level] %in% c(g1, g2)]
if (!(ncol(x) == ncol(x.sub))) {
matrix <- x.sub[res100,]
# Set which columns we want to use for annotating samples.
annotation_data <- as.data.frame(colData(rld)[plot_annos])
# Plot the top 100.
pheatmap(matrix, annotation_col=annotation_data, col=colors, scale="row", breaks=breaks, show_rownames = T,
main=paste("Top 100 DE Genes - padj <= 0.1 & LFC >|< 2 - RLD", sep=''), fontsize_row=4, fontsize_col=5)
pheatmap(matrix, annotation_col=annotation_data, col=colors, scale="row", breaks=breaks, show_rownames = T,
main=paste("Top 100 DE Genes - padj <= 0.1 & LFC >|< 2 - RLD", sep=''), cluster_cols=F, fontsize_row=4, fontsize_col=5)
# Now just get the normalized counts and change the color breaks.
x <- assay(dds)
x.sub <- x[, colData(rld)[,level] %in% c(g1, g2)]
matrix <- x.sub[res100,]
pheatmap(matrix, annotation_col=annotation_data, col=colors, scale="row", breaks=breaks, show_rownames = T,
main=paste("Top 100 DE Genes - padj <= 0.1 & LFC >|< 2", sep=''), fontsize_row=4, fontsize_col=5)
pheatmap(matrix, annotation_col=annotation_data, col=colors, scale="row", breaks=breaks, show_rownames = T,
main=paste("Top 100 DE Genes - padj <= 0.1 & LFC >|< 2", sep=''), cluster_cols=F, fontsize_row=4, fontsize_col=5)
# Now heatmaps for all DEGs with the variance-stabilized counts.
resSig <- subset(res, padj <= 0.1)
resSig <- subset(resSig, log2FoldChange >= 2 | log2FoldChange <= -2)
resDEG <- row.names(resSig)
x <- assay(rld)
x.sub <- x[, colData(rld)[,level] %in% c(g1, g2)]
matrix <- x.sub[resDEG,]
pheatmap(matrix, annotation_col=annotation_data, col=colors, scale="row", breaks=breaks, show_rownames = F,
main=paste("ALL DE Genes - padj <= 0.1 & LFC >|< 2 - RLD", sep=''), fontsize_row=3, fontsize_col=5)
pheatmap(matrix, annotation_col=annotation_data, col=colors, scale="row", breaks=breaks, show_rownames = F,
main=paste("ALL DE Genes - padj <= 0.1 & LFC >|< 2 - RLD", sep=''), cluster_cols=F, fontsize_row=3, fontsize_col=5)
# Now the normalized counts for all DEGs.
x <- assay(dds)
x.sub <- x[, colData(rld)[,level] %in% c(g1, g2)]
matrix <- x.sub[resDEG,]
pheatmap(matrix, annotation_col=annotation_data, col=colors, scale="row", breaks=breaks, show_rownames = F,
main=paste("ALL DE Genes - padj <= 0.1 & LFC >|< 2", sep=''), fontsize_row=3, fontsize_col=5)
pheatmap(matrix, annotation_col=annotation_data, col=colors, scale="row", breaks=breaks, show_rownames = F,
main=paste("ALL DE Genes - padj <= 0.1 & LFC >|< 2", sep=''), cluster_cols=F, fontsize_row=3, fontsize_col=5)
}
### PADJ <= 0.05 ###
# Get the variance-stabilized counts.
resSig <- subset(res, padj <= 0.05)
resSig <- resSig[order(resSig$padj),]
res100 <- row.names(head(resSig, 100))
x <- assay(rld)
matrix <- x[res100,]
# Set which columns we want to use for annotating samples.
annotation_data <- as.data.frame(colData(rld)[plot_annos])
# Plot the top 100.
pheatmap(matrix, annotation_col=annotation_data, col=colors, scale="row", breaks=breaks, show_rownames = T,
main="Top 100 DE Genes - padj <= 0.05 - RLD", fontsize_row=4, fontsize_col=5)
pheatmap(matrix, annotation_col=annotation_data, col=colors, scale="row", breaks=breaks, show_rownames = T,
main="Top 100 DE Genes - padj <= 0.05 - RLD", cluster_cols=F, fontsize_row=4, fontsize_col=5)
# Now just get the normalized counts and change the color breaks.
x <- assay(dds)
matrix <- x[res100,]
pheatmap(matrix, annotation_col=annotation_data, col=colors, scale="row", breaks=breaks, show_rownames = T,
main="Top 100 DE Genes - padj <= 0.05", fontsize_row=4, fontsize_col=5)
pheatmap(matrix, annotation_col=annotation_data, col=colors, scale="row", breaks=breaks, show_rownames = T,
main="Top 100 DE Genes - padj <= 0.05", cluster_cols=F, fontsize_row=4, fontsize_col=5)
# Now heatmaps for all DEGs with the variance-stabilized counts.
resSig <- subset(res, padj < 0.05)
resDEG <- row.names(resSig)
x <- assay(rld)
matrix <- x[resDEG,]
pheatmap(matrix, annotation_col=annotation_data, col=colors, scale="row", breaks=breaks, show_rownames = F,
main=paste("ALL DE Genes - padj <= 0.05 - RLD", sep=''), fontsize_row=3, fontsize_col=5)
pheatmap(matrix, annotation_col=annotation_data, col=colors, scale="row", breaks=breaks, show_rownames = F,
main=paste("ALL DE Genes - padj <= 0.05 - RLD", sep=''), cluster_cols=F, fontsize_row=3, fontsize_col=5)
# Now the normalized counts for all DEGs.
x <- assay(dds)
matrix <- x[resDEG,]
pheatmap(matrix, annotation_col=annotation_data, col=colors, scale="row", breaks=breaks, show_rownames = F,
main=paste("ALL DE Genes - padj <= 0.05", sep=''), fontsize_row=3, fontsize_col=5)
pheatmap(matrix, annotation_col=annotation_data, col=colors, scale="row", breaks=breaks, show_rownames = F,
main=paste("ALL DE Genes - padj <= 0.05", sep=''), cluster_cols=F, fontsize_row=3, fontsize_col=5)
# Now the same thing but only with the wanted samples.
x <- assay(rld)
x.sub <- x[, colData(rld)[,level] %in% c(g1, g2)]
if (!(ncol(x) == ncol(x.sub))) {
matrix <- x.sub[res100,]
# Set which columns we want to use for annotating samples.
annotation_data <- as.data.frame(colData(rld)[plot_annos])
# Plot the top 100.
pheatmap(matrix, annotation_col=annotation_data, col=colors, scale="row", breaks=breaks, show_rownames = T,
main=paste("Top 100 DE Genes - padj <= 0.05 - RLD", sep=''), fontsize_row=4, fontsize_col=5)
pheatmap(matrix, annotation_col=annotation_data, col=colors, scale="row", breaks=breaks, show_rownames = T,
main=paste("Top 100 DE Genes - padj <= 0.05 - RLD", sep=''), cluster_cols=F, fontsize_row=4, fontsize_col=5)
# Now just get the normalized counts and change the color breaks.
x <- assay(dds)
x.sub <- x[, colData(rld)[,level] %in% c(g1, g2)]
matrix <- x.sub[res100,]
pheatmap(matrix, annotation_col=annotation_data, col=colors, scale="row", breaks=breaks, show_rownames = T,
main=paste("Top 100 DE Genes - padj <= 0.05", sep=''), fontsize_row=4, fontsize_col=5)
pheatmap(matrix, annotation_col=annotation_data, col=colors, scale="row", breaks=breaks, show_rownames = T,
main=paste("Top 100 DE Genes - padj <= 0.05", sep=''), cluster_cols=F, fontsize_row=4, fontsize_col=5)
# Now heatmaps for all DEGs with the variance-stabilized counts.
resSig <- subset(res, padj < 0.05)
resDEG <- row.names(resSig)
x <- assay(rld)
x.sub <- x[, colData(rld)[,level] %in% c(g1, g2)]
matrix <- x.sub[resDEG,]
pheatmap(matrix, annotation_col=annotation_data, col=colors, scale="row", breaks=breaks, show_rownames = F,
main=paste("ALL DE Genes - padj <= 0.05 - RLD", sep=''), fontsize_row=3, fontsize_col=5)
pheatmap(matrix, annotation_col=annotation_data, col=colors, scale="row", breaks=breaks, show_rownames = F,
main=paste("ALL DE Genes - padj <= 0.05 - RLD", sep=''), cluster_cols=F, fontsize_row=3, fontsize_col=5)
# Now the normalized counts for all DEGs.
x <- assay(dds)
x.sub <- x[, colData(rld)[,level] %in% c(g1, g2)]
matrix <- x.sub[resDEG,]
pheatmap(matrix, annotation_col=annotation_data, col=colors, scale="row", breaks=breaks, show_rownames = F,
main=paste("ALL DE Genes - padj <= 0.05", sep=''), fontsize_row=3, fontsize_col=5)
pheatmap(matrix, annotation_col=annotation_data, col=colors, scale="row", breaks=breaks, show_rownames = F,
main=paste("ALL DE Genes - padj <= 0.05", sep=''), cluster_cols=F, fontsize_row=3, fontsize_col=5)
}
### Now let's throw in a LFC magnitude threshold.
resSig <- subset(res, padj <= 0.05)
resSig <- subset(resSig, log2FoldChange >= 2 | log2FoldChange <= -2)
resSig <- resSig[order(resSig$padj),]
res100 <- row.names(head(resSig, 100))
x <- assay(rld)
matrix <- x[res100,]
# Set which columns we want to use for annotating samples.
annotation_data <- as.data.frame(colData(rld)[plot_annos])
# Plot the top 100.
pheatmap(matrix, annotation_col=annotation_data, col=colors, scale="row", breaks=breaks, show_rownames = T,
main="Top 100 DE Genes - padj <= 0.05 & LFC >|< 2 - RLD", fontsize_row=4, fontsize_col=5)
pheatmap(matrix, annotation_col=annotation_data, col=colors, scale="row", breaks=breaks, show_rownames = T,
main="Top 100 DE Genes - padj <= 0.05 & LFC >|< 2 - RLD", cluster_cols=F, fontsize_row=4, fontsize_col=5)
# Now just get the normalized counts and change the color breaks.
x <- assay(dds)
matrix <- x[res100,]
pheatmap(matrix, annotation_col=annotation_data, col=colors, scale="row", breaks=breaks, show_rownames = T,
main="Top 100 DE Genes - padj <= 0.05 & LFC >|< 2", fontsize_row=4, fontsize_col=5)
pheatmap(matrix, annotation_col=annotation_data, col=colors, scale="row", breaks=breaks, show_rownames = T,
main="Top 100 DE Genes - padj <= 0.05 & LFC >|< 2", cluster_cols=F, fontsize_row=4, fontsize_col=5)
# Now heatmaps for all DEGs with the variance-stabilized counts.
resDEG <- row.names(resSig)
x <- assay(rld)
matrix <- x[resDEG,]
pheatmap(matrix, annotation_col=annotation_data, col=colors, scale="row", breaks=breaks, show_rownames = F,
main=paste("ALL DE Genes - padj <= 0.05 & LFC >|< 2 - RLD", sep=''), fontsize_row=3, fontsize_col=5)
pheatmap(matrix, annotation_col=annotation_data, col=colors, scale="row", breaks=breaks, show_rownames = F,
main=paste("ALL DE Genes - padj <= 0.05 & LFC >|< 2 - RLD", sep=''), cluster_cols=F, fontsize_row=3, fontsize_col=5)
# Now the normalized counts for all DEGs.
x <- assay(dds)
matrix <- x[resDEG,]
pheatmap(matrix, annotation_col=annotation_data, col=colors, scale="row", breaks=breaks, show_rownames = F,
main=paste("ALL DE Genes - padj <= 0.05 & LFC >|< 2", sep=''), fontsize_row=3, fontsize_col=5)
pheatmap(matrix, annotation_col=annotation_data, col=colors, scale="row", breaks=breaks, show_rownames = F,
main=paste("ALL DE Genes - padj <= 0.05 & LFC >|< 2", sep=''), cluster_cols=F, fontsize_row=3, fontsize_col=5)
# Now the same thing but only with the wanted samples.
x <- assay(rld)
x.sub <- x[, colData(rld)[,level] %in% c(g1, g2)]
if (!(ncol(x) == ncol(x.sub))) {
matrix <- x.sub[res100,]
# Set which columns we want to use for annotating samples.
annotation_data <- as.data.frame(colData(rld)[plot_annos])
# Plot the top 100.
pheatmap(matrix, annotation_col=annotation_data, col=colors, scale="row", breaks=breaks, show_rownames = T,
main=paste("Top 100 DE Genes - padj <= 0.05 & LFC >|< 2 - RLD", sep=''), fontsize_row=4, fontsize_col=5)
pheatmap(matrix, annotation_col=annotation_data, col=colors, scale="row", breaks=breaks, show_rownames = T,
main=paste("Top 100 DE Genes - padj <= 0.05 & LFC >|< 2 - RLD", sep=''), cluster_cols=F, fontsize_row=4, fontsize_col=5)
# Now just get the normalized counts and change the color breaks.
x <- assay(dds)
x.sub <- x[, colData(rld)[,level] %in% c(g1, g2)]
matrix <- x.sub[res100,]
pheatmap(matrix, annotation_col=annotation_data, col=colors, scale="row", breaks=breaks, show_rownames = T,
main=paste("Top 100 DE Genes - padj <= 0.05 & LFC >|< 2", sep=''), fontsize_row=4, fontsize_col=5)
pheatmap(matrix, annotation_col=annotation_data, col=colors, scale="row", breaks=breaks, show_rownames = T,
main=paste("Top 100 DE Genes - padj <= 0.05 & LFC >|< 2", sep=''), cluster_cols=F, fontsize_row=4, fontsize_col=5)
# Now heatmaps for all DEGs with the variance-stabilized counts.
resSig <- subset(res, padj <= 0.05)
resSig <- subset(resSig, log2FoldChange >= 2 | log2FoldChange <= -2)
resDEG <- row.names(resSig)
x <- assay(rld)
x.sub <- x[, colData(rld)[,level] %in% c(g1, g2)]
matrix <- x.sub[resDEG,]
pheatmap(matrix, annotation_col=annotation_data, col=colors, scale="row", breaks=breaks, show_rownames = F,
main=paste("ALL DE Genes - padj <= 0.05 & LFC >|< 2 - RLD", sep=''), fontsize_row=3, fontsize_col=5)
pheatmap(matrix, annotation_col=annotation_data, col=colors, scale="row", breaks=breaks, show_rownames = F,
main=paste("ALL DE Genes - padj <= 0.05 & LFC >|< 2 - RLD", sep=''), cluster_cols=F, fontsize_row=3, fontsize_col=5)
# Now the normalized counts for all DEGs.
x <- assay(dds)
x.sub <- x[, colData(rld)[,level] %in% c(g1, g2)]
matrix <- x.sub[resDEG,]
pheatmap(matrix, annotation_col=annotation_data, col=colors, scale="row", breaks=breaks, show_rownames = F,
main=paste("ALL DE Genes - padj <= 0.05 & LFC >|< 2", sep=''), fontsize_row=3, fontsize_col=5)
pheatmap(matrix, annotation_col=annotation_data, col=colors, scale="row", breaks=breaks, show_rownames = F,
main=paste("ALL DE Genes - padj <= 0.05 & LFC >|< 2", sep=''), cluster_cols=F, fontsize_row=3, fontsize_col=5)
}
}
run.DESeq2 <- function(outpath, quantspath, samplesheet, tx2gene, level, g1, g2, plot_annos=NULL, block=NULL) {
# plot_annos should be a vector of at least two columns from the sample sheet to use for labeling plots.
# g1 and g2 should be strings for the groups to be compared for the level (eg. "Progression", "Response")
# block can either be a single string ("cells") or a vector of multiple (c("cells", "disease"))
message("### EXPLORATORY DATA ANALYSIS ###\n")
message("# SET DIRECTORY STRUCTURE AND MODEL DESIGN #\n")
### SET DIRECTORY STRUCTURE AND MODEL DESIGN ###
# Base folder to create output folders in. Create output folders.
base <- outpath
# Determine if there is a blocking factor or not and set design.
if (!is.null(block)) {
design <- formula(paste("~", paste(c(block, level), sep='', collapse=" + ")))
if (!dir.exists(file.path(base,paste(c(block,"Block"), sep='', collapse='')))) {
dir.create(file.path(base,paste(c(block,"Block"), sep='', collapse='')))
}
if (!dir.exists(file.path(base,paste(c(block,"Block/GenericFigures"), sep='', collapse='')))) {
dir.create(file.path(base,paste(c(block,"Block/GenericFigures"), sep='', collapse='')))
}
if (!dir.exists(file.path(base,paste(c(block,"Block/GeneBoxPlots"), sep='', collapse='')))) {
dir.create(file.path(base,paste(c(block,"Block/GeneBoxPlots"), sep='', collapse='')))
}
base = file.path(base, paste(c(block,"Block"),sep='', collapse=''))
} else {
design <- formula(paste("~", level))
if (!dir.exists(file.path(base,"NoBlock"))) {
dir.create(file.path(base,"NoBlock"))
}
if (!dir.exists(file.path(base,"NoBlock/GenericFigures"))) {
dir.create(file.path(base,"NoBlock/GenericFigures"))
}
if (!dir.exists(file.path(base,"NoBlock/GeneBoxPlots"))) {
dir.create(file.path(base,"NoBlock/GeneBoxPlots"))
}
base = file.path(base, "NoBlock")
}
if (is.null(plot_annos)) {
plot_annos <- level
}
message("# FILE LOADING & PRE-FILTERING #\n")
### FILE LOADING & PRE-FILTERING ###
tx2gene = read.table(tx2gene, sep="\t")
samples <- read.table(samplesheet, header=TRUE)
rownames(samples) <- samples$name
files <- file.path(quantspath, samples$name, "quant.sf")
names(files) <- samples$name
# Read in our actual count files now.
txi <- tximport(files, type="salmon", tx2gene=tx2gene)
message(paste("\n\nDesign is:",design,".\n\n"))
# Create the DESeqDataSet object.
dds <- DESeqDataSetFromTximport(txi,
colData = samples,
design = design)
# Pre-filter transcripts with really low read counts (<100 across all samples)
message(paste("\ndds starting with",nrow(dds),"genes."))
dds <- dds[rowSums(counts(dds)) >= 100,]
message(paste("\ndds has",nrow(dds),"genes after filtering rows with under 100 reads total.\n"))
message("\n# VARIANCE STABILIZATION COMPARISONS #\n")
message(paste(base,"/GenericFigures/transformation.pdf\n", sep=""))
### VARIANCE STABILIZATION COMPARISONS ###
rld <- rlog(dds, blind = FALSE)
vsd <- vst(dds, blind = FALSE)
pdf(paste(base,"/GenericFigures/transformation.pdf", sep=""))
dds <- DESeq2::estimateSizeFactors(dds)
df <- bind_rows(
as_data_frame(log2(counts(dds, normalized=TRUE)[, 1:2]+1)) %>%
mutate(transformation = "log2(x + 1)"),
as_data_frame(assay(rld)[, 1:2]) %>% mutate(transformation = "rlog"),
as_data_frame(assay(vsd)[, 1:2]) %>% mutate(transformation = "vst"))
df
colnames(df)[1:2] <- c("x", "y")
p <- ggplot(df, aes(x = x, y = y)) + geom_hex(bins = 80) +
coord_fixed() + facet_grid( . ~ transformation)
print(p)
dev.off()
### SAMPLE DISTANCES ###
message(paste("\n# PLOTTING SAMPLE DISTANCES #\n", paste(base,"/GenericFigures/samp_dist.pdf\n", sep="")))
sampleDists <- dist(t(assay(rld)))
pdf(paste(base,"/GenericFigures/samp_dist.pdf", sep=""))
## ----distheatmap, fig.width = 6.1, fig.height = 4.5----------------------
sampleDistMatrix <- as.matrix( sampleDists )
rownames(sampleDistMatrix) <- paste( colData(rld)[,level], rld$name, sep = " - " )
colnames(sampleDistMatrix) <- NULL
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
pheatmap(sampleDistMatrix,
clustering_distance_rows = sampleDists,
clustering_distance_cols = sampleDists,
col = colors)
# Now with only the groups we want to compare.
rld.sub = rld[,colData(rld)[,level] %in% c(g1, g2)]
sampleDists <- dist(t(assay(rld.sub)))
sampleDistMatrix <- as.matrix( sampleDists )
rownames(sampleDistMatrix) <- paste( colData(rld.sub)[,level], rld.sub$name, sep = " - " )
pheatmap(sampleDistMatrix,
clustering_distance_rows = sampleDists,
clustering_distance_cols = sampleDists,
col = colors)
dev.off()
### PCA PLOTS ###
message(paste("# PCA PLOTS #\n",paste(base, "/GenericFigures/pca.pdf\n", sep='')))
pcaData <- DESeq2::plotPCA(rld, intgroup = plot_annos, returnData = TRUE)
pcaData
percentVar <- round(100 * attr(pcaData, "percentVar"))
pdf(paste(base, "/GenericFigures/pca.pdf", sep=''))
p <- DESeq2::plotPCA(rld, intgroup = plot_annos)
print(p)
## ----ggplotpca, fig.width=6, fig.height=4.5------------------------------
p <- ggplot(pcaData, aes(x = PC1, y = PC2, color = colData(rld)[,plot_annos[1]], shape = colData(rld)[,plot_annos[2]])) +
geom_point(size =3) +
xlab(paste0("PC1: ", percentVar[1], "% variance")) +
ylab(paste0("PC2: ", percentVar[2], "% variance")) +
coord_fixed()
print(p)
# Now with only the groups we want to compare.
pcaData <- DESeq2::plotPCA(rld.sub, intgroup = plot_annos, returnData = TRUE)
percentVar <- round(100 * attr(pcaData, "percentVar"))
p <- DESeq2::plotPCA(rld.sub, intgroup = plot_annos)
print(p)
## ----ggplotpca, fig.width=6, fig.height=4.5------------------------------
p <- ggplot(pcaData, aes(x = PC1, y = PC2, color = colData(rld.sub)[,plot_annos[1]], shape = colData(rld.sub)[,plot_annos[2]])) +
geom_point(size =3) +
xlab(paste0("PC1: ", percentVar[1], "% variance")) +
ylab(paste0("PC2: ", percentVar[2], "% variance")) +
coord_fixed()
print(p)
dev.off()
#======================================#
### DIFFERENTIAL EXPRESSION ANALYSIS ###
message("\n### DIFFERENTIAL EXPRESSION ANALYSIS ###\n")
dds <- DESeq(dds)
res <- lfcShrink(dds, contrast=c(level, g1, g2), type='ashr')
### PLOTTING THE RESULTS ###
# This gets all the normalized counts for all genes/samples.
resdata <- merge(as.data.frame(res), as.data.frame(counts(dds, normalized=TRUE)), by="row.names", sort=FALSE)
names(resdata)[1] <- "Gene"
# Make table for just DEGs.
resSig <- subset(resdata, padj <= 0.1)
### DEG BOXPLOTS ###
message("\n# DEG BOXPLOTS #\n")
message("Creating boxplots for all genes with padj <= 0.1")
# If you have more than 7 levels for your contrast, you may need to add colors here.
fill = c("#A6CEE3","#B2DF8A","#FDBF6F","#CAB2D6","#f4cae4","#f1e2cc","#b3e2cd")
line = c("#1F78B4","#33A02C","#FF7F00","#6A3D9A","#e7298a","#a6761d","#1b9e77")
for (i in 1:nrow(resSig)){
if (!file.exists(paste(base,"/GeneBoxPlots/",gsub('/','-',resSig$Gene[i]),".BoxPlot.pdf",sep=""))){
pdf(paste(base,"/GeneBoxPlots/",gsub('/','-',resSig$Gene[i]),".BoxPlot.pdf",sep=""))
d = plotCounts(dds, gene=resSig$Gene[i], intgroup=level)
d = plotCounts(dds, gene=resSig$Gene[i], intgroup=level, returnData = T)
p = ggplot(d, aes(x=d[,level], y=count)) + geom_boxplot(fill=fill[1:length(levels(colData(rld)[,level]))],colour=line[1:length(levels(colData(rld)[,level]))]) + ggtitle(resSig$Gene[i]) + coord_trans(y="log10")
print(p)
d = plotCounts(dds, gene=resSig$Gene[i], intgroup=level, returnData = T)
e = subset(d, (get(level) == g1 | get(level) == g2))
p = ggplot(e, aes(x=e[,level], y=count)) + geom_boxplot(fill=fill[1:2],colour=line[1:2]) + ggtitle(resSig$Gene[i]) + coord_trans(y="log10")
print(p)
dev.off()
}
}
### DEG PCA PLOTS ###
# TODO - Make this a generic function.
message(paste("\n# DEG PCA PLOTS #\n", base,"/DEG_pca.pdf\n",sep=''))
pdf(paste(base,"/DEG_pca.pdf", sep=''))
resdata <- merge(as.data.frame(res), as.data.frame(assay(rld)), by="row.names", sort=FALSE)
names(resdata)[1] <- "Gene"
resSig <- subset(resdata, padj <= 0.1)
# Show only DEG genes and only the wanted groups.
rld.sub = rld[resSig$Gene, colData(rld)[,level] %in% c(g1, g2)]
p <- DESeq2::plotPCA(rld.sub, intgroup = plot_annos) + ggtitle("DEGs - padj <= 0.1")
print(p)
p <- DESeq2::plotPCA(rld.sub, intgroup = level) + ggtitle("DEGs - padj <= 0.1")
print(p)
# Now with lfc threshold on genes as well.
resSig <- subset(resSig, log2FoldChange >= 2 | log2FoldChange <= -2)
# Show only DEG genes and only the wanted groups.
rld.sub = rld[resSig$Gene, colData(rld)[,level] %in% c(g1, g2)]
p <- DESeq2::plotPCA(rld.sub, intgroup = plot_annos) + ggtitle("DEGs - padj <= 0.1 & LFC >|< 2")
print(p)
p <- DESeq2::plotPCA(rld.sub, intgroup = level) + ggtitle("DEGs - padj <= 0.1 & LFC >|< 2")
print(p)
resSig <- subset(resdata, padj <= 0.05)
# Show only DEG genes and only the wanted groups.
rld.sub = rld[resSig$Gene, colData(rld)[,level] %in% c(g1, g2)]
p <- DESeq2::plotPCA(rld.sub, intgroup = plot_annos) + ggtitle("DEGs - padj <= 0.05")
print(p)
p <- DESeq2::plotPCA(rld.sub, intgroup = level) + ggtitle("DEGs - padj <= 0.05")
print(p)
# Now with lfc threshold on genes as well.
resSig <- subset(resdata, padj <= 0.05)
resSig <- subset(resSig, log2FoldChange >= 2 | log2FoldChange <= -2)
# Show only DEG genes and only the wanted groups.
rld.sub = rld[resSig$Gene, colData(rld)[,level] %in% c(g1, g2)]
p <- DESeq2::plotPCA(rld.sub, intgroup = plot_annos) + ggtitle("DEGs - padj <= 0.05 & LFC >|< 2")
print(p)
p <- DESeq2::plotPCA(rld.sub, intgroup = level) + ggtitle("DEGs - padj <= 0.05 & LFC >|< 2")
print(p)
dev.off()
### MA PLOT ###
message(paste("\n# MA PLOT #\n", base, "/MA_plot.pdf\n", sep=''))
pdf(paste(base,"/MA_plot.pdf", sep=''))
plotMA(res, ylim = c(-5, 5))
dev.off()
### GENE CLUSTERING - VARIABLE ###
# First with all samples included.
message(paste("\n# VARIABLE GENE CLUSTERING #\n", base, "/Top20_VariableGenes_HeatmapDistances.pdf\n", sep=''))
topVarGenes <- head(order(rowVars(assay(rld)), decreasing = TRUE), 20)
mat <- assay(rld)[ topVarGenes, ]
mat <- mat - rowMeans(mat)
anno <- as.data.frame(colData(rld)[, plot_annos])
pdf(paste(base,"/Top20_VariableGenes_HeatmapDistances.pdf", sep=''))
pheatmap(mat, annotation_col = anno, main="Top 20 Variable Genes")
#Then with only the wanted groups.
rld.sub = rld[, colData(rld)[,level] %in% c(g1, g2)]
topVarGenes <- head(order(rowVars(assay(rld.sub)), decreasing = TRUE), 20)
mat <- assay(rld.sub)[ topVarGenes, ]
mat <- mat - rowMeans(mat)
anno <- as.data.frame(colData(rld.sub)[, plot_annos])
pheatmap(mat, annotation_col = anno, main="Top 20 Variable Genes")
dev.off()
### VOLCANO PLOT ###
message(paste("# VOLCANO PLOT #\n", getwd(), "/", substr(base, 2, 500), "/",g1,"-v-",g2,".DEGs.Volcano.html\n", sep=''))
# Set up results table.
res <- res[order(res$padj),]
resdata <- merge(as.data.frame(res), as.data.frame(counts(dds, normalized=TRUE)), by="row.names", sort=FALSE)
names(resdata)[1] <- "Gene"
diff_df = resdata[c("Gene","log2FoldChange","padj")]
# add a grouping column; default value is "not significant"
diff_df["group"] <- "NotSignificant"
# for our plot, we want to highlight
# padj < 0.1 (significance level)
# log2FoldChange > 2
# change the grouping for the entries with significance but not a large enough Fold change
diff_df[which(diff_df['padj'] <= 0.1 & abs(diff_df['log2FoldChange']) <= 2 ),"group"] <- "Significant"
# change the grouping for the entries with a large enough Fold change but not a low enough p value
diff_df[which(diff_df['padj'] >= 0.1 & abs(diff_df['log2FoldChange']) >= 2 ),"group"] <- "FoldChange"
# change the grouping for the entries with both significance and large enough fold change
diff_df[which(diff_df['padj'] <= 0.1 & abs(diff_df['log2FoldChange']) >= 2 ),"group"] <- "Significant&FoldChange"
# Find and label the top peaks.
top_peaks <- diff_df[with(diff_df, order(log2FoldChange, padj)),][1:5,]
top_peaks <- rbind(top_peaks, diff_df[with(diff_df, order(-log2FoldChange, padj)),][1:5,])
# Add gene labels for all of the top genes we found
a <- list(
x = top_peaks$log2FoldChange,
y = -log10(top_peaks$padj),
text = top_peaks$Gene,
xref = "x",
yref = "y",
showarrow = TRUE,
arrowhead = 0.5,
ax = 20,
ay = -40
)
p <- plot_ly(diff_df, x = ~log2FoldChange, y = -log10(diff_df$padj), text = ~Gene, color = ~group,mode = "markers", type="scatter") %>%
layout(title = paste(g1,"v",g2, sep='-'), xaxis=list(title="log2 Fold Change"), yaxis=list(title="-log10(adjusted p-value)"))
# Save plot - has to use absolute path because the htmltools package is apparently hot garbage.
htmlwidgets::saveWidget(as_widget(p), paste(getwd(), "/", substr(base, 2, 500), "/",g1,"-v-",g2,".DEGs.Volcano.html", sep=''))
### HEATMAPS ###
message(paste("\n# DEG HEATMAPS #\n", sep=''))
pdf(paste(base, "/DEG_Heatmaps.pdf", sep=''))
make.heatmaps(dds, res, rld, level, g1, g2, plot_annos)
dev.off()
### SAVING TABLES ###
message("\n# SAVING RESULTS TABLES #\n")
write.csv(resdata, file=paste(base, "/",g1,"-v-",g2,".ALL.csv",sep=''), row.names=F)
# Make table for just DEGs too.
resSig <- subset(resdata, padj <= 0.1)
write.csv(resSig, file=paste(base, "/",g1,"-v-",g2,".DEGs.padj0.1.csv", sep=''), row.names=F)
resSig <- subset(resdata, padj <= 0.05)
write.csv(resSig, file=paste(base, "/",g1,"-v-",g2,".DEGs.padj0.05.csv", sep=''), row.names=F)
return(list(res=res, dds=dds, rld=rld))
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment