public
Created

DESeq vs edgeR comparison

  • Download Gist
deseq-vs-edger.R
R
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122
# Note: using the devel versions of both packages!
library(DESeq) # version 1.9.11
library(edgeR) # version 2.99.8
library(VennDiagram)
 
# Read in data ------------------------------------------------------------
 
## Use pasilla data
datafile = system.file( "extdata/pasilla_gene_counts.tsv", package="pasilla" )
datafile
 
## Read in the data making the row names the first column
counttable <- read.table(datafile, header=T, row.names=1)
head(counttable)
 
## Make metadata data.frame
meta <- data.frame(
row.names=colnames(counttable),
condition=c("untreated", "untreated", "untreated", "untreated", "treated", "treated", "treated"),
libType=c("single", "single", "paired", "paired", "single", "paired", "paired"))
meta$condition <- relevel(meta$condition, ref="untreated")
meta
 
## Independent filtering?
# keep_cpm <- rowSums(cpm(counttable)>2) >=2
# keep_quantile <- rowSums(counttable)>quantile(rowSums(counttable), probs=.5)
# addmargins(table(keep_cpm, keep_quantile))
# counttable <- counttable[keep_cpm, ]
 
# DESeq -------------------------------------------------------------------
 
## Make a new countDataSet
d <- newCountDataSet(counttable, meta)
 
## Estimate library size and dispersion
d <- estimateSizeFactors(d)
d <- estimateDispersions(d)
plotDispEsts(d, main="DESeq: Per-gene dispersion estimates")
 
## Principal components biplot on variance stabilized data, color-coded by condition-librarytype
print(plotPCA(varianceStabilizingTransformation(d), intgroup=c("condition", "libType")))
 
## Fit full and reduced models, get p-values
dfit1 <- fitNbinomGLMs(d, count~libType+condition)
dfit0 <- fitNbinomGLMs(d, count~libType)
dpval <- nbinomGLMTest(dfit1, dfit0)
dpadj <- p.adjust(dpval, method="BH")
 
## Make results table with pvalues and adjusted p-values
dtable <- transform(dfit1, pval=dpval, padj=dpadj)
dtable <- dtable[order(dtable$padj), ]
head(dtable)
 
# edgeR -------------------------------------------------------------------
 
## Make design matrix
condition <- relevel(factor(meta$condition), ref="untreated")
libType <- factor(meta$libType)
edesign <- model.matrix(~libType+condition)
 
## Make new DGEList, normalize by library size, and estimate dispersion allowing possible trend with average count size
e <- DGEList(counts=counttable)
e <- calcNormFactors(e)
e <- estimateGLMCommonDisp(e, edesign)
e <- estimateGLMTrendedDisp(e, edesign)
e <- estimateGLMTagwiseDisp(e, edesign)
 
## MDS Plot
plotMDS(e, main="edgeR MDS Plot")
 
## Biological coefficient of variation plot
plotBCV(e, cex=0.4, main="edgeR: Biological coefficient of variation (BCV) vs abundance")
 
## Fit the model, testing the coefficient for the treated vs untreated comparison
efit <- glmFit(e, edesign)
efit <- glmLRT(efit, coef="conditiontreated")
 
## Make a table of results
etable <- topTags(efit, n=nrow(e))$table
etable <- etable[order(etable$FDR), ]
head(etable)
 
## ~MA Plot
with(etable, plot(logCPM, logFC, pch=20, main="edgeR: Fold change vs abundance"))
with(subset(etable, FDR<0.05), points(logCPM, logFC, pch=20, col="red"))
abline(h=c(-1,1), col="blue")
 
# Comparison --------------------------------------------------------------
 
head(etable)
head(dtable)
 
addmargins(table(sig.edgeR=etable$FDR<0.05, sig.DESeq=dtable$padj<0.05))
 
merged <- merge(etable, dtable, by='row.names')
with( merged, plot(logFC, conditiontreated, xlab="logFC edgeR", ylab="logFC DESeq", pch=20, col="black", main="Fold change for DESeq vs edgeR"))
with(subset(merged, FDR<0.05), points(logFC, conditiontreated, xlab="logFC edgeR", ylab="logFC DESeq", pch=20, col="red"))
with(subset(merged, padj<0.05), points(logFC, conditiontreated, xlab="logFC edgeR", ylab="logFC DESeq", pch=20, col="green"))
legend("topleft", xjust=1, yjust=1, legend=c("FDR<0.05 edgeR only", "FDR<0.05 DESeq & edgeR", "FDR>0.05"), pch=20, col=c("red", "green", "black"), bty="n")
 
 
# > sessionInfo()
# R version 2.15.0 (2012-03-30)
# Platform: i386-apple-darwin9.8.0/i386 (32-bit)
#
# locale:
# [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
#
# attached base packages:
# [1] stats graphics grDevices utils datasets methods base
#
# other attached packages:
# [1] DESeq_1.9.11 lattice_0.20-10 locfit_1.5-8
# [4] Biobase_2.16.0 BiocGenerics_0.2.0 edgeR_2.99.8
# [7] limma_3.12.1 BiocInstaller_1.4.7
#
# loaded via a namespace (and not attached):
# [1] annotate_1.34.1 AnnotationDbi_1.18.1 DBI_0.2-5
# [4] genefilter_1.38.0 geneplotter_1.34.0 grid_2.15.0
# [7] IRanges_1.14.4 RColorBrewer_1.0-5 RSQLite_0.11.1
# [10] splines_2.15.0 stats4_2.15.0 survival_2.36-14
# [13] tools_2.15.0 XML_3.9-4 xtable_1.7-0

Hi Stephen: First I like your post on the comparison of DESeq and edgeR - I used both packages in my research. I have a question while running the code: it seems that the function plotDispEsts() no longer exists in DESeq v.1.8.3. I installed the package on my Mac OS, and found out the function is missing. Any advice will be greatly appreciated! - Gu

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.