Created
September 18, 2012 19:23
-
-
Save stephenturner/3745236 to your computer and use it in GitHub Desktop.
DESeq vs edgeR comparison
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Note: using the devel versions of both packages! | |
library(DESeq) # version 1.9.11 | |
library(edgeR) # version 2.99.8 | |
library(VennDiagram) | |
# Read in data ------------------------------------------------------------ | |
## Use pasilla data | |
datafile = system.file( "extdata/pasilla_gene_counts.tsv", package="pasilla" ) | |
datafile | |
## Read in the data making the row names the first column | |
counttable <- read.table(datafile, header=T, row.names=1) | |
head(counttable) | |
## Make metadata data.frame | |
meta <- data.frame( | |
row.names=colnames(counttable), | |
condition=c("untreated", "untreated", "untreated", "untreated", "treated", "treated", "treated"), | |
libType=c("single", "single", "paired", "paired", "single", "paired", "paired")) | |
meta$condition <- relevel(meta$condition, ref="untreated") | |
meta | |
## Independent filtering? | |
# keep_cpm <- rowSums(cpm(counttable)>2) >=2 | |
# keep_quantile <- rowSums(counttable)>quantile(rowSums(counttable), probs=.5) | |
# addmargins(table(keep_cpm, keep_quantile)) | |
# counttable <- counttable[keep_cpm, ] | |
# DESeq ------------------------------------------------------------------- | |
## Make a new countDataSet | |
d <- newCountDataSet(counttable, meta) | |
## Estimate library size and dispersion | |
d <- estimateSizeFactors(d) | |
d <- estimateDispersions(d) | |
plotDispEsts(d, main="DESeq: Per-gene dispersion estimates") | |
## Principal components biplot on variance stabilized data, color-coded by condition-librarytype | |
print(plotPCA(varianceStabilizingTransformation(d), intgroup=c("condition", "libType"))) | |
## Fit full and reduced models, get p-values | |
dfit1 <- fitNbinomGLMs(d, count~libType+condition) | |
dfit0 <- fitNbinomGLMs(d, count~libType) | |
dpval <- nbinomGLMTest(dfit1, dfit0) | |
dpadj <- p.adjust(dpval, method="BH") | |
## Make results table with pvalues and adjusted p-values | |
dtable <- transform(dfit1, pval=dpval, padj=dpadj) | |
dtable <- dtable[order(dtable$padj), ] | |
head(dtable) | |
# edgeR ------------------------------------------------------------------- | |
## Make design matrix | |
condition <- relevel(factor(meta$condition), ref="untreated") | |
libType <- factor(meta$libType) | |
edesign <- model.matrix(~libType+condition) | |
## Make new DGEList, normalize by library size, and estimate dispersion allowing possible trend with average count size | |
e <- DGEList(counts=counttable) | |
e <- calcNormFactors(e) | |
e <- estimateGLMCommonDisp(e, edesign) | |
e <- estimateGLMTrendedDisp(e, edesign) | |
e <- estimateGLMTagwiseDisp(e, edesign) | |
## MDS Plot | |
plotMDS(e, main="edgeR MDS Plot") | |
## Biological coefficient of variation plot | |
plotBCV(e, cex=0.4, main="edgeR: Biological coefficient of variation (BCV) vs abundance") | |
## Fit the model, testing the coefficient for the treated vs untreated comparison | |
efit <- glmFit(e, edesign) | |
efit <- glmLRT(efit, coef="conditiontreated") | |
## Make a table of results | |
etable <- topTags(efit, n=nrow(e))$table | |
etable <- etable[order(etable$FDR), ] | |
head(etable) | |
## ~MA Plot | |
with(etable, plot(logCPM, logFC, pch=20, main="edgeR: Fold change vs abundance")) | |
with(subset(etable, FDR<0.05), points(logCPM, logFC, pch=20, col="red")) | |
abline(h=c(-1,1), col="blue") | |
# Comparison -------------------------------------------------------------- | |
head(etable) | |
head(dtable) | |
addmargins(table(sig.edgeR=etable$FDR<0.05, sig.DESeq=dtable$padj<0.05)) | |
merged <- merge(etable, dtable, by='row.names') | |
with( merged, plot(logFC, conditiontreated, xlab="logFC edgeR", ylab="logFC DESeq", pch=20, col="black", main="Fold change for DESeq vs edgeR")) | |
with(subset(merged, FDR<0.05), points(logFC, conditiontreated, xlab="logFC edgeR", ylab="logFC DESeq", pch=20, col="red")) | |
with(subset(merged, padj<0.05), points(logFC, conditiontreated, xlab="logFC edgeR", ylab="logFC DESeq", pch=20, col="green")) | |
legend("topleft", xjust=1, yjust=1, legend=c("FDR<0.05 edgeR only", "FDR<0.05 DESeq & edgeR", "FDR>0.05"), pch=20, col=c("red", "green", "black"), bty="n") | |
# > sessionInfo() | |
# R version 2.15.0 (2012-03-30) | |
# Platform: i386-apple-darwin9.8.0/i386 (32-bit) | |
# | |
# locale: | |
# [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8 | |
# | |
# attached base packages: | |
# [1] stats graphics grDevices utils datasets methods base | |
# | |
# other attached packages: | |
# [1] DESeq_1.9.11 lattice_0.20-10 locfit_1.5-8 | |
# [4] Biobase_2.16.0 BiocGenerics_0.2.0 edgeR_2.99.8 | |
# [7] limma_3.12.1 BiocInstaller_1.4.7 | |
# | |
# loaded via a namespace (and not attached): | |
# [1] annotate_1.34.1 AnnotationDbi_1.18.1 DBI_0.2-5 | |
# [4] genefilter_1.38.0 geneplotter_1.34.0 grid_2.15.0 | |
# [7] IRanges_1.14.4 RColorBrewer_1.0-5 RSQLite_0.11.1 | |
# [10] splines_2.15.0 stats4_2.15.0 survival_2.36-14 | |
# [13] tools_2.15.0 XML_3.9-4 xtable_1.7-0 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi Stephen: First I like your post on the comparison of DESeq and edgeR - I used both packages in my research. I have a question while running the code: it seems that the function plotDispEsts() no longer exists in DESeq v.1.8.3. I installed the package on my Mac OS, and found out the function is missing. Any advice will be greatly appreciated! - Gu