Skip to content

Instantly share code, notes, and snippets.

@mikelove
Last active April 8, 2022 14:18
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mikelove/575736dafee943b508e0a9527e6cf1a4 to your computer and use it in GitHub Desktop.
Save mikelove/575736dafee943b508e0a9527e6cf1a4 to your computer and use it in GitHub Desktop.
DESeq2 and edgeR with basic QC and RUV
x <- read.csv("GSE91061_BMS038109Sample.hg19KnownGene.raw.csv", row.names=1)
condition <- factor(sub(".+_(.+)_.+", "\\1", colnames(x)))
library(DESeq2)
dds <- DESeqDataSetFromMatrix(x, colData=data.frame(condition), ~condition)
vsd <- vst(dds, blind=FALSE)
plotPCA(vsd)
rv <- rowVars(assay(vsd))
pc <- prcomp(t(assay(vsd)[head(order(-rv),1000),]))
plot(pc$x[,1:2], col=condition)
idx <- pc$x[,1] < -25
sum(idx) # nine extreme outliers
plot(pc$x[,1:2], col=idx+1, pch=20, asp=1)
condition <- condition[!idx]
dds <- dds[,!idx]
# use minimal filtering with edgeR
library(edgeR)
y <- DGEList(counts=counts(dds), group=condition)
keep <- filterByExpr(y)
table(keep)
y <- y[keep,]
dds <- dds[keep,]
vsd <- vst(dds, blind=FALSE)
# still some structure in 2D PCA
plotPCA(vsd)
# 12 seconds
system.time({
dds <- DESeq(dds, test="LRT", reduced=~1, fitType="glmGamPoi")
})
res <- results(dds)
table(res$padj < .1)
library(RUVSeq)
set <- newSeqExpressionSet(counts(dds))
set <- betweenLaneNormalization(set, which="upper")
not_sig <- rownames(res)[which(res$pvalue > .1)]
empirical <- rownames(set)[ rownames(set) %in% not_sig ]
set <- RUVg(set, empirical, k=5)
pdat <- pData(set)
pdat$condition <- condition
vsd$W1 <- pdat$W_1
vsd$W2 <- pdat$W_2
plotPCA(vsd, intgroup="W1")
plotPCA(vsd, intgroup="W2")
colData(dds) <- cbind(colData(dds), pdat[,1:5])
design(dds) <- ~W_1 + W_2 + W_3 + W_4 + W_5 + condition
# 25 seconds
system.time({
dds <- DESeq(dds, test="LRT", reduced=~W_1 + W_2 + W_3 + W_4 + W_5, fitType="glmGamPoi")
})
res <- results(dds)
table(res$padj < .1) # 19 genes
DESeq2::plotMA(res, ylim=c(-5,5))
res_sig <- res[which(res$padj < .1),]
idx <- which.max(abs(res_sig$log2FoldChange))
res_sig[idx,]
res["10911",]
plotCounts(dds, gene="10911")
y <- calcNormFactors(y)
design <- model.matrix(~W_1 + W_2 + W_3 + W_4 + W_5 + condition, data=pdat)
y <- estimateDisp(y, design)
qlfit <- glmQLFit(y, design)
qlft <- glmQLFTest(qlfit)
tt <- topTags(qlft, n=nrow(y))[[1]]
sum(tt$FDR < .1)
hist(tt$F, freq=FALSE)
F <- tt[rownames(res_sig),"F"]
lines(density(F[!is.na(F)]))
match(rownames(res_sig), rownames(tt))
# 8 12 7 3 2 15 14 13 4 1 11 18 6 19 5 16 9 21 17
table(rownames(res_sig) %in% head(rownames(tt), 20))
# 18 / 19
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment