Skip to content

Instantly share code, notes, and snippets.

@dinovski
Last active August 12, 2018 14:21
Show Gist options
  • Save dinovski/7e7eba1d390e35f87b97b48c2cbd34ba to your computer and use it in GitHub Desktop.
Save dinovski/7e7eba1d390e35f87b97b48c2cbd34ba to your computer and use it in GitHub Desktop.
differential expression analysis markdown
---
title: "__RNAseq Summary__"
output: pdf_document
date: '`r Sys.Date()`'
documentclass: article
classoption: a4paper
geometry: margin=2cm
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = FALSE, cache = FALSE, message=FALSE, warning=FALSE)
```
## *Differential Expression Analysis*
Genes with CPM > 0.5 in at least 2 samples were kept for differential analysis. This threshold was chosen based on the average log2CPM per gene to separate expressed genes from unexpressed genes. Raw count data was normalized with the TMM method and transformed to log2-CPM. A linear model was fit to the normalized data and empirical Bayes statistics were computed. Differentially expressed genes for each KO versus WT were identified from the linear fit after adjusting for multiple testing and filtered to include those with FDR < 0.05 and absolute logFC > 1.
R dependencies can be found at the end.
\pagebreak
```{r, echo=FALSE, message=FALSE}
require(rmarkdown)
require(edgeR)
require(limma)
require(Glimma)
require(ggplot2)
require(RColorBrewer)
require(gridExtra)
require(rtracklayer)
require(plyr)
require(dplyr)
require(reshape2)
require(DESeq2)
require(gplots)
require(ggrepel)
require(knitr)
require(xtable)
require(FactoMineR)
require(GenomicFeatures)
require(scater)
```
```{r, echo=FALSE, message=FALSE, warning=FALSE}
inPath=''
outPath=''
geneBedPath='gencode.v19.annotation.clean.bed'
gtfPath='gencode.v19.annotation.gtf'
statsPath='' ## output from mapStats.sh
```
```{r, echo=FALSE, results='hide', message=FALSE}
## load annotation and count files
geneBed<-read.table(geneBedPath, header=FALSE, sep="\t")
colnames(geneBed)<-c("chr","start","end","gene_id","gene_name")
## COUNT FILES: STAR
exprs.in <- list.files(path=inPath, pattern=".tab", full.names=TRUE, recursive=TRUE)
## apply(countFile[,2:4], 2, sum)
exprs.all <- lapply(exprs.in, function(file) {
countFile <- read.table(file, skip=4, header=FALSE, sep="\t")
countFile <- countFile[, c(1,4)]
name <- gsub(".R1_norRNAReadsPerGene.out.tab", "", basename(file))
colnames(countFile) <- c("gene_id", name)
return(countFile)
})
countTable <- Reduce(function(x,y) merge(x,y, all=TRUE, by="gene_id"), exprs.all)
ge<-countTable
rownames(ge)<-ge$gene_id
ge$gene_id<-NULL
```
```{r, echo=FALSE, message=FALSE, warning=FALSE, fig.height=8, fig.width=8, eval=TRUE}
## load stat files
stats.in<-list.files(path=statsPath, pattern=".txt", full.names=TRUE, recursive=FALSE)
options(scipen=999)
stats.all <- lapply(stats.in, function(file) {
inFile <- read.table(file, header=TRUE, sep=":", colClasses=c("character","numeric"))
f.name <- gsub("_mapStats.txt", "", basename(file))
return(inFile)
})
statsTable <- Reduce(function(x,y) merge(x,y, all=TRUE, by="ID"), stats.all)
rownames(statsTable)<-gsub(" ", "_", statsTable$ID)
statsTable$ID<-NULL
stats<-data.frame(t(as.matrix(statsTable)))
stats$CLONE<-rownames(stats)
options(scipen=0)
```
```{r, echo=FALSE, message=FALSE, warning=FALSE, fig.height=8, fig.width=8}
dups<-stats[,c("CLONE","DUP_RATE")]
dups$DUP_RATE=dups$DUP_RATE*100
dups<-dups[order(dups$DUP_RATE, decreasing=T), c(1,2)]
dp<-ggplot(data=dups, aes(x=CLONE, y=DUP_RATE)) + ylim(0,100) +
geom_bar(stat="identity", fill="steelblue4") +
theme(panel.grid.major=element_blank(), panel.grid.minor=element_blank(),
panel.background=element_blank(), axis.line=element_line(colour="white"),
axis.text=element_text(size=12),
axis.text.x=element_text(face="bold", size=10, angle=270),
axis.title.x=element_text(size=12),
axis.title.y=element_text(size=12),
panel.border=element_rect(colour="white", fill=NA, size=1))
dp
```
\pagebreak
## *Mapping Stats*
```{r,echo=FALSE, fig.width=8, fig.height=8, fig.align='center', eval=TRUE}
## bam file; use input (ie. post trimming)
stats.s<-stats[order(stats$CLONE, decreasing=T), c("CLONE", "TRIMMED", "PRIMARY_BAM", "UNIQUE_BAM")]
colnames(stats.s)<-c("CLONE", "TOTAL" , "MAPPED", "UNIQUE")
#stats.s<-stats[order(stats$CLONE, decreasing=T), c("CLONE", "STAR_INPUT", "STAR_UNIQUE_NUM")]
#colnames(stats.s)<-c("CLONE", "TOTAL" , "UNIQUE")
dat.m <- melt(stats.s, id.vars="CLONE")
## overlay
## set position="dodge" to plot side by side
p1<-ggplot(dat.m, aes(x=CLONE,y=value, fill=variable)) +
geom_bar(stat="identity", position = "identity", alpha=1, width=0.9) + ylab("read count") +
scale_fill_manual("variable", values = c(TOTAL="firebrick", MAPPED="steelblue4", UNIQUE= "lightblue3")) +
theme(panel.grid.major=element_blank(), panel.grid.minor=element_blank(),
panel.background=element_blank(), axis.line=element_line(colour="white"),
axis.text=element_text(size=12),
axis.text.x=element_text(face="bold", size=10, angle=270),
axis.title.x=element_text(size=12),
axis.title.y=element_text(size=12),
panel.border=element_rect(colour="white", fill=NA, size=1))
p1
```
\pagebreak
```{r,echo=FALSE, fig.width=8, fig.height=8, fig.align='center', eval=TRUE}
## percentages
stats.p<-stats[,c("CLONE", "TRIMMED" ,"PRIMARY_BAM", "UNIQUE_BAM")]
stats.p$MAPPED=stats.p$PRIMARY_BAM/stats.p$TRIMMED * 100
stats.p$UNIQUE=stats.p$UNIQUE_BAM/stats.p$TRIMMED * 100
stats.p$TOTAL=100
stats.p<-stats.p[order(stats.p$TOTAL, decreasing=T), c("CLONE", "TOTAL", "MAPPED", "UNIQUE")]
dat.m <- melt(stats.p, id.vars="CLONE")
p2<-ggplot(dat.m, aes(x=CLONE, y=value, fill=variable)) +
geom_bar(stat="identity", position = "identity", alpha=1, width=0.9) + ylab("% reads") +
scale_fill_manual("variable", values = c(TOTAL="firebrick", MAPPED="steelblue4", UNIQUE= "lightblue3")) +
theme(panel.grid.major=element_blank(), panel.grid.minor=element_blank(),
panel.background=element_blank(), axis.line=element_line(colour="white"),
axis.text=element_text(size=12),
axis.text.x=element_text(face="bold", size=10, angle=270),
axis.title.x=element_text(size=12),
axis.title.y=element_text(size=12),
panel.border=element_rect(colour="white", fill=NA, size=1))
p2
```
\pagebreak
## *gene based saturation*
```{r, echo=FALSE, echo=F, message=F, fig.width=8, fig.height=5, fig.align='center', eval=TRUE}
## gene based saturation: downsample reads and quantify num. genes with logCPM>0
## Estimate saturation of genes based on rarefaction of reads
## counts : a matrix of counts
## max_reads : maximum number of reads to downsample
## ndepths : resampling levels
## nreps : number of times the subsampling is performed to calculate a variance
## mincounts : minimum counts level to consider a gene as expressed. If NA, the threshold is fixed to 1 CPM
## extend.lines : If TRUE, the max number of detected genes is returned when the maximum number of reads is reached.
estimate_saturation <- function(counts, max_reads=Inf, ndepths=6, nreps=1, mincounts=NA, extend.lines=FALSE){
stopifnot(require(S4Vectors))
counts <- as.matrix(counts)
readsums <- colSums(counts)
max_reads <- min(max(readsums), max_reads)
depths <- round(seq(from=0, to=max_reads, length.out=ndepths+1))
saturation <- mclapply(1:ncol(counts), function(k){
message("Processing sample ", colnames(counts)[k], "...")
x <- counts[,k]
nreads <- sum(x)
## minimum expression levels
if (is.na(mincounts)){
mincounts <- nreads / 1e6
}
## max number of detected genes
ngenes_detected <- length(which(x>=mincounts))
probs <- x / nreads ## calculate gene probabilities for the library
probs <- probs[probs > 0] ## zero counts add nothing but computational time!
ngenes <- length(probs)
res <- lapply(depths, function(dp, nreps=1, ...){
rsim <- c(NA, NA)
if (extend.lines)
rsim <- c(ngenes_detected, NA)
if (dp <= nreads){
estim <- lapply(1:nreps, function(i, ngenes, dp, probs, mincounts){
csim <- sample(x=ngenes, size=dp, replace=TRUE, prob=probs)
length(which(runLength(Rle(sort(csim)))>=mincounts))
}, ngenes=ngenes, dp=dp, probs=probs, mincounts=mincounts)
rsim <- c(mean(unlist(estim)), var(unlist(estim)))
}
return(rsim)
}, nreps=nreps, nreads=nreads, ngenes=ngenes, probs=probs, mincounts=mincounts)
data.frame(depths=depths,
sat.estimates=sapply(res, "[[", 1),
sat.var.estimates=sapply(res, "[[", 2))
})
names(saturation) <- colnames(counts)
return(saturation)
}
sat <- estimate_saturation(counts=ge, ndepths=10, nreps=1, extend.lines=TRUE)
d2p <- melt(sat, id.vars=c("depths", "sat.estimates", "sat.var.estimates"))
selcol<-colorRampPalette(brewer.pal(9, "Set1"))(length(sat))
gg <- ggplot(d2p) + geom_point(aes(x=depths, y=sat.estimates), color="darkgray", shape=3, size=2) + geom_path(aes(x=depths, y=sat.estimates, color=L1)) +
theme_classic() + theme(legend.position="bottom", legend.title=element_blank()) +
labs(x="Sequencing depth", y="Number of detected genes (>=1 CPM)") + theme(axis.title = element_text(face="bold", colour="black", size=12)) +
scale_color_manual(values=selcol, guide=guide_legend(nrow=ceiling(length(sat)/6)))
gg
```
```{r preseq, echo=FALSE, eval=FALSE}
preseq.in <- list.files(inPath, pattern=".preseq", recursive=TRUE, full.names=TRUE)
if (length(preseq.in) > 0){
preseq.info <- lapply(preseq.in, read.table, header=TRUE)
names(preseq.info) <- bioids[basename(sub("preseq","",dirname(preseq.in)))]
d2p <- melt(preseq.info, id.vars=c("total_reads","distinct_reads"))
selcol <- colorRampPalette(brewer.pal(9, "Set1"))(length(preseq.info))
gg <- ggplot(d2p) + geom_point(aes(x=total_reads, y=distinct_reads), color="darkgray", shape=3, size=2) + geom_path(aes(x=total_reads, y=distinct_reads, color=L1)) +
theme_classic() + theme(legend.position="bottom", legend.title=element_blank()) +
labs(x="Total Reads Number", y="Distinct Reads Number") + theme(axis.title = element_text(face="bold", colour="black", size=12)) +
scale_color_manual(values=selcol, guide=guide_legend(nrow=ceiling(length(preseq.info)/6)))
gg
#ggly_b <- plotly_build(gg)
#ggly_b <- inactivate_hover(ggly_b)
#ggly_b <- plotly_margins(ggly_b)
#ggly_b
}else{
warning("Preseq output files not found !")
}
```
```{r, echo=FALSE, results='hide', message=FALSE, eval=FALSE}
## TPM from raw counts: scale by feature length then by scale factor: [count/len]/[sum(count/len)/10^6]
## sum of all TPMs in each sample are the same, ie. easier to compare samples
# ## get gene sizes based on exons > exons per gene id>reduce to non overlapping exons, sum lengths
# txdb <- makeTxDbFromGFF(gtfPath, format="gtf")
# exons.per.gene <- exonsBy(txdb, by="gene")
# exonic.gene.sizes <- sum(width(reduce(exons.per.gene)))
# exonic.gene.sizes <- exonic.gene.sizes[rownames(ge)]
#
# ge.sce<-newSCESet(countData=ge)
# ge.tpm<-calculateTPM(ge.sce, calc_from="counts", effective_length=exonic.gene.sizes)
# ge.tpm<-round(ge.tpm, 2)
#
# ge.tpm<-ge.tpm[apply(ge.tpm, MARGIN=1, function(x) { any(x>0) }),]
#
# df.tpm<-as.data.frame(ge.tpm)
# df.tpm$gene_id<-rownames(df.tpm)
#
# ge.tpm.merge<-merge(geneBed, df.tpm, by="gene_id", all.x=T)
# write.table(ge.tpm.merge, file=paste0(outPath,"prc2integrative_tpm.txt"), quote=F, row.names=T, sep="\t")
## manual TPM calculation
## Calculate reads per kilobase
#rpk <- ge * 10^3/matrix(exonic.gene.sizes, nrow=nrow(ge), ncol=ncol(ge), byrow=FALSE)
## normalize by lib size
#tpm <- rpk * matrix(10^6 / colSums(rpk), nrow=nrow(rpk), ncol=ncol(rpk), byrow=TRUE)
#tpm<-round(tpm,2)
```
## *CPM filter*
```{r,echo=FALSE, fig.width=8, fig.height=4, eval=TRUE}
x<-ge
cpm <- edgeR::cpm(x)
lcpm <- edgeR::cpm(x, log=TRUE)
## raw data
nsamples <- ncol(x)
col <- brewer.pal(nsamples, "Paired")
par(mfrow=c(1,2))
plot(density(lcpm[,1]), col=col[1], lwd=2, ylim=c(0,0.21), las=2,
main="", xlab="")
title(main="Raw data", xlab="Log-cpm")
abline(v=0, lty=3)
for (i in 2:nsamples){
den <- density(lcpm[,i])
lines(den$x, den$y, col=col[i], lwd=2)
}
#legend("topright", samplenames, text.col=col, bty="n")
## filter data
keep.exprs <- rowSums(cpm>1)>=2
x <- x[keep.exprs,]
lcpm <- edgeR::cpm(x, log=TRUE)
plot(density(lcpm[,1]), col=col[1], lwd=2, ylim=c(0,0.21), las=2,
main="", xlab="")
title(main="Filtered data", xlab="Log-cpm")
abline(v=0, lty=3)
for (i in 2:nsamples){
den <- density(lcpm[,i])
lines(den$x, den$y, col=col[i], lwd=2)
}
#legend("topright", samplenames, text.col=col, bty="n")
```
\pagebreak
```{r, echo=FALSE, results='hide', message=FALSE}
## filter lowly expressed genes
#ge<-ge[apply(ge, MARGIN=1, function(x) { any(x>0) }),]
ge.cpm <- edgeR::cpm(ge, log=FALSE)
## check threshold: compute av. log2CPM per gene
a.cpm<-edgeR::aveLogCPM(ge)
#hist(a.cpm, xlab="CPM")
#abline(v=log(0))
keep.exprs <- rowSums(ge.cpm > 0.5)>=2
ge <- ge[keep.exprs,]
## experiment details
names.tmp<-gsub("-1", "", colnames(ge))
names.tmp<-gsub("-2", "", names.tmp)
condition=dput(names.tmp)
exp.all <- data.frame(samples=colnames(ge),
condition=condition)
## rlog
condition<-factor(condition)
dds <- DESeqDataSetFromMatrix(ge, DataFrame(condition), ~condition)
ge.rlog <- rlog(dds, blind=FALSE)
ge.rlog <- assay(ge.rlog)
```
## *Hierarchical clustering*
```{r, echo=FALSE, message=FALSE, warning=FALSE, results="asis"}
plot(hclust(as.dist(1 - cor(ge.rlog, method = "pearson")), method="ward.D2"), label=exp.all$condition)
```
## *PCA by sample*
```{r, echo=FALSE, message=FALSE, warning=FALSE, results="hide"}
resPCA <- PCA(t(ge.rlog), scale.unit=FALSE, graph=FALSE)
label <- dput(rownames(resPCA$ind$coord))
ggplot(as.data.frame(resPCA$ind$coord), aes(x=resPCA$ind$coord[,1],y=resPCA$ind$coord[,2], label=label)) +
geom_point() + xlab(paste("PC1 ", round(resPCA$eig[1,2]),"% of variance", sep="")) +
ylab(paste("PC2 ",round(resPCA$eig[2,2]),"% of variance",sep="")) + geom_text(vjust=1.5) +
geom_hline(yintercept=0, linetype="dashed") +
geom_vline(xintercept=0, linetype="dashed") +
theme(panel.grid.major=element_blank(), panel.grid.minor=element_blank(),
panel.background=element_blank(), axis.line=element_line(colour="black"),
panel.border=element_rect(colour="black", fill=NA, size=1),
plot.title = element_text(size=18, face = "bold"),
axis.text=element_text(size=16, family="sans", colour="black"),
axis.title.x=element_text(size=16, family="sans", colour="black"),
axis.title.y=element_text(size=16, family="sans", colour="black"))
```
\pagebreak
## *Differential Expression*
```{r, echo=FALSE, message=FALSE, warning=FALSE, fig.height=5, fig.width=6}
## fit model to individual covariate
exp.sel<-c("WT", "Ring1AB", "Ring1AB_BAP1")
## limma+voom: input raw data: apply log2CPM norm with voom
exp <- exp.all[exp.all$condition %in% exp.sel,]
ge.filt <- ge[colnames(ge) %in% exp$samples]
dge<-DGEList(counts=ge.filt[,colnames(ge.filt)], genes=rownames(ge.filt), group=exp$condition)
## design matrix for experimental conditions
cond <- factor(exp$condition)
model <- model.matrix(~0+cond)
colnames(model) <- levels(cond)
rownames(model) <- exp$samples
## TMM Normalization
dge <- calcNormFactors(dge, method="TMM")
## limma+voom: convert counts to log2(CPM) to estimate mean-variance relationship
v <- voom(dge, model, plot=FALSE)
vfit <- lmFit(v, model)
## output TMM norm log2CPM
## run all contrasts at once, then specify column of model in topTreat
contr.matrix <- makeContrasts(
Ring1ABvWT = Ring1AB - WT,
Ring1AB_BAP1vRing1AB = Ring1AB_BAP1 - Ring1AB,
Ring1AB_BAP1vWT = Ring1AB_BAP1 - WT,
levels = colnames(model))
## compute coeffs and standard errors for contrast
vfit.c <- contrasts.fit(vfit, contrasts=contr.matrix)
efit <- eBayes(vfit.c)
## sig DE genes
dt<-decideTests(efit, adjust.method = "BH", p.value = 0.05)
dt.1<-dt
#####
## Top DE
tt <- topTreat(efit, n=Inf, adjust.method="BH", p.value=0.05, coef=1)
## MD plot
tt.all <- topTreat(efit, n=Inf, adjust.method="BH", coef=1)
#hist(tt.all$P.Value)
df.fit<-data.frame(tt.all$genes, tt.all$logFC, tt.all$P.Value, tt.all$adj.P.Val, tt.all$AveExpr)
names(df.fit)<-c("gene_id", "logFC", "pval", "padj", "AveExpr")
## add gene names
df.names<-merge(geneBed, df.fit, by="gene_id", all.y=TRUE)
## output top-ranked genes from lmfit
contrast<-colnames(contr.matrix)[1]
write.table(df.names, file=paste0(outPath, contrast, ".txt"), quote=F, row.names=F, sep="\t")
df.fit.sig <- plyr::mutate(df.fit, sig=ifelse(tt.all$adj.P.Val<0.05, "sig", "not"))
p<-ggplot()
p<-p+geom_point(data=df.fit.sig[df.fit.sig$sig=="not",], aes(AveExpr,logFC,color=sig), size=1)
p<-p+geom_point(data=df.fit.sig[df.fit.sig$sig=="sig",], aes(AveExpr,logFC,color=sig), size=1)
## avg expression (log2CPM) v. fold change (logFC)
md<-p + geom_hline(yintercept=0, linetype="dashed") +
scale_color_manual(values=c("grey30", "firebrick"), guide=FALSE) +
ggtitle(contrast) +
#coord_cartesian(xlim=c(-5,13), ylim=c(-10,10)) +
xlab("avg logCPM") + ylab("logFC") +
theme(panel.grid.major=element_blank(), panel.grid.minor=element_blank(),
panel.background=element_blank(), axis.line=element_line(colour="black"),
panel.border=element_rect(colour="black", fill=NA, size=1),
plot.title = element_text(size=18, face = "bold"),
axis.text=element_text(size=16, family="sans", colour="black"),
axis.title.x=element_text(size=16, family="sans", colour="black"),
axis.title.y=element_text(size=16, family="sans", colour="black"))
md
```
\pagebreak
```{r, echo=FALSE, message=FALSE, warning=FALSE, fig.height=5, fig.width=6}
## Top DE
tt <- topTreat(efit, n=Inf, adjust.method="BH", p.value=0.05, coef=2)
## MD plot
tt.all <- topTreat(efit, n=Inf, adjust.method="BH", coef=2)
#hist(tt.all$P.Value)
df.fit<-data.frame(tt.all$genes, tt.all$logFC, tt.all$P.Value, tt.all$adj.P.Val, tt.all$AveExpr)
names(df.fit)<-c("gene_id", "logFC", "pval", "padj", "AveExpr")
## add gene names
df.names<-merge(geneBed, df.fit, by="gene_id", all.y=TRUE)
## output top-ranked genes from lmfit
contrast<-colnames(contr.matrix)[2]
write.table(df.names, file=paste0(outPath, contrast, ".txt"), quote=F, row.names=F, sep="\t")
df.fit.sig <- plyr::mutate(df.fit, sig=ifelse(tt.all$adj.P.Val<0.05, "sig", "not"))
p<-ggplot()
p<-p+geom_point(data=df.fit.sig[df.fit.sig$sig=="not",], aes(AveExpr,logFC,color=sig), size=1)
p<-p+geom_point(data=df.fit.sig[df.fit.sig$sig=="sig",], aes(AveExpr,logFC,color=sig), size=1)
## avg expression (log2CPM) v. fold change (logFC)
md<-p + geom_hline(yintercept=0, linetype="dashed") +
scale_color_manual(values=c("grey30", "firebrick"), guide=FALSE) +
ggtitle(contrast) +
#coord_cartesian(xlim=c(-5,13), ylim=c(-10,10)) +
xlab("avg logCPM") + ylab("logFC") +
theme(panel.grid.major=element_blank(), panel.grid.minor=element_blank(),
panel.background=element_blank(), axis.line=element_line(colour="black"),
panel.border=element_rect(colour="black", fill=NA, size=1),
plot.title = element_text(size=18, face = "bold"),
axis.text=element_text(size=16, family="sans", colour="black"),
axis.title.x=element_text(size=16, family="sans", colour="black"),
axis.title.y=element_text(size=16, family="sans", colour="black"))
md
#+ geom_text_repel(data=head(df.fit.sig, 1), aes(label=gene_id))
```
\pagebreak
```{r, echo=FALSE, message=FALSE, warning=FALSE, fig.height=5, fig.width=6}
## Top DE
tt <- topTreat(efit, n=Inf, adjust.method="BH", p.value=0.05, coef=3)
## MD plot
tt.all <- topTreat(efit, n=Inf, adjust.method="BH", coef=3)
#hist(tt.all$P.Value)
df.fit<-data.frame(tt.all$genes, tt.all$logFC, tt.all$P.Value, tt.all$adj.P.Val, tt.all$AveExpr)
names(df.fit)<-c("gene_id", "logFC", "pval", "padj", "AveExpr")
## add gene names
df.names<-merge(geneBed, df.fit, by="gene_id", all.y=TRUE)
## output top-ranked genes from lmfit
contrast<-colnames(contr.matrix)[3]
write.table(df.names, file=paste0(outPath, contrast, ".txt"), quote=F, row.names=F, sep="\t")
df.fit.sig <- plyr::mutate(df.fit, sig=ifelse(tt.all$adj.P.Val<0.05, "sig", "not"))
p<-ggplot()
p<-p+geom_point(data=df.fit.sig[df.fit.sig$sig=="not",], aes(AveExpr,logFC,color=sig), size=1)
p<-p+geom_point(data=df.fit.sig[df.fit.sig$sig=="sig",], aes(AveExpr,logFC,color=sig), size=1)
## avg expression (log2CPM) v. fold change (logFC)
md<-p + geom_hline(yintercept=0, linetype="dashed") +
scale_color_manual(values=c("grey30", "firebrick"), guide=FALSE) +
ggtitle(contrast) +
#coord_cartesian(xlim=c(-5,13), ylim=c(-10,10)) +
xlab("avg logCPM") + ylab("logFC") +
theme(panel.grid.major=element_blank(), panel.grid.minor=element_blank(),
panel.background=element_blank(), axis.line=element_line(colour="black"),
panel.border=element_rect(colour="black", fill=NA, size=1),
plot.title = element_text(size=18, face = "bold"),
axis.text=element_text(size=16, family="sans", colour="black"),
axis.title.x=element_text(size=16, family="sans", colour="black"),
axis.title.y=element_text(size=16, family="sans", colour="black"))
md
```
\pagebreak
## *DEA 2*
```{r, echo=FALSE, message=FALSE, warning=FALSE, fig.height=5, fig.width=6}
exp.sel<-c("WT", "Ring1A", "Ring1B", "Ring1AB")
## limma+voom: input raw data: apply log2CPM norm with voom
exp <- exp.all[exp.all$condition %in% exp.sel,]
ge.filt <- ge[colnames(ge) %in% exp$samples]
dge<-DGEList(counts=ge.filt[,colnames(ge.filt)], genes=rownames(ge.filt), group=exp$condition)
## design matrix for experimental conditions
cond <- factor(exp$condition)
model <- model.matrix(~0+cond)
colnames(model) <- levels(cond)
rownames(model) <- exp$samples
## TMM Normalization
dge <- calcNormFactors(dge, method="TMM")
## limma+voom: convert counts to log2(CPM) to estimate mean-variance relationship
v <- voom(dge, model, plot=FALSE)
vfit <- lmFit(v, model)
## run all contrasts at once, then specify contrast
contr.matrix <- makeContrasts(
Ring1AvRing1B = Ring1A - Ring1B,
Ring1AvRing1AB = Ring1A - Ring1AB,
Ring1BvRing1AB = Ring1B - Ring1AB,
Ring1AvWT = Ring1A - WT,
Ring1BvWT = Ring1B - WT,
Ring1ABvWT = Ring1AB - WT,
levels = colnames(model))
## compute coeffs and standard errors for contrast
vfit.c <- contrasts.fit(vfit, contrasts=contr.matrix)
efit <- eBayes(vfit.c)
## sig DE genes: contrast 1
dt<-decideTests(efit, adjust.method = "BH", p.value = 0.05)
dt.2<-dt
## Top DE
tt <- topTreat(efit, n=Inf, adjust.method="BH", p.value=0.05, coef=1)
## MD plot
tt.all <- topTreat(efit, n=Inf, adjust.method="BH", coef=1)
#hist(tt.all$P.Value)
df.fit<-data.frame(tt.all$genes, tt.all$logFC, tt.all$P.Value, tt.all$adj.P.Val, tt.all$AveExpr)
names(df.fit)<-c("gene_id", "logFC", "pval", "padj", "AveExpr")
## add gene names
df.names<-merge(geneBed, df.fit, by="gene_id", all.y=TRUE)
## output top-ranked genes from lmfit
contrast<-colnames(contr.matrix)[1]
write.table(df.names, file=paste0(outPath, contrast, ".txt"), quote=F, row.names=F, sep="\t")
df.fit.sig <- plyr::mutate(df.fit, sig=ifelse(tt.all$adj.P.Val<0.05, "sig", "not"))
p<-ggplot()
p<-p+geom_point(data=df.fit.sig[df.fit.sig$sig=="not",], aes(AveExpr,logFC,color=sig), size=1)
p<-p+geom_point(data=df.fit.sig[df.fit.sig$sig=="sig",], aes(AveExpr,logFC,color=sig), size=1)
## avg expression (log2CPM) v. fold change (logFC)
md<-p+geom_hline(yintercept=0, linetype="dashed") +
scale_color_manual(values=c("grey30", "firebrick"), guide=FALSE) +
ggtitle(contrast) +
#coord_cartesian(xlim=c(-5,13), ylim=c(-10,10)) +
xlab("avg logCPM") + ylab("logFC") +
theme(panel.grid.major=element_blank(), panel.grid.minor=element_blank(),
panel.background=element_blank(), axis.line=element_line(colour="black"),
panel.border=element_rect(colour="black", fill=NA, size=1),
plot.title = element_text(size=18, face = "bold"),
axis.text=element_text(size=16, family="sans", colour="black"),
axis.title.x=element_text(size=16, family="sans", colour="black"),
axis.title.y=element_text(size=16, family="sans", colour="black"))
md
```
\pagebreak
```{r, echo=FALSE, message=FALSE, warning=FALSE, fig.height=5, fig.width=6}
tt <- topTreat(efit, n=Inf, adjust.method="BH", p.value=0.05, coef=2)
## MD plot
tt.all <- topTreat(efit, n=Inf, adjust.method="BH", coef=2)
#hist(tt.all$P.Value)
df.fit<-data.frame(tt.all$genes, tt.all$logFC, tt.all$P.Value, tt.all$adj.P.Val, tt.all$AveExpr)
names(df.fit)<-c("gene_id", "logFC", "pval", "padj", "AveExpr")
## add gene names
df.names<-merge(geneBed, df.fit, by="gene_id", all.y=TRUE)
## output top-ranked genes from lmfit
contrast<-colnames(contr.matrix)[2]
write.table(df.names, file=paste0(outPath, contrast, ".txt"), quote=F, row.names=F, sep="\t")
df.fit.sig <- plyr::mutate(df.fit, sig=ifelse(tt.all$adj.P.Val<0.05, "sig", "not"))
p<-ggplot()
p<-p+geom_point(data=df.fit.sig[df.fit.sig$sig=="not",], aes(AveExpr,logFC,color=sig), size=1)
p<-p+geom_point(data=df.fit.sig[df.fit.sig$sig=="sig",], aes(AveExpr,logFC,color=sig), size=1)
## avg expression (log2CPM) v. fold change (logFC)
md<-p + geom_hline(yintercept=0, linetype="dashed") +
scale_color_manual(values=c("grey30", "firebrick"), guide=FALSE) +
ggtitle(contrast) +
#coord_cartesian(xlim=c(-5,13), ylim=c(-10,10)) +
xlab("avg logCPM") + ylab("logFC") +
theme(panel.grid.major=element_blank(), panel.grid.minor=element_blank(),
panel.background=element_blank(), axis.line=element_line(colour="black"),
panel.border=element_rect(colour="black", fill=NA, size=1),
plot.title = element_text(size=18, face = "bold"),
axis.text=element_text(size=16, family="sans", colour="black"),
axis.title.x=element_text(size=16, family="sans", colour="black"),
axis.title.y=element_text(size=16, family="sans", colour="black"))
md
```
\pagebreak
```{r, echo=FALSE, message=FALSE, warning=FALSE, fig.height=5, fig.width=6}
tt <- topTreat(efit, n=Inf, adjust.method="BH", p.value=0.05, coef=3)
## MD plot
tt.all <- topTreat(efit, n=Inf, adjust.method="BH", coef=3)
#hist(tt.all$P.Value)
df.fit<-data.frame(tt.all$genes, tt.all$logFC, tt.all$P.Value, tt.all$adj.P.Val, tt.all$AveExpr)
names(df.fit)<-c("gene_id", "logFC", "pval", "padj", "AveExpr")
## add gene names
df.names<-merge(geneBed, df.fit, by="gene_id", all.y=TRUE)
## output top-ranked genes from lmfit
contrast<-colnames(contr.matrix)[3]
write.table(df.names, file=paste0(outPath, contrast, ".txt"), quote=F, row.names=F, sep="\t")
df.fit.sig <- plyr::mutate(df.fit, sig=ifelse(tt.all$adj.P.Val<0.05, "sig", "not"))
p<-ggplot()
p<-p+geom_point(data=df.fit.sig[df.fit.sig$sig=="not",], aes(AveExpr,logFC,color=sig), size=1)
p<-p+geom_point(data=df.fit.sig[df.fit.sig$sig=="sig",], aes(AveExpr,logFC,color=sig), size=1)
## avg expression (log2CPM) v. fold change (logFC)
md<-p + geom_hline(yintercept=0, linetype="dashed") +
scale_color_manual(values=c("grey30", "firebrick"), guide=FALSE) +
ggtitle(contrast) +
#coord_cartesian(xlim=c(-5,13), ylim=c(-10,10)) +
xlab("avg logCPM") + ylab("logFC") +
theme(panel.grid.major=element_blank(), panel.grid.minor=element_blank(),
panel.background=element_blank(), axis.line=element_line(colour="black"),
panel.border=element_rect(colour="black", fill=NA, size=1),
plot.title = element_text(size=18, face = "bold"),
axis.text=element_text(size=16, family="sans", colour="black"),
axis.title.x=element_text(size=16, family="sans", colour="black"),
axis.title.y=element_text(size=16, family="sans", colour="black"))
md
```
\pagebreak
```{r session, results="asis"}
si<-sessionInfo()
print(si)
```
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment