sjackman/stat540-seminar8.R

## stat540-seminar8.R
# STAT 540 Seminar 8
# Shaun Jackman

# edgeR
library(edgeR)
dat <- read.table("../data/bottomly/bottomly_count_table.txt",
	header = TRUE, row.names = 1)
des <- read.table("../data/bottomly/bottomly_phenodata.txt",
	header = TRUE, row.names = 1)
all(rownames(des) == colnames(dat))

# GLM edgeR
table(des$strain)
group <- factor(c(rep("1", 10), rep(2, 11)))
dge.glm <- DGEList(counts = dat, group = group)
design <- model.matrix(~group)
dge.glm.com.disp <- estimateGLMCommonDisp(dge.glm, design, verbose = TRUE)
dge.glm.trend.disp <- estimateGLMTrendedDisp(dge.glm.com.disp)
dge.glm.tag.disp <- estimateGLMTagwiseDisp(dge.glm.trend.disp, design)
plotBCV(dge.glm.tag.disp)
fit <- glmFit(dge.glm.tag.disp, design)
lrt <- glmLRT(fit, coef = 2)
topTags(lrt)
tt.glm <- topTags(lrt, n = Inf)
nrow(tt.glm$table[tt.glm$table$FDR < 0.01, ])
interestingSamples <- rownames(tt.glm$table[tt.glm$table$FDR < 1e-50, ])
cpm(dge.glm.tag.disp)[interestingSamples, ]
summary(de.glm <- decideTestsDGE(lrt, p = 0.05, adjust = "BH"))
tags.glm <- rownames(dge.glm.tag.disp)[as.logical(de.glm)]
plotSmear(lrt, de.tags = tags.glm)
abline(h = c(-2, 2), col = "blue")
# Mini exercise:
# Redo the above analysis but first filter the data and remove any gene that
# has: 1. count equal tot zero across all samples 2. count equal to zero in at
# least one sample in each genotype group
do.edgeR <- function(dat) {
	dge.glm <- DGEList(counts = dat, group = group)
	design <- model.matrix(~group)
	dge.glm.com.disp <- estimateGLMCommonDisp(dge.glm, design, verbose = TRUE)
	dge.glm.trend.disp <- estimateGLMTrendedDisp(dge.glm.com.disp)
	dge.glm.tag.disp <- estimateGLMTagwiseDisp(dge.glm.trend.disp, design)
	plotBCV(dge.glm.tag.disp)
	fit <- glmFit(dge.glm.tag.disp, design)
	lrt <- glmLRT(fit, coef = 2)
	cat('Top genes:')
	print(topTags(lrt))
	tt.glm <- topTags(lrt, n = Inf)
	cat('Number of genes with FDR < 0.01:')
	print(sum(tt.glm$table$FDR < 0.01))
	interestingSamples <- rownames(tt.glm$table[tt.glm$table$FDR < 1e-50, ])
	cat('Interesting genes:')
	print(cpm(dge.glm.tag.disp)[interestingSamples, ])
	cat('Interesting genes with BH FDR < 0.05:')
	print(table(de.glm <- decideTestsDGE(lrt, p = 0.05, adjust = "BH")))
	tags.glm <- rownames(dge.glm.tag.disp)[as.logical(de.glm)]
	plotSmear(lrt, de.tags = tags.glm)
	abline(h = c(-2, 2), col = "blue")
	de.glm
}
edgeR.hits <- do.edgeR(subset(dat, apply(dat, 1, function(x) !all(x == 0))))
edgeR.hits <- do.edgeR(subset(dat, apply(dat, 1, function(x) !any(x == 0))))

# DESeq
library(DESeq)
deSeqDat <- newCountDataSet(dat, group)
deSeqDat <- estimateSizeFactors(deSeqDat)
deSeqDat <- estimateDispersions(deSeqDat)
plotDispEsts(deSeqDat)
deseq.results <- nbinomTest(deSeqDat, levels(group)[1], levels(group)[2])
plotMA(deseq.results)

# Voom & limma
library(limma)
dat.voomed <- voom(dat, design)
fit <- eBayes(lmFit(dat.voomed, design))

# Take Home Problem
# Choose a specific threshold for the adjusted p value, find the genes
# identified as differentially expressed using each of edgeR, DESeq and
# voom+limma. Compare the number of genes in these 3 lists, and draw a venn
# digram demonstrating the overlap (if any!).
edgeR.hits <- do.edgeR(dat)
table(deseq.results$padj < 0.05)
voom.hits <- topTable(fit, coef='group2', num=Inf, sort.by='none')
table(voom.hits$adj.P.Val < 0.05)
results <- data.frame(
	edgeR = edgeR.hits != 0,
	DESeq = deseq.results$padj < 0.05,
	voom.limma = voom.hits$adj.P.Val < 0.05)
table(results)
vennDiagram(results)
	# STAT 540 Seminar 8
	# Shaun Jackman

	# edgeR
	library(edgeR)
	dat <- read.table("../data/bottomly/bottomly_count_table.txt",
	header = TRUE, row.names = 1)
	des <- read.table("../data/bottomly/bottomly_phenodata.txt",
	header = TRUE, row.names = 1)
	all(rownames(des) == colnames(dat))

	# GLM edgeR
	table(des$strain)
	group <- factor(c(rep("1", 10), rep(2, 11)))
	dge.glm <- DGEList(counts = dat, group = group)
	design <- model.matrix(~group)
	dge.glm.com.disp <- estimateGLMCommonDisp(dge.glm, design, verbose = TRUE)
	dge.glm.trend.disp <- estimateGLMTrendedDisp(dge.glm.com.disp)
	dge.glm.tag.disp <- estimateGLMTagwiseDisp(dge.glm.trend.disp, design)
	plotBCV(dge.glm.tag.disp)
	fit <- glmFit(dge.glm.tag.disp, design)
	lrt <- glmLRT(fit, coef = 2)
	topTags(lrt)
	tt.glm <- topTags(lrt, n = Inf)
	nrow(tt.glm$table[tt.glm$table$FDR < 0.01, ])
	interestingSamples <- rownames(tt.glm$table[tt.glm$table$FDR < 1e-50, ])
	cpm(dge.glm.tag.disp)[interestingSamples, ]
	summary(de.glm <- decideTestsDGE(lrt, p = 0.05, adjust = "BH"))
	tags.glm <- rownames(dge.glm.tag.disp)[as.logical(de.glm)]
	plotSmear(lrt, de.tags = tags.glm)
	abline(h = c(-2, 2), col = "blue")
	# Mini exercise:
	# Redo the above analysis but first filter the data and remove any gene that
	# has: 1. count equal tot zero across all samples 2. count equal to zero in at
	# least one sample in each genotype group
	do.edgeR <- function(dat) {
	dge.glm <- DGEList(counts = dat, group = group)
	design <- model.matrix(~group)
	dge.glm.com.disp <- estimateGLMCommonDisp(dge.glm, design, verbose = TRUE)
	dge.glm.trend.disp <- estimateGLMTrendedDisp(dge.glm.com.disp)
	dge.glm.tag.disp <- estimateGLMTagwiseDisp(dge.glm.trend.disp, design)
	plotBCV(dge.glm.tag.disp)
	fit <- glmFit(dge.glm.tag.disp, design)
	lrt <- glmLRT(fit, coef = 2)
	cat('Top genes:')
	print(topTags(lrt))
	tt.glm <- topTags(lrt, n = Inf)
	cat('Number of genes with FDR < 0.01:')
	print(sum(tt.glm$table$FDR < 0.01))
	interestingSamples <- rownames(tt.glm$table[tt.glm$table$FDR < 1e-50, ])
	cat('Interesting genes:')
	print(cpm(dge.glm.tag.disp)[interestingSamples, ])
	cat('Interesting genes with BH FDR < 0.05:')
	print(table(de.glm <- decideTestsDGE(lrt, p = 0.05, adjust = "BH")))
	tags.glm <- rownames(dge.glm.tag.disp)[as.logical(de.glm)]
	plotSmear(lrt, de.tags = tags.glm)
	abline(h = c(-2, 2), col = "blue")
	de.glm
	}
	edgeR.hits <- do.edgeR(subset(dat, apply(dat, 1, function(x) !all(x == 0))))
	edgeR.hits <- do.edgeR(subset(dat, apply(dat, 1, function(x) !any(x == 0))))

	# DESeq
	library(DESeq)
	deSeqDat <- newCountDataSet(dat, group)
	deSeqDat <- estimateSizeFactors(deSeqDat)
	deSeqDat <- estimateDispersions(deSeqDat)
	plotDispEsts(deSeqDat)
	deseq.results <- nbinomTest(deSeqDat, levels(group)[1], levels(group)[2])
	plotMA(deseq.results)

	# Voom & limma
	library(limma)
	dat.voomed <- voom(dat, design)
	fit <- eBayes(lmFit(dat.voomed, design))

	# Take Home Problem
	# Choose a specific threshold for the adjusted p value, find the genes
	# identified as differentially expressed using each of edgeR, DESeq and
	# voom+limma. Compare the number of genes in these 3 lists, and draw a venn
	# digram demonstrating the overlap (if any!).
	edgeR.hits <- do.edgeR(dat)
	table(deseq.results$padj < 0.05)
	voom.hits <- topTable(fit, coef='group2', num=Inf, sort.by='none')
	table(voom.hits$adj.P.Val < 0.05)
	results <- data.frame(
	edgeR = edgeR.hits != 0,
	DESeq = deseq.results$padj < 0.05,
	voom.limma = voom.hits$adj.P.Val < 0.05)
	table(results)
	vennDiagram(results)