mschubert/diff_expr.r

## diff_expr.r
#
# loading the table, converting it to an expression matrix
#
df = read.table(
    "GBM.rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes__data.data.txt",
    header=TRUE, sep="\t", stringsAsFactors=FALSE)
expr = data.matrix(df[2:nrow(df), df[1,]=="raw_count"])
rownames(expr) = df[2:nrow(df), 1]

#
# filtering the row- and column names to what we want
#
colnames(expr) = substr(colnames(expr), 15, 15)
rownames(expr) = sub("\\|.*$", "", rownames(expr))
expr = expr[rownames(expr) != "?",]

#
# transform the raw read counts using voom
#  -- with this, we can use standard linear modeling tools
#
# if you start with a .bam file instead of the TCGA text file,
# there are more specific tools to get differential expression
#  -- see DESeq2, edgeR packages
#
library(limma)
vexpr = voom(expr)$E
axpr = avereps(vexpr) # average if multiple measurements on same gene

#
# use lmFit to compute differential expression
#
design = model.matrix(~ 0 + as.factor(colnames(axpr)))
colnames(design) = c("primary", "recurrent")
fit.1 = lmFit(axpr, design)
contrast = makeContrasts("primary-recurrent", levels=design)
fit.2 = contrasts.fit(fit.1, contrast)
fit.3 = eBayes(fit.2)

#
# make a summary table ordered by p-value
#
result = data.frame(
    gene = rownames(fit.3),
    p.value = fit.3$p.value[,1],
    fold_change = fit.3$coefficients[,1],
    stringsAsFactors = FALSE
)
result$adj.p = p.adjust(result$p.value, method="fdr")
result = result[order(result$p.value),]
cat(result$gene[result$adj.p < 0.01], sep="\n") # print all genes w/ FDR<1%
	#
	# loading the table, converting it to an expression matrix
	#
	df = read.table(
	"GBM.rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes__data.data.txt",
	header=TRUE, sep="\t", stringsAsFactors=FALSE)
	expr = data.matrix(df[2:nrow(df), df[1,]=="raw_count"])
	rownames(expr) = df[2:nrow(df), 1]

	#
	# filtering the row- and column names to what we want
	#
	colnames(expr) = substr(colnames(expr), 15, 15)
	rownames(expr) = sub("\\\|.*$", "", rownames(expr))
	expr = expr[rownames(expr) != "?",]

	#
	# transform the raw read counts using voom
	# -- with this, we can use standard linear modeling tools
	#
	# if you start with a .bam file instead of the TCGA text file,
	# there are more specific tools to get differential expression
	# -- see DESeq2, edgeR packages
	#
	library(limma)
	vexpr = voom(expr)$E
	axpr = avereps(vexpr) # average if multiple measurements on same gene

	#
	# use lmFit to compute differential expression
	#
	design = model.matrix(~ 0 + as.factor(colnames(axpr)))
	colnames(design) = c("primary", "recurrent")
	fit.1 = lmFit(axpr, design)
	contrast = makeContrasts("primary-recurrent", levels=design)
	fit.2 = contrasts.fit(fit.1, contrast)
	fit.3 = eBayes(fit.2)

	#
	# make a summary table ordered by p-value
	#
	result = data.frame(
	gene = rownames(fit.3),
	p.value = fit.3$p.value[,1],
	fold_change = fit.3$coefficients[,1],
	stringsAsFactors = FALSE
	)
	result$adj.p = p.adjust(result$p.value, method="fdr")
	result = result[order(result$p.value),]
	cat(result$gene[result$adj.p < 0.01], sep="\n") # print all genes w/ FDR<1%