/RNASeq.R

## RNASeq.R
args <- commandArgs(TRUE)
## inputs
countsTsv <- args[1] #"countsData.txt"
designTsv <- args[2] # "design.txt"
organism <- args[3] #"Fly"                        # one of names(org), below
minimumCountsPerMillion <- as.numeric(args[4]) #1L
minimumSamplesPerTranscript <- as.numeric(args[5]) #1L
dispersion <- args[6] #"Tagwise"                  # one of Tagwise, Common
pdf_outfile <- args[7]
csv_outfile <- args[8]

## set-up
org <-
    setNames(c("org.Ag.eg.db", "org.At.tair.db", "org.Bt.eg.db",
               "org.Ce.eg.db", "org.Cf.eg.db", "org.Dm.eg.db",
               "org.Dr.eg.db", "org.Gg.eg.db", "org.Hs.eg.db",
               "org.Mm.eg.db", "org.Mmu.eg.db", "org.Pf.plasmo.db",
               "org.Pt.eg.db", "org.Rn.eg.db", "org.Sc.sgd.db",
               "org.Sco.eg.db", "org.Ss.eg.db", "org.Xl.eg.db"),
             c("Anopheles", "Arabidopsis", "Bovine", "Worm", "Canine",
               "Fly", "Zebrafish", "Chicken", "Human", "Mouse",
               "Rhesus", "Malaria", "Chimp", "Rat", "Yeast",
               "Streptomyces coelicolor", "Pig",
               "Xenopus"))
if (!organism %in% names(org))
    stop("unknown organism: ", organism)
outputs <- new.env(parent=emptyenv())

library(edgeR)
library(goseq)
require(org[organism], character.only=TRUE)

## data iput
counts <- tryCatch({
    as.matrix(read.delim(countsTsv, row.names=1))
}, error=function(err) {
    stop("failed to read count data: ", conditionMessage(err))
})

design <- tryCatch({
    csv <- read.delim(designTsv, row.names=1)
    if (!all(c("SampleId", "Group") %in% names(csv)))
        stop("'SampleId' and 'Group' columns must be present in 'design' csv")
    csv
}, error=function(err) {
    stop("failed to read experimental design data: ",
         conditionMessage(err))
})
map <- with(design, setNames(Group, SampleId))

group <- tryCatch({
    if (!all(names(map) %in% colnames(counts)))
        stop("not all 'SampleId' of 'design' in column names of 'counts'")
    if (!all(colnames(counts) %in% names(map)))
        stop("not all column names of 'counts' in 'SampleId'  of 'design'")
    if (2L != length(unique(map)))
        stop("'Group' of 'design' must have exactly two levels")
    map[colnames(counts)]
}, error=function(err) {
    stop("failed to identify groups: ", conditionMessage(err))
})

## pre-processing
library(edgeR)
dge1 <- tryCatch({
    dge0 <- DGEList(counts, group=group)
    calcNormFactors(dge0)
}, error=function(err) {
    stop("failed to create 'DGEList' data structure: ",
         conditionMessage(err))
})
dge <- tryCatch({
    m <- sweep(dge1$counts, 2, 1e6 / dge1$samples$lib.size, '*')
    ridx <- rowSums(m > minimumCountsPerMillion) > minimumSamplesPerTranscript
    dge1[ridx,]
}, error=function(err) {
    stop("failed to pre-filter transcripts: ",
         conditionMessage(err))
})

outputs[["RetainedAfterPreprocessing"]] <- table(ridx)
outputs[["MDSPlot"]] <- plotMDS.DGEList(dge)


## dispersion
model <- tryCatch({
    with(design, model.matrix( ~ Group))
}, error=function(err) {
    stop("failed to create model matrix: ",
         conditionMessage(err))
})
dge <- tryCatch({
    switch(dispersion,
           Tagwise=estimateTagwiseDisp(dge),
           Common=estimateCommonDisp(dge, model))
}, error=function(err) {
    stop("failed to calculate dispersions: ",
         conditionMessage(err))
})
outputs[["DispersionSD"]] <- sqrt(dge$common.dispersion)

## fit, likelihood ratio test, top-table
tryCatch({
    fit <- glmFit(dge, model, dispersion=dge$common.dispersion)
    lrTest <- glmLRT(dge, fit, coef=2)
    tt <- topTags(lrTest, Inf)
}, error=function(err) {
    stop("failed to fit model: ", conditionMessage(err))
})

## summary
topTable <- tryCatch({
    grpMeans <- sapply(levels(group), function(grp, counts, map) {
        cidx <- names(map)[map %in% grp]
        x <- counts[,cidx]
        rowMeans(counts[,cidx])
    }, counts, map)
    colnames(grpMeans) <- sprintf("%sMeanCount", colnames(grpMeans))
    ridx <- rownames(tt$table)
    cbind(tt$table, grpMeans[ridx,])
}, error=function(err) {
    stop("failed to annotate results: ", conditionMessage(err))
})
outputs[["TopTable"]] <-topTable[order(topTable$FDR),]
pdf(pdf_outfile); plot(outputs[["MDSPlot"]]); dev.off()
write.csv(outputs$TopTable, file=csv_outfile)
	args <- commandArgs(TRUE)
	## inputs
	countsTsv <- args[1] #"countsData.txt"
	designTsv <- args[2] # "design.txt"
	organism <- args[3] #"Fly" # one of names(org), below
	minimumCountsPerMillion <- as.numeric(args[4]) #1L
	minimumSamplesPerTranscript <- as.numeric(args[5]) #1L
	dispersion <- args[6] #"Tagwise" # one of Tagwise, Common
	pdf_outfile <- args[7]
	csv_outfile <- args[8]

	## set-up
	org <-
	setNames(c("org.Ag.eg.db", "org.At.tair.db", "org.Bt.eg.db",
	"org.Ce.eg.db", "org.Cf.eg.db", "org.Dm.eg.db",
	"org.Dr.eg.db", "org.Gg.eg.db", "org.Hs.eg.db",
	"org.Mm.eg.db", "org.Mmu.eg.db", "org.Pf.plasmo.db",
	"org.Pt.eg.db", "org.Rn.eg.db", "org.Sc.sgd.db",
	"org.Sco.eg.db", "org.Ss.eg.db", "org.Xl.eg.db"),
	c("Anopheles", "Arabidopsis", "Bovine", "Worm", "Canine",
	"Fly", "Zebrafish", "Chicken", "Human", "Mouse",
	"Rhesus", "Malaria", "Chimp", "Rat", "Yeast",
	"Streptomyces coelicolor", "Pig",
	"Xenopus"))
	if (!organism %in% names(org))
	stop("unknown organism: ", organism)
	outputs <- new.env(parent=emptyenv())

	library(edgeR)
	library(goseq)
	require(org[organism], character.only=TRUE)

	## data iput
	counts <- tryCatch({
	as.matrix(read.delim(countsTsv, row.names=1))
	}, error=function(err) {
	stop("failed to read count data: ", conditionMessage(err))
	})

	design <- tryCatch({
	csv <- read.delim(designTsv, row.names=1)
	if (!all(c("SampleId", "Group") %in% names(csv)))
	stop("'SampleId' and 'Group' columns must be present in 'design' csv")
	csv
	}, error=function(err) {
	stop("failed to read experimental design data: ",
	conditionMessage(err))
	})
	map <- with(design, setNames(Group, SampleId))

	group <- tryCatch({
	if (!all(names(map) %in% colnames(counts)))
	stop("not all 'SampleId' of 'design' in column names of 'counts'")
	if (!all(colnames(counts) %in% names(map)))
	stop("not all column names of 'counts' in 'SampleId' of 'design'")
	if (2L != length(unique(map)))
	stop("'Group' of 'design' must have exactly two levels")
	map[colnames(counts)]
	}, error=function(err) {
	stop("failed to identify groups: ", conditionMessage(err))
	})

	## pre-processing
	library(edgeR)
	dge1 <- tryCatch({
	dge0 <- DGEList(counts, group=group)
	calcNormFactors(dge0)
	}, error=function(err) {
	stop("failed to create 'DGEList' data structure: ",
	conditionMessage(err))
	})
	dge <- tryCatch({
	m <- sweep(dge1$counts, 2, 1e6 / dge1$samples$lib.size, '*')
	ridx <- rowSums(m > minimumCountsPerMillion) > minimumSamplesPerTranscript
	dge1[ridx,]
	}, error=function(err) {
	stop("failed to pre-filter transcripts: ",
	conditionMessage(err))
	})

	outputs[["RetainedAfterPreprocessing"]] <- table(ridx)
	outputs[["MDSPlot"]] <- plotMDS.DGEList(dge)


	## dispersion
	model <- tryCatch({
	with(design, model.matrix( ~ Group))
	}, error=function(err) {
	stop("failed to create model matrix: ",
	conditionMessage(err))
	})
	dge <- tryCatch({
	switch(dispersion,
	Tagwise=estimateTagwiseDisp(dge),
	Common=estimateCommonDisp(dge, model))
	}, error=function(err) {
	stop("failed to calculate dispersions: ",
	conditionMessage(err))
	})
	outputs[["DispersionSD"]] <- sqrt(dge$common.dispersion)

	## fit, likelihood ratio test, top-table
	tryCatch({
	fit <- glmFit(dge, model, dispersion=dge$common.dispersion)
	lrTest <- glmLRT(dge, fit, coef=2)
	tt <- topTags(lrTest, Inf)
	}, error=function(err) {
	stop("failed to fit model: ", conditionMessage(err))
	})

	## summary
	topTable <- tryCatch({
	grpMeans <- sapply(levels(group), function(grp, counts, map) {
	cidx <- names(map)[map %in% grp]
	x <- counts[,cidx]
	rowMeans(counts[,cidx])
	}, counts, map)
	colnames(grpMeans) <- sprintf("%sMeanCount", colnames(grpMeans))
	ridx <- rownames(tt$table)
	cbind(tt$table, grpMeans[ridx,])
	}, error=function(err) {
	stop("failed to annotate results: ", conditionMessage(err))
	})
	outputs[["TopTable"]] <-topTable[order(topTable$FDR),]
	pdf(pdf_outfile); plot(outputs[["MDSPlot"]]); dev.off()
	write.csv(outputs$TopTable, file=csv_outfile)