arraytools/TPM_rsem_tximport_DESeq2.R

## TPM_rsem_tximport_DESeq2.R
# This is a note about import rsem-generated file for DESeq2 package

# As described by the tximport's vignette, the method below uses the gene-level estimated counts from the quantification tools, and additionally to use the transcript-level abundance estimates to calculate a gene-level offset that corrects for changes to the average transcript length across samples.
# This approach corrects for potential changes in gene length across samples (e.g. from differential isoform usage)

# References:
# 1. http://bioconductor.org/packages/release/bioc/vignettes/tximport/inst/doc/tximport.html#rsem
# 2. https://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#differential-expression-analysis


if (!requireNamespace("BiocManager", quietly = TRUE))
  install.packages("BiocManager")
BiocManager::install("tximportData")
BiocManager::install("tximport")
BiocManager::install("DESeq2")

library(tximportData)
dir <- system.file("extdata", package = "tximportData")
list.files(dir)

samples <- read.table(file.path(dir, "samples.txt"), header = TRUE)
samples

files <- file.path(dir, "rsem", samples$run, paste0(samples$run, ".genes.results.gz"))
names(files) <- paste0("sample", 1:6)
if (Sys.info()['sysname'] == "Linux") {
  system("zcat $HOME/R/x86_64-pc-linux-gnu-library/3.6/tximportData/extdata/rsem/ERR188021/ERR188021.genes.results.gz | head -3")
  # gene_id	transcript_id(s)	length	effective_length	expected_count	TPM	FPKM
  # ENSG00000000003.14	ENST00000373020.8,ENST00000494424.1,ENST00000496771.5,ENST00000612152.4,ENST00000614008.4	1749.40	1574.16	0.00	0.00	0.00
  # ENSG00000000005.5	ENST00000373031.4,ENST00000485971.1	940.50	765.28	0.00	0.00	0.00
  system("zcat $HOME/R/x86_64-pc-linux-gnu-library/3.6/tximportData/extdata/rsem/ERR188356/ERR188356.genes.results.gz | head -3")
  # gene_id	transcript_id(s)	length	effective_length	expected_count	TPM	FPKM
  # ENSG00000000003.14	ENST00000373020.8,ENST00000494424.1,ENST00000496771.5,ENST00000612152.4,ENST00000614008.4	2206.00	1999.94	1.00	0.03	0.02
  # ENSG00000000005.5	ENST00000373031.4,ENST00000485971.1	940.50	734.46	0.00	0.00	0.00
}


library(tximport)
txi.rsem <- tximport(files, type = "rsem", txIn = FALSE, txOut = FALSE)

txi.rsem$length[txi.rsem$length == 0] <- 1 # https://support.bioconductor.org/p/92763/
head(txi.rsem$counts)
#                    sample1 sample2 sample3 sample4 sample5 sample6
# ENSG00000000003.14    0.00    2.00   21.00    3.00       0    1.00
# ENSG00000000005.5     0.00    0.00    0.00    0.00       0    0.00
# ENSG00000000419.12 1000.00 1250.00 1377.00 1197.00       0 1254.00
# ENSG00000000457.13  401.48  457.55  511.17  337.52       1  474.38
# ENSG00000000460.16  613.72  407.84 1119.94  556.23       2  603.25
# ENSG00000000938.12 2387.00 3466.00 2904.00 2431.00       3 1720.00
dim(txi.rsem$counts) # [1] 58288     6


library(DESeq2)
sampleTable <- data.frame(condition = factor(rep(c("A", "B"), each = 3)))
rownames(sampleTable) <- colnames(txi.rsem$counts)
dds <- DESeqDataSetFromTximport(txi.rsem, sampleTable, ~condition)
dds <- DESeq(dds)
res <- results(dds)
res
# log2 fold change (MLE): condition B vs A
# Wald test p-value: condition B vs A
# DataFrame with 58288 rows and 6 columns
# baseMean      log2FoldChange             lfcSE
# <numeric>           <numeric>         <numeric>
#   ENSG00000000003.14  1.22461373745634   0.743778471964763  3.80798626094963
# ENSG00000000005.5                  0                  NA                NA
# ENSG00000000419.12  376.029957078582 -0.0106113773699245 0.536830780985092
# ENSG00000000457.13  165.707130884913 -0.0477305450686189 0.638050726768135
# ENSG00000000460.16  257.467324790535  0.0708126664641672  0.58214645493852
# ...                              ...                 ...               ...
# ENSG00000284744.1  0.275449760566316    3.48146567693683  3.90782583532975
# ENSG00000284745.1                  0                  NA                NA
# ENSG00000284746.1                  0                  NA                NA
# ENSG00000284747.1   4.35498104202029    1.67857572933429  2.63793646859883
# ENSG00000284748.1                  0                  NA                NA
# stat            pvalue              padj
# <numeric>         <numeric>         <numeric>
#   ENSG00000000003.14   0.195320681587564 0.845141907303312 0.999586943149473
# ENSG00000000005.5                   NA                NA                NA
# ENSG00000000419.12 -0.0197667081430251 0.984229475745241 0.999586943149473
# ENSG00000000457.13 -0.0748068187468173  0.94036841644668 0.999586943149473
# ENSG00000000460.16    0.12164063847412 0.903183627600017 0.999586943149473
# ...                                ...               ...               ...
# ENSG00000284744.1    0.890895813590694 0.372985066426012 0.999586943149473
# ENSG00000284745.1                   NA                NA                NA
# ENSG00000284746.1                   NA                NA                NA
# ENSG00000284747.1    0.636321514682226 0.524566884764994 0.999586943149473
# ENSG00000284748.1                   NA                NA                NA

# If we import transcript level counts, we will not be able to do analysis on gene level
files <- file.path(dir, "rsem", samples$run, paste0(samples$run, ".isoforms.results.gz"))
names(files) <- paste0("sample", 1:6)
txi.rsem <- tximport(files, type = "rsem", txIn = TRUE, txOut = TRUE)
head(txi.rsem$counts)
dim(txi.rsem$counts) # [1] 200401      6

txi.rsem <- tximport(files, type = "rsem", txIn = TRUE, txOut = FALSE)
# reading in files with read_tsv
# Error in summarizeFail() :
#
#   tximport failed at summarizing to the gene-level.
# Please see 'Solutions' in the Details section of the man page: ?tximport
	# This is a note about import rsem-generated file for DESeq2 package

	# As described by the tximport's vignette, the method below uses the gene-level estimated counts from the quantification tools, and additionally to use the transcript-level abundance estimates to calculate a gene-level offset that corrects for changes to the average transcript length across samples.
	# This approach corrects for potential changes in gene length across samples (e.g. from differential isoform usage)

	# References:
	# 1. http://bioconductor.org/packages/release/bioc/vignettes/tximport/inst/doc/tximport.html#rsem
	# 2. https://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#differential-expression-analysis


	if (!requireNamespace("BiocManager", quietly = TRUE))
	install.packages("BiocManager")
	BiocManager::install("tximportData")
	BiocManager::install("tximport")
	BiocManager::install("DESeq2")

	library(tximportData)
	dir <- system.file("extdata", package = "tximportData")
	list.files(dir)

	samples <- read.table(file.path(dir, "samples.txt"), header = TRUE)
	samples

	files <- file.path(dir, "rsem", samples$run, paste0(samples$run, ".genes.results.gz"))
	names(files) <- paste0("sample", 1:6)
	if (Sys.info()['sysname'] == "Linux") {
	system("zcat $HOME/R/x86_64-pc-linux-gnu-library/3.6/tximportData/extdata/rsem/ERR188021/ERR188021.genes.results.gz \| head -3")
	# gene_id transcript_id(s) length effective_length expected_count TPM FPKM
	# ENSG00000000003.14 ENST00000373020.8,ENST00000494424.1,ENST00000496771.5,ENST00000612152.4,ENST00000614008.4 1749.40 1574.16 0.00 0.00 0.00
	# ENSG00000000005.5 ENST00000373031.4,ENST00000485971.1 940.50 765.28 0.00 0.00 0.00
	system("zcat $HOME/R/x86_64-pc-linux-gnu-library/3.6/tximportData/extdata/rsem/ERR188356/ERR188356.genes.results.gz \| head -3")
	# gene_id transcript_id(s) length effective_length expected_count TPM FPKM
	# ENSG00000000003.14 ENST00000373020.8,ENST00000494424.1,ENST00000496771.5,ENST00000612152.4,ENST00000614008.4 2206.00 1999.94 1.00 0.03 0.02
	# ENSG00000000005.5 ENST00000373031.4,ENST00000485971.1 940.50 734.46 0.00 0.00 0.00
	}


	library(tximport)
	txi.rsem <- tximport(files, type = "rsem", txIn = FALSE, txOut = FALSE)

	txi.rsem$length[txi.rsem$length == 0] <- 1 # https://support.bioconductor.org/p/92763/
	head(txi.rsem$counts)
	# sample1 sample2 sample3 sample4 sample5 sample6
	# ENSG00000000003.14 0.00 2.00 21.00 3.00 0 1.00
	# ENSG00000000005.5 0.00 0.00 0.00 0.00 0 0.00
	# ENSG00000000419.12 1000.00 1250.00 1377.00 1197.00 0 1254.00
	# ENSG00000000457.13 401.48 457.55 511.17 337.52 1 474.38
	# ENSG00000000460.16 613.72 407.84 1119.94 556.23 2 603.25
	# ENSG00000000938.12 2387.00 3466.00 2904.00 2431.00 3 1720.00
	dim(txi.rsem$counts) # [1] 58288 6


	library(DESeq2)
	sampleTable <- data.frame(condition = factor(rep(c("A", "B"), each = 3)))
	rownames(sampleTable) <- colnames(txi.rsem$counts)
	dds <- DESeqDataSetFromTximport(txi.rsem, sampleTable, ~condition)
	dds <- DESeq(dds)
	res <- results(dds)
	res
	# log2 fold change (MLE): condition B vs A
	# Wald test p-value: condition B vs A
	# DataFrame with 58288 rows and 6 columns
	# baseMean log2FoldChange lfcSE
	# <numeric> <numeric> <numeric>
	# ENSG00000000003.14 1.22461373745634 0.743778471964763 3.80798626094963
	# ENSG00000000005.5 0 NA NA
	# ENSG00000000419.12 376.029957078582 -0.0106113773699245 0.536830780985092
	# ENSG00000000457.13 165.707130884913 -0.0477305450686189 0.638050726768135
	# ENSG00000000460.16 257.467324790535 0.0708126664641672 0.58214645493852
	# ... ... ... ...
	# ENSG00000284744.1 0.275449760566316 3.48146567693683 3.90782583532975
	# ENSG00000284745.1 0 NA NA
	# ENSG00000284746.1 0 NA NA
	# ENSG00000284747.1 4.35498104202029 1.67857572933429 2.63793646859883
	# ENSG00000284748.1 0 NA NA
	# stat pvalue padj
	# <numeric> <numeric> <numeric>
	# ENSG00000000003.14 0.195320681587564 0.845141907303312 0.999586943149473
	# ENSG00000000005.5 NA NA NA
	# ENSG00000000419.12 -0.0197667081430251 0.984229475745241 0.999586943149473
	# ENSG00000000457.13 -0.0748068187468173 0.94036841644668 0.999586943149473
	# ENSG00000000460.16 0.12164063847412 0.903183627600017 0.999586943149473
	# ... ... ... ...
	# ENSG00000284744.1 0.890895813590694 0.372985066426012 0.999586943149473
	# ENSG00000284745.1 NA NA NA
	# ENSG00000284746.1 NA NA NA
	# ENSG00000284747.1 0.636321514682226 0.524566884764994 0.999586943149473
	# ENSG00000284748.1 NA NA NA

	# If we import transcript level counts, we will not be able to do analysis on gene level
	files <- file.path(dir, "rsem", samples$run, paste0(samples$run, ".isoforms.results.gz"))
	names(files) <- paste0("sample", 1:6)
	txi.rsem <- tximport(files, type = "rsem", txIn = TRUE, txOut = TRUE)
	head(txi.rsem$counts)
	dim(txi.rsem$counts) # [1] 200401 6

	txi.rsem <- tximport(files, type = "rsem", txIn = TRUE, txOut = FALSE)
	# reading in files with read_tsv
	# Error in summarizeFail() :
	#
	# tximport failed at summarizing to the gene-level.
	# Please see 'Solutions' in the Details section of the man page: ?tximport