Skip to content

Instantly share code, notes, and snippets.

@arraytools
Last active December 11, 2019 15:25
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save arraytools/d12a016f9393468fba046ae83bcb5536 to your computer and use it in GitHub Desktop.
Save arraytools/d12a016f9393468fba046ae83bcb5536 to your computer and use it in GitHub Desktop.
import TPM for gene level analysis in DESeq2
# This is a note about import rsem-generated file for DESeq2 package
# As described by the tximport's vignette, the method below uses the gene-level estimated counts from the quantification tools, and additionally to use the transcript-level abundance estimates to calculate a gene-level offset that corrects for changes to the average transcript length across samples.
# This approach corrects for potential changes in gene length across samples (e.g. from differential isoform usage)
# References:
# 1. http://bioconductor.org/packages/release/bioc/vignettes/tximport/inst/doc/tximport.html#rsem
# 2. https://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#differential-expression-analysis
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("tximportData")
BiocManager::install("tximport")
BiocManager::install("DESeq2")
library(tximportData)
dir <- system.file("extdata", package = "tximportData")
list.files(dir)
samples <- read.table(file.path(dir, "samples.txt"), header = TRUE)
samples
files <- file.path(dir, "rsem", samples$run, paste0(samples$run, ".genes.results.gz"))
names(files) <- paste0("sample", 1:6)
if (Sys.info()['sysname'] == "Linux") {
system("zcat $HOME/R/x86_64-pc-linux-gnu-library/3.6/tximportData/extdata/rsem/ERR188021/ERR188021.genes.results.gz | head -3")
# gene_id transcript_id(s) length effective_length expected_count TPM FPKM
# ENSG00000000003.14 ENST00000373020.8,ENST00000494424.1,ENST00000496771.5,ENST00000612152.4,ENST00000614008.4 1749.40 1574.16 0.00 0.00 0.00
# ENSG00000000005.5 ENST00000373031.4,ENST00000485971.1 940.50 765.28 0.00 0.00 0.00
system("zcat $HOME/R/x86_64-pc-linux-gnu-library/3.6/tximportData/extdata/rsem/ERR188356/ERR188356.genes.results.gz | head -3")
# gene_id transcript_id(s) length effective_length expected_count TPM FPKM
# ENSG00000000003.14 ENST00000373020.8,ENST00000494424.1,ENST00000496771.5,ENST00000612152.4,ENST00000614008.4 2206.00 1999.94 1.00 0.03 0.02
# ENSG00000000005.5 ENST00000373031.4,ENST00000485971.1 940.50 734.46 0.00 0.00 0.00
}
library(tximport)
txi.rsem <- tximport(files, type = "rsem", txIn = FALSE, txOut = FALSE)
txi.rsem$length[txi.rsem$length == 0] <- 1 # https://support.bioconductor.org/p/92763/
head(txi.rsem$counts)
# sample1 sample2 sample3 sample4 sample5 sample6
# ENSG00000000003.14 0.00 2.00 21.00 3.00 0 1.00
# ENSG00000000005.5 0.00 0.00 0.00 0.00 0 0.00
# ENSG00000000419.12 1000.00 1250.00 1377.00 1197.00 0 1254.00
# ENSG00000000457.13 401.48 457.55 511.17 337.52 1 474.38
# ENSG00000000460.16 613.72 407.84 1119.94 556.23 2 603.25
# ENSG00000000938.12 2387.00 3466.00 2904.00 2431.00 3 1720.00
dim(txi.rsem$counts) # [1] 58288 6
library(DESeq2)
sampleTable <- data.frame(condition = factor(rep(c("A", "B"), each = 3)))
rownames(sampleTable) <- colnames(txi.rsem$counts)
dds <- DESeqDataSetFromTximport(txi.rsem, sampleTable, ~condition)
dds <- DESeq(dds)
res <- results(dds)
res
# log2 fold change (MLE): condition B vs A
# Wald test p-value: condition B vs A
# DataFrame with 58288 rows and 6 columns
# baseMean log2FoldChange lfcSE
# <numeric> <numeric> <numeric>
# ENSG00000000003.14 1.22461373745634 0.743778471964763 3.80798626094963
# ENSG00000000005.5 0 NA NA
# ENSG00000000419.12 376.029957078582 -0.0106113773699245 0.536830780985092
# ENSG00000000457.13 165.707130884913 -0.0477305450686189 0.638050726768135
# ENSG00000000460.16 257.467324790535 0.0708126664641672 0.58214645493852
# ... ... ... ...
# ENSG00000284744.1 0.275449760566316 3.48146567693683 3.90782583532975
# ENSG00000284745.1 0 NA NA
# ENSG00000284746.1 0 NA NA
# ENSG00000284747.1 4.35498104202029 1.67857572933429 2.63793646859883
# ENSG00000284748.1 0 NA NA
# stat pvalue padj
# <numeric> <numeric> <numeric>
# ENSG00000000003.14 0.195320681587564 0.845141907303312 0.999586943149473
# ENSG00000000005.5 NA NA NA
# ENSG00000000419.12 -0.0197667081430251 0.984229475745241 0.999586943149473
# ENSG00000000457.13 -0.0748068187468173 0.94036841644668 0.999586943149473
# ENSG00000000460.16 0.12164063847412 0.903183627600017 0.999586943149473
# ... ... ... ...
# ENSG00000284744.1 0.890895813590694 0.372985066426012 0.999586943149473
# ENSG00000284745.1 NA NA NA
# ENSG00000284746.1 NA NA NA
# ENSG00000284747.1 0.636321514682226 0.524566884764994 0.999586943149473
# ENSG00000284748.1 NA NA NA
# If we import transcript level counts, we will not be able to do analysis on gene level
files <- file.path(dir, "rsem", samples$run, paste0(samples$run, ".isoforms.results.gz"))
names(files) <- paste0("sample", 1:6)
txi.rsem <- tximport(files, type = "rsem", txIn = TRUE, txOut = TRUE)
head(txi.rsem$counts)
dim(txi.rsem$counts) # [1] 200401 6
txi.rsem <- tximport(files, type = "rsem", txIn = TRUE, txOut = FALSE)
# reading in files with read_tsv
# Error in summarizeFail() :
#
# tximport failed at summarizing to the gene-level.
# Please see 'Solutions' in the Details section of the man page: ?tximport
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment