Last active
December 11, 2019 15:25
-
-
Save arraytools/d12a016f9393468fba046ae83bcb5536 to your computer and use it in GitHub Desktop.
import TPM for gene level analysis in DESeq2
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This is a note about import rsem-generated file for DESeq2 package | |
# As described by the tximport's vignette, the method below uses the gene-level estimated counts from the quantification tools, and additionally to use the transcript-level abundance estimates to calculate a gene-level offset that corrects for changes to the average transcript length across samples. | |
# This approach corrects for potential changes in gene length across samples (e.g. from differential isoform usage) | |
# References: | |
# 1. http://bioconductor.org/packages/release/bioc/vignettes/tximport/inst/doc/tximport.html#rsem | |
# 2. https://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#differential-expression-analysis | |
if (!requireNamespace("BiocManager", quietly = TRUE)) | |
install.packages("BiocManager") | |
BiocManager::install("tximportData") | |
BiocManager::install("tximport") | |
BiocManager::install("DESeq2") | |
library(tximportData) | |
dir <- system.file("extdata", package = "tximportData") | |
list.files(dir) | |
samples <- read.table(file.path(dir, "samples.txt"), header = TRUE) | |
samples | |
files <- file.path(dir, "rsem", samples$run, paste0(samples$run, ".genes.results.gz")) | |
names(files) <- paste0("sample", 1:6) | |
if (Sys.info()['sysname'] == "Linux") { | |
system("zcat $HOME/R/x86_64-pc-linux-gnu-library/3.6/tximportData/extdata/rsem/ERR188021/ERR188021.genes.results.gz | head -3") | |
# gene_id transcript_id(s) length effective_length expected_count TPM FPKM | |
# ENSG00000000003.14 ENST00000373020.8,ENST00000494424.1,ENST00000496771.5,ENST00000612152.4,ENST00000614008.4 1749.40 1574.16 0.00 0.00 0.00 | |
# ENSG00000000005.5 ENST00000373031.4,ENST00000485971.1 940.50 765.28 0.00 0.00 0.00 | |
system("zcat $HOME/R/x86_64-pc-linux-gnu-library/3.6/tximportData/extdata/rsem/ERR188356/ERR188356.genes.results.gz | head -3") | |
# gene_id transcript_id(s) length effective_length expected_count TPM FPKM | |
# ENSG00000000003.14 ENST00000373020.8,ENST00000494424.1,ENST00000496771.5,ENST00000612152.4,ENST00000614008.4 2206.00 1999.94 1.00 0.03 0.02 | |
# ENSG00000000005.5 ENST00000373031.4,ENST00000485971.1 940.50 734.46 0.00 0.00 0.00 | |
} | |
library(tximport) | |
txi.rsem <- tximport(files, type = "rsem", txIn = FALSE, txOut = FALSE) | |
txi.rsem$length[txi.rsem$length == 0] <- 1 # https://support.bioconductor.org/p/92763/ | |
head(txi.rsem$counts) | |
# sample1 sample2 sample3 sample4 sample5 sample6 | |
# ENSG00000000003.14 0.00 2.00 21.00 3.00 0 1.00 | |
# ENSG00000000005.5 0.00 0.00 0.00 0.00 0 0.00 | |
# ENSG00000000419.12 1000.00 1250.00 1377.00 1197.00 0 1254.00 | |
# ENSG00000000457.13 401.48 457.55 511.17 337.52 1 474.38 | |
# ENSG00000000460.16 613.72 407.84 1119.94 556.23 2 603.25 | |
# ENSG00000000938.12 2387.00 3466.00 2904.00 2431.00 3 1720.00 | |
dim(txi.rsem$counts) # [1] 58288 6 | |
library(DESeq2) | |
sampleTable <- data.frame(condition = factor(rep(c("A", "B"), each = 3))) | |
rownames(sampleTable) <- colnames(txi.rsem$counts) | |
dds <- DESeqDataSetFromTximport(txi.rsem, sampleTable, ~condition) | |
dds <- DESeq(dds) | |
res <- results(dds) | |
res | |
# log2 fold change (MLE): condition B vs A | |
# Wald test p-value: condition B vs A | |
# DataFrame with 58288 rows and 6 columns | |
# baseMean log2FoldChange lfcSE | |
# <numeric> <numeric> <numeric> | |
# ENSG00000000003.14 1.22461373745634 0.743778471964763 3.80798626094963 | |
# ENSG00000000005.5 0 NA NA | |
# ENSG00000000419.12 376.029957078582 -0.0106113773699245 0.536830780985092 | |
# ENSG00000000457.13 165.707130884913 -0.0477305450686189 0.638050726768135 | |
# ENSG00000000460.16 257.467324790535 0.0708126664641672 0.58214645493852 | |
# ... ... ... ... | |
# ENSG00000284744.1 0.275449760566316 3.48146567693683 3.90782583532975 | |
# ENSG00000284745.1 0 NA NA | |
# ENSG00000284746.1 0 NA NA | |
# ENSG00000284747.1 4.35498104202029 1.67857572933429 2.63793646859883 | |
# ENSG00000284748.1 0 NA NA | |
# stat pvalue padj | |
# <numeric> <numeric> <numeric> | |
# ENSG00000000003.14 0.195320681587564 0.845141907303312 0.999586943149473 | |
# ENSG00000000005.5 NA NA NA | |
# ENSG00000000419.12 -0.0197667081430251 0.984229475745241 0.999586943149473 | |
# ENSG00000000457.13 -0.0748068187468173 0.94036841644668 0.999586943149473 | |
# ENSG00000000460.16 0.12164063847412 0.903183627600017 0.999586943149473 | |
# ... ... ... ... | |
# ENSG00000284744.1 0.890895813590694 0.372985066426012 0.999586943149473 | |
# ENSG00000284745.1 NA NA NA | |
# ENSG00000284746.1 NA NA NA | |
# ENSG00000284747.1 0.636321514682226 0.524566884764994 0.999586943149473 | |
# ENSG00000284748.1 NA NA NA | |
# If we import transcript level counts, we will not be able to do analysis on gene level | |
files <- file.path(dir, "rsem", samples$run, paste0(samples$run, ".isoforms.results.gz")) | |
names(files) <- paste0("sample", 1:6) | |
txi.rsem <- tximport(files, type = "rsem", txIn = TRUE, txOut = TRUE) | |
head(txi.rsem$counts) | |
dim(txi.rsem$counts) # [1] 200401 6 | |
txi.rsem <- tximport(files, type = "rsem", txIn = TRUE, txOut = FALSE) | |
# reading in files with read_tsv | |
# Error in summarizeFail() : | |
# | |
# tximport failed at summarizing to the gene-level. | |
# Please see 'Solutions' in the Details section of the man page: ?tximport |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment