Created
April 11, 2019 14:30
-
-
Save lcolladotor/55d3d52de3505b0bfdb5c42e0bb88b2a to your computer and use it in GitHub Desktop.
Check why some samples have scale_counts > 40 million as reported by Princy Parsana on 09/10/2018
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## Check why some samples have scale_counts > 40 million as reported by Princy Parsana on 09/10/2018 | |
library('recount') | |
library('devtools') | |
load('/dcl01/leek/data/recount-website/rse/rse_tcga/TCGA/rse_gene.Rdata', verbose = TRUE) | |
rse_gene_tcga <- rse_gene | |
rse_scaled_tcga <- scale_counts(rse_gene_tcga, round = FALSE) | |
nread_tcga <- colSums(assays(rse_scaled_tcga)$counts) / 1e6 | |
table(nread_tcga > 40) | |
summary(nread_tcga) | |
# > table(nread_tcga > 40) | |
# | |
# FALSE TRUE | |
# 8030 3254 | |
# > summary(nread_tcga) | |
# Min. 1st Qu. Median Mean 3rd Qu. Max. | |
# 7.809 38.455 39.433 38.582 40.092 42.734 | |
ov <- countOverlaps(rowRanges(rse_gene)) | |
table(ov) | |
# > table(ov) | |
# ov | |
# 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 19 20 21 | |
# 42114 12547 2283 608 239 95 36 19 15 6 4 11 5 1 6 11 1 4 2 2 | |
# 22 23 25 26 28 60 81 96 | |
# 3 1 2 18 1 1 1 1 | |
nread_tcga_noov <- colSums(assays(rse_scaled_tcga)$counts[ov == 1, ]) / 1e6 | |
table(nread_tcga_noov > 40) | |
summary(nread_tcga_noov) | |
# > table(nread_tcga_noov > 40) | |
# | |
# FALSE | |
# 11284 | |
# > summary(nread_tcga_noov) | |
# Min. 1st Qu. Median Mean 3rd Qu. Max. | |
# 5.008 25.432 26.275 25.842 26.976 32.745 | |
load('/dcl01/leek/data/recount-website/rse/rse_sra/DRP000366/rse_exon.Rdata', verbose = TRUE) | |
ov_ex <- countOverlaps(rowRanges(rse_exon)) | |
table(ov_ex) | |
table(countOverlaps(rowRanges(rse_exon)[strand(rowRanges(rse_exon)) == "+"])) | |
table(countOverlaps(rowRanges(rse_exon)[strand(rowRanges(rse_exon)) == "-"])) | |
# > table(ov_ex) | |
# ov_ex | |
# 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 | |
# 576313 21592 3326 957 375 158 96 48 48 27 17 18 9 6 35 9 3 | |
# 18 19 20 21 22 23 24 25 26 28 30 31 37 47 | |
# 6 2 4 1 50 2 5 1 2 19 1 7 1 1 | |
# > table(countOverlaps(rowRanges(rse_exon)[strand(rowRanges(rse_exon)) == "+"])) | |
# | |
# 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 | |
# 290987 11075 1743 500 168 79 55 30 39 22 9 11 6 3 34 9 3 | |
# 18 19 20 21 22 23 24 25 26 28 30 31 37 47 | |
# 6 2 2 1 49 2 5 1 2 18 1 7 1 1 | |
# > table(countOverlaps(rowRanges(rse_exon)[strand(rowRanges(rse_exon)) == "-"])) | |
# | |
# 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 20 22 | |
# 285326 10517 1583 457 207 79 41 18 9 5 8 7 3 3 1 2 1 | |
# 28 | |
# 1 | |
gene_info <- reproduce_ranges('both') | |
# > class(gene_info) | |
# [1] "list" | |
# > names(gene_info) | |
# [1] "exon" "gene" | |
# > class(gene_info$exon) | |
# [1] "CompressedGRangesList" | |
# attr(,"package") | |
# [1] "GenomicRanges" | |
ov_gene_by_exon <- countOverlaps(gene_info$exon) | |
table(ov_gene_by_exon) | |
# 1 2 3 4 5 6 7 8 9 10 11 15 16 22 | |
# 52800 4083 876 156 48 18 5 3 9 1 1 14 1 22 | |
nread_tcga_noov2 <- colSums(assays(rse_scaled_tcga)$counts[ov_gene_by_exon == 1, ]) / 1e6 | |
table(nread_tcga_noov2 > 40) | |
summary(nread_tcga_noov2) | |
# > table(nread_tcga_noov2 > 40) | |
# | |
# FALSE | |
# 11284 | |
# > summary(nread_tcga_noov2) | |
# Min. 1st Qu. Median Mean 3rd Qu. Max. | |
# 6.284 31.240 32.374 31.668 33.185 36.570 | |
load('/dcl01/leek/data/recount-website/rse/rse_gtex/SRP012682/rse_gene.Rdata', verbose = TRUE) | |
rse_gene_gtex <- rse_gene | |
rse_scaled_gtex <- scale_counts(rse_gene_gtex, round = FALSE) | |
nread_gtex <- colSums(assays(rse_scaled_gtex)$counts) / 1e6 | |
table(nread_gtex > 40) | |
summary(nread_gtex) | |
# > table(nread_gtex > 40) | |
# | |
# FALSE TRUE | |
# 9349 313 | |
# > summary(nread_gtex) | |
# Min. 1st Qu. Median Mean 3rd Qu. Max. | |
# 10.60 35.96 37.40 37.01 38.40 41.60 | |
nread_gtex_noov <- colSums(assays(rse_scaled_gtex)$counts[ov == 1, ]) / 1e6 | |
table(nread_gtex_noov > 40) | |
summary(nread_gtex_noov) | |
# > table(nread_gtex_noov > 40) | |
# | |
# FALSE | |
# 9662 | |
# > summary(nread_gtex_noov) | |
# Min. 1st Qu. Median Mean 3rd Qu. Max. | |
# 7.34 23.54 24.68 24.67 25.64 38.59 | |
nread_gtex_noov2 <- colSums(assays(rse_scaled_gtex)$counts[ov_gene_by_exon == 1, ]) / 1e6 | |
table(nread_gtex_noov2 > 40) | |
summary(nread_gtex_noov2) | |
# > table(nread_gtex_noov2 > 40) | |
# | |
# FALSE | |
# 9662 | |
# > summary(nread_gtex_noov2) | |
# Min. 1st Qu. Median Mean 3rd Qu. Max. | |
# 8.853 28.506 29.832 29.773 31.146 39.045 | |
load('/dcl01/leek/data/recount-website/rse/rse_sra/all/rse_gene.Rdata', verbose = TRUE) | |
rse_gene_sra <- rse_gene | |
rse_scaled_sra <- scale_counts(rse_gene_sra, round = FALSE) | |
nread_sra <- colSums(assays(rse_scaled_sra)$counts) / 1e6 | |
table(nread_sra > 40) | |
summary(nread_sra) | |
# > table(nread_sra > 40) | |
# | |
# FALSE TRUE | |
# 46427 3230 | |
# > summary(nread_sra) | |
# Min. 1st Qu. Median Mean 3rd Qu. Max. | |
# 0.04148 26.17371 35.12996 31.17124 38.04915 78.42058 | |
nread_sra_noov <- colSums(assays(rse_scaled_sra)$counts[ov == 1, ]) / 1e6 | |
table(nread_sra_noov > 40) | |
summary(nread_sra_noov) | |
# > table(nread_sra_noov > 40) | |
# | |
# FALSE TRUE | |
# 49642 15 | |
# > summary(nread_sra_noov) | |
# Min. 1st Qu. Median Mean 3rd Qu. Max. | |
# 0.00401 15.68622 22.62987 19.95534 25.02280 56.10205 | |
i <- which(nread_sra_noov > 40) | |
# > nread_sra_noov[i] | |
# SRR039611 SRR039612 SRR039614 SRR039615 SRR039616 SRR039617 SRR039618 | |
# 50.54406 49.78613 51.65518 41.18782 51.31583 50.21475 47.56245 | |
# SRR039619 SRR039620 SRR039621 SRR039624 SRR191578 SRR1044498 SRR1044598 | |
# 56.10205 42.22048 51.93728 46.13629 41.90084 40.49189 49.54119 | |
# SRR1044599 | |
# 48.83405 | |
options(width = 120) | |
colData(rse_scaled_sra)[i, c('sample', 'read_count_as_reported_by_sra', 'reads_downloaded', 'paired_end', 'mapped_read_count', 'auc')] | |
# > colData(rse_scaled_sra)[i, c('sample', 'read_count_as_reported_by_sra', 'reads_downloaded', 'paired_end', 'mapped_read_count', 'auc')] | |
# DataFrame with 15 rows and 6 columns | |
# sample read_count_as_reported_by_sra reads_downloaded paired_end mapped_read_count auc | |
# <character> <integer> <integer> <logical> <integer> <numeric> | |
# SRR039611 SRS059014 9277378 9277378 FALSE 8065356 179937063 | |
# SRR039612 SRS059015 15656640 15656640 FALSE 12701639 283726804 | |
# SRR039614 SRS059017 9431188 9431188 FALSE 8606523 191313032 | |
# SRR039615 SRS059018 9288038 9288038 FALSE 7868109 173004663 | |
# SRR039616 SRS059019 10206748 10206748 FALSE 6755063 150508814 | |
# ... ... ... ... ... ... ... | |
# SRR039624 SRS059027 16427267 16427267 FALSE 13070019 302682401 | |
# SRR191578 SRS190638 500705 500705 FALSE 385932 8663519 | |
# SRR1044498 SRS512177 3127806 3127806 FALSE 1722674 39041365 | |
# SRR1044598 SRS512276 1290214 1290214 FALSE 1167335 26490287 | |
# SRR1044599 SRS512277 1789104 1789104 FALSE 1637205 37754577 | |
colData(rse_scaled_sra)$mapped_read_count[i] / colData(rse_scaled_sra)$reads_downloaded[i] * 100 | |
summary(colData(rse_scaled_sra)$mapped_read_count[i] / colData(rse_scaled_sra)$reads_downloaded[i] * 100) | |
# > colData(rse_scaled_sra)$mapped_read_count[i] / colData(rse_scaled_sra)$reads_downloaded[i] * 100 | |
# [1] 86.93573 81.12621 91.25598 84.71228 66.18232 69.58729 72.36412 67.26457 79.95167 82.68239 79.56295 77.07772 | |
# [13] 55.07611 90.47608 91.50977 | |
# > summary(colData(rse_scaled_sra)$mapped_read_count[i] / colData(rse_scaled_sra)$reads_downloaded[i] * 100) | |
# Min. 1st Qu. Median Mean 3rd Qu. Max. | |
# 55.08 70.98 79.95 78.38 85.82 91.51 | |
nread_sra_noov2 <- colSums(assays(rse_scaled_sra)$counts[ov_gene_by_exon == 1, ]) / 1e6 | |
table(nread_sra_noov2 > 40) | |
summary(nread_sra_noov2) | |
summary(nread_sra_noov2[nread_sra_noov2 > 40]) | |
# > table(nread_sra_noov2 > 40) | |
# | |
# FALSE TRUE | |
# 49188 469 | |
# > summary(nread_sra_noov2) | |
# Min. 1st Qu. Median Mean 3rd Qu. Max. | |
# 0.04019 20.59794 28.42783 25.46419 31.25098 78.40682 | |
# > summary(nread_sra_noov2[nread_sra_noov2 > 40]) | |
# Min. 1st Qu. Median Mean 3rd Qu. Max. | |
# 40.00 41.48 45.38 49.80 51.86 78.41 | |
## Reproducibility information | |
print('Reproducibility information:') | |
Sys.time() | |
proc.time() | |
options(width = 120) | |
session_info() | |
# Session info ---------------------------------------------------------------------------------------------------------- | |
# setting value | |
# version R version 3.5.0 Patched (2018-04-30 r74679) | |
# system x86_64, linux-gnu | |
# ui X11 | |
# language (EN) | |
# collate en_US.UTF-8 | |
# tz US/Eastern | |
# date 2018-09-10 | |
# | |
# Packages -------------------------------------------------------------------------------------------------------------- | |
# package * version date source | |
# acepack 1.4.1 2016-10-29 CRAN (R 3.5.0) | |
# AnnotationDbi * 1.42.1 2018-05-17 Bioconductor | |
# assertthat 0.2.0 2017-04-11 CRAN (R 3.5.0) | |
# backports 1.1.2 2017-12-13 CRAN (R 3.5.0) | |
# base * 3.5.0 2018-05-02 local | |
# base64enc 0.1-3 2015-07-28 CRAN (R 3.5.0) | |
# bibtex 0.4.2 2017-06-30 CRAN (R 3.5.0) | |
# bindr 0.1.1 2018-03-13 CRAN (R 3.5.0) | |
# bindrcpp 0.2.2 2018-03-29 CRAN (R 3.5.0) | |
# Biobase * 2.40.0 2018-05-02 Bioconductor | |
# BiocGenerics * 0.26.0 2018-05-03 Bioconductor | |
# BiocParallel * 1.14.2 2018-07-08 Bioconductor | |
# biomaRt 2.36.1 2018-06-13 Bioconductor | |
# Biostrings 2.48.0 2018-05-03 Bioconductor | |
# bit 1.1-14 2018-05-29 CRAN (R 3.5.0) | |
# bit64 0.9-7 2017-05-08 CRAN (R 3.5.0) | |
# bitops 1.0-6 2013-08-17 CRAN (R 3.5.0) | |
# blob 1.1.1 2018-03-25 CRAN (R 3.5.0) | |
# BSgenome 1.48.0 2018-05-03 Bioconductor | |
# bumphunter 1.22.0 2018-05-03 Bioconductor | |
# checkmate 1.8.5 2017-10-24 CRAN (R 3.5.0) | |
# cluster 2.0.7-1 2018-04-13 CRAN (R 3.5.0) | |
# codetools 0.2-15 2016-10-05 CRAN (R 3.5.0) | |
# colorout * 1.2-0 2018-05-02 Github (jalvesaq/colorout@c42088d) | |
# colorspace 1.3-2 2016-12-14 CRAN (R 3.5.0) | |
# compiler 3.5.0 2018-05-02 local | |
# crayon 1.3.4 2017-09-16 CRAN (R 3.5.0) | |
# data.table 1.11.4 2018-05-27 cran (@1.11.4) | |
# datasets * 3.5.0 2018-05-02 local | |
# DBI 1.0.0 2018-05-02 CRAN (R 3.5.0) | |
# DelayedArray * 0.6.2 2018-07-23 Bioconductor | |
# derfinder 1.14.0 2018-05-03 Bioconductor | |
# derfinderHelper 1.14.0 2018-05-03 Bioconductor | |
# devtools * 1.13.6 2018-06-27 CRAN (R 3.5.0) | |
# digest 0.6.15 2018-01-28 CRAN (R 3.5.0) | |
# doRNG 1.7.1 2018-06-22 CRAN (R 3.5.0) | |
# downloader 0.4 2015-07-09 CRAN (R 3.5.0) | |
# dplyr 0.7.6 2018-06-29 CRAN (R 3.5.0) | |
# foreach 1.4.4 2017-12-12 CRAN (R 3.5.0) | |
# foreign 0.8-70 2017-11-28 CRAN (R 3.5.0) | |
# Formula 1.2-3 2018-05-03 CRAN (R 3.5.0) | |
# GenomeInfoDb * 1.16.0 2018-05-03 Bioconductor | |
# GenomeInfoDbData 1.1.0 2018-04-17 Bioconductor | |
# GenomicAlignments 1.16.0 2018-05-03 Bioconductor | |
# GenomicFeatures * 1.32.0 2018-05-03 Bioconductor | |
# GenomicFiles 1.16.0 2018-05-03 Bioconductor | |
# GenomicRanges * 1.32.6 2018-07-20 Bioconductor | |
# GEOquery 2.48.0 2018-05-02 Bioconductor | |
# ggplot2 3.0.0 2018-07-03 CRAN (R 3.5.0) | |
# glue 1.3.0 2018-07-17 CRAN (R 3.5.0) | |
# graphics * 3.5.0 2018-05-02 local | |
# grDevices * 3.5.0 2018-05-02 local | |
# grid 3.5.0 2018-05-02 local | |
# gridExtra 2.3 2017-09-09 CRAN (R 3.5.0) | |
# gtable 0.2.0 2016-02-26 CRAN (R 3.5.0) | |
# Hmisc 4.1-1 2018-01-03 CRAN (R 3.5.0) | |
# hms 0.4.2 2018-03-10 CRAN (R 3.5.0) | |
# htmlTable 1.12 2018-05-26 CRAN (R 3.5.0) | |
# htmltools 0.3.6 2017-04-28 CRAN (R 3.5.0) | |
# htmlwidgets 1.2 2018-04-19 CRAN (R 3.5.0) | |
# httpuv 1.4.5 2018-07-19 CRAN (R 3.5.0) | |
# httr 1.3.1 2017-08-20 CRAN (R 3.5.0) | |
# IRanges * 2.14.10 2018-05-17 Bioconductor | |
# iterators 1.0.10 2018-07-13 CRAN (R 3.5.0) | |
# jsonlite 1.5 2017-06-01 CRAN (R 3.5.0) | |
# knitr 1.20 2018-02-20 CRAN (R 3.5.0) | |
# later 0.7.3 2018-06-08 CRAN (R 3.5.0) | |
# lattice 0.20-35 2017-03-25 CRAN (R 3.5.0) | |
# latticeExtra 0.6-28 2016-02-09 CRAN (R 3.5.0) | |
# lazyeval 0.2.1 2017-10-29 CRAN (R 3.5.0) | |
# limma 3.36.2 2018-06-21 Bioconductor | |
# locfit 1.5-9.1 2013-04-20 CRAN (R 3.5.0) | |
# magrittr 1.5 2014-11-22 CRAN (R 3.5.0) | |
# Matrix 1.2-14 2018-04-13 CRAN (R 3.5.0) | |
# matrixStats * 0.54.0 2018-07-23 CRAN (R 3.5.0) | |
# memoise 1.1.0 2017-04-21 CRAN (R 3.5.0) | |
# methods * 3.5.0 2018-05-02 local | |
# munsell 0.5.0 2018-06-12 CRAN (R 3.5.0) | |
# nnet 7.3-12 2016-02-02 CRAN (R 3.5.0) | |
# org.Hs.eg.db * 3.6.0 2018-05-03 Bioconductor | |
# parallel * 3.5.0 2018-05-02 local | |
# pillar 1.3.0 2018-07-14 CRAN (R 3.5.0) | |
# pkgconfig 2.0.1 2017-03-21 CRAN (R 3.5.0) | |
# pkgmaker 0.27 2018-05-25 CRAN (R 3.5.0) | |
# plyr 1.8.4 2016-06-08 CRAN (R 3.5.0) | |
# png 0.1-7 2013-12-03 CRAN (R 3.5.0) | |
# prettyunits 1.0.2 2015-07-13 CRAN (R 3.5.0) | |
# progress 1.2.0 2018-06-14 CRAN (R 3.5.0) | |
# promises 1.0.1 2018-04-13 CRAN (R 3.5.0) | |
# purrr 0.2.5 2018-05-29 CRAN (R 3.5.0) | |
# qvalue 2.12.0 2018-05-03 Bioconductor | |
# R6 2.2.2 2017-06-17 CRAN (R 3.5.0) | |
# RColorBrewer 1.1-2 2014-12-07 CRAN (R 3.5.0) | |
# Rcpp 0.12.18 2018-07-23 CRAN (R 3.5.0) | |
# RCurl 1.95-4.11 2018-07-15 CRAN (R 3.5.0) | |
# readr 1.1.1 2017-05-16 CRAN (R 3.5.0) | |
# recount * 1.6.2 2018-05-17 Bioconductor | |
# registry 0.5 2017-12-03 CRAN (R 3.5.0) | |
# rentrez 1.2.1 2018-03-05 CRAN (R 3.5.0) | |
# reshape2 1.4.3 2017-12-11 CRAN (R 3.5.0) | |
# rlang 0.2.1 2018-05-30 cran (@0.2.1) | |
# rmote * 0.3.4 2018-05-02 deltarho (R 3.5.0) | |
# rngtools 1.3.1 2018-05-15 CRAN (R 3.5.0) | |
# rpart 4.1-13 2018-02-23 CRAN (R 3.5.0) | |
# Rsamtools 1.32.2 2018-07-03 Bioconductor | |
# RSQLite 2.1.1 2018-05-06 CRAN (R 3.5.0) | |
# rstudioapi 0.7 2017-09-07 CRAN (R 3.5.0) | |
# rtracklayer 1.40.3 2018-06-13 Bioconductor | |
# S4Vectors * 0.18.3 2018-06-13 Bioconductor | |
# scales 0.5.0 2017-08-24 CRAN (R 3.5.0) | |
# servr 0.10 2018-05-30 CRAN (R 3.5.0) | |
# splines 3.5.0 2018-05-02 local | |
# stats * 3.5.0 2018-05-02 local | |
# stats4 * 3.5.0 2018-05-02 local | |
# stringi 1.2.4 2018-07-20 CRAN (R 3.5.0) | |
# stringr 1.3.1 2018-05-10 CRAN (R 3.5.0) | |
# SummarizedExperiment * 1.10.1 2018-05-17 Bioconductor | |
# survival 2.42-3 2018-04-16 CRAN (R 3.5.0) | |
# tibble 1.4.2 2018-01-22 CRAN (R 3.5.0) | |
# tidyr 0.8.1 2018-05-18 CRAN (R 3.5.0) | |
# tidyselect 0.2.4 2018-02-26 CRAN (R 3.5.0) | |
# tools 3.5.0 2018-05-02 local | |
# utils * 3.5.0 2018-05-02 local | |
# VariantAnnotation 1.26.1 2018-07-04 Bioconductor | |
# withr 2.1.2 2018-03-15 CRAN (R 3.5.0) | |
# xfun 0.3 2018-07-06 CRAN (R 3.5.0) | |
# XML 3.98-1.12 2018-07-15 CRAN (R 3.5.0) | |
# xml2 1.2.0 2018-01-24 CRAN (R 3.5.0) | |
# xtable 1.8-2 2016-02-05 CRAN (R 3.5.0) | |
# XVector 0.20.0 2018-05-03 Bioconductor | |
# zlibbioc 1.26.0 2018-05-02 Bioconductor |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment