Skip to content

Instantly share code, notes, and snippets.

@lcolladotor
Created April 11, 2019 14:30
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save lcolladotor/55d3d52de3505b0bfdb5c42e0bb88b2a to your computer and use it in GitHub Desktop.
Save lcolladotor/55d3d52de3505b0bfdb5c42e0bb88b2a to your computer and use it in GitHub Desktop.
Check why some samples have scale_counts > 40 million as reported by Princy Parsana on 09/10/2018
## Check why some samples have scale_counts > 40 million as reported by Princy Parsana on 09/10/2018
library('recount')
library('devtools')
load('/dcl01/leek/data/recount-website/rse/rse_tcga/TCGA/rse_gene.Rdata', verbose = TRUE)
rse_gene_tcga <- rse_gene
rse_scaled_tcga <- scale_counts(rse_gene_tcga, round = FALSE)
nread_tcga <- colSums(assays(rse_scaled_tcga)$counts) / 1e6
table(nread_tcga > 40)
summary(nread_tcga)
# > table(nread_tcga > 40)
#
# FALSE TRUE
# 8030 3254
# > summary(nread_tcga)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 7.809 38.455 39.433 38.582 40.092 42.734
ov <- countOverlaps(rowRanges(rse_gene))
table(ov)
# > table(ov)
# ov
# 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 19 20 21
# 42114 12547 2283 608 239 95 36 19 15 6 4 11 5 1 6 11 1 4 2 2
# 22 23 25 26 28 60 81 96
# 3 1 2 18 1 1 1 1
nread_tcga_noov <- colSums(assays(rse_scaled_tcga)$counts[ov == 1, ]) / 1e6
table(nread_tcga_noov > 40)
summary(nread_tcga_noov)
# > table(nread_tcga_noov > 40)
#
# FALSE
# 11284
# > summary(nread_tcga_noov)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 5.008 25.432 26.275 25.842 26.976 32.745
load('/dcl01/leek/data/recount-website/rse/rse_sra/DRP000366/rse_exon.Rdata', verbose = TRUE)
ov_ex <- countOverlaps(rowRanges(rse_exon))
table(ov_ex)
table(countOverlaps(rowRanges(rse_exon)[strand(rowRanges(rse_exon)) == "+"]))
table(countOverlaps(rowRanges(rse_exon)[strand(rowRanges(rse_exon)) == "-"]))
# > table(ov_ex)
# ov_ex
# 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
# 576313 21592 3326 957 375 158 96 48 48 27 17 18 9 6 35 9 3
# 18 19 20 21 22 23 24 25 26 28 30 31 37 47
# 6 2 4 1 50 2 5 1 2 19 1 7 1 1
# > table(countOverlaps(rowRanges(rse_exon)[strand(rowRanges(rse_exon)) == "+"]))
#
# 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
# 290987 11075 1743 500 168 79 55 30 39 22 9 11 6 3 34 9 3
# 18 19 20 21 22 23 24 25 26 28 30 31 37 47
# 6 2 2 1 49 2 5 1 2 18 1 7 1 1
# > table(countOverlaps(rowRanges(rse_exon)[strand(rowRanges(rse_exon)) == "-"]))
#
# 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 20 22
# 285326 10517 1583 457 207 79 41 18 9 5 8 7 3 3 1 2 1
# 28
# 1
gene_info <- reproduce_ranges('both')
# > class(gene_info)
# [1] "list"
# > names(gene_info)
# [1] "exon" "gene"
# > class(gene_info$exon)
# [1] "CompressedGRangesList"
# attr(,"package")
# [1] "GenomicRanges"
ov_gene_by_exon <- countOverlaps(gene_info$exon)
table(ov_gene_by_exon)
# 1 2 3 4 5 6 7 8 9 10 11 15 16 22
# 52800 4083 876 156 48 18 5 3 9 1 1 14 1 22
nread_tcga_noov2 <- colSums(assays(rse_scaled_tcga)$counts[ov_gene_by_exon == 1, ]) / 1e6
table(nread_tcga_noov2 > 40)
summary(nread_tcga_noov2)
# > table(nread_tcga_noov2 > 40)
#
# FALSE
# 11284
# > summary(nread_tcga_noov2)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 6.284 31.240 32.374 31.668 33.185 36.570
load('/dcl01/leek/data/recount-website/rse/rse_gtex/SRP012682/rse_gene.Rdata', verbose = TRUE)
rse_gene_gtex <- rse_gene
rse_scaled_gtex <- scale_counts(rse_gene_gtex, round = FALSE)
nread_gtex <- colSums(assays(rse_scaled_gtex)$counts) / 1e6
table(nread_gtex > 40)
summary(nread_gtex)
# > table(nread_gtex > 40)
#
# FALSE TRUE
# 9349 313
# > summary(nread_gtex)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 10.60 35.96 37.40 37.01 38.40 41.60
nread_gtex_noov <- colSums(assays(rse_scaled_gtex)$counts[ov == 1, ]) / 1e6
table(nread_gtex_noov > 40)
summary(nread_gtex_noov)
# > table(nread_gtex_noov > 40)
#
# FALSE
# 9662
# > summary(nread_gtex_noov)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 7.34 23.54 24.68 24.67 25.64 38.59
nread_gtex_noov2 <- colSums(assays(rse_scaled_gtex)$counts[ov_gene_by_exon == 1, ]) / 1e6
table(nread_gtex_noov2 > 40)
summary(nread_gtex_noov2)
# > table(nread_gtex_noov2 > 40)
#
# FALSE
# 9662
# > summary(nread_gtex_noov2)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 8.853 28.506 29.832 29.773 31.146 39.045
load('/dcl01/leek/data/recount-website/rse/rse_sra/all/rse_gene.Rdata', verbose = TRUE)
rse_gene_sra <- rse_gene
rse_scaled_sra <- scale_counts(rse_gene_sra, round = FALSE)
nread_sra <- colSums(assays(rse_scaled_sra)$counts) / 1e6
table(nread_sra > 40)
summary(nread_sra)
# > table(nread_sra > 40)
#
# FALSE TRUE
# 46427 3230
# > summary(nread_sra)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 0.04148 26.17371 35.12996 31.17124 38.04915 78.42058
nread_sra_noov <- colSums(assays(rse_scaled_sra)$counts[ov == 1, ]) / 1e6
table(nread_sra_noov > 40)
summary(nread_sra_noov)
# > table(nread_sra_noov > 40)
#
# FALSE TRUE
# 49642 15
# > summary(nread_sra_noov)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 0.00401 15.68622 22.62987 19.95534 25.02280 56.10205
i <- which(nread_sra_noov > 40)
# > nread_sra_noov[i]
# SRR039611 SRR039612 SRR039614 SRR039615 SRR039616 SRR039617 SRR039618
# 50.54406 49.78613 51.65518 41.18782 51.31583 50.21475 47.56245
# SRR039619 SRR039620 SRR039621 SRR039624 SRR191578 SRR1044498 SRR1044598
# 56.10205 42.22048 51.93728 46.13629 41.90084 40.49189 49.54119
# SRR1044599
# 48.83405
options(width = 120)
colData(rse_scaled_sra)[i, c('sample', 'read_count_as_reported_by_sra', 'reads_downloaded', 'paired_end', 'mapped_read_count', 'auc')]
# > colData(rse_scaled_sra)[i, c('sample', 'read_count_as_reported_by_sra', 'reads_downloaded', 'paired_end', 'mapped_read_count', 'auc')]
# DataFrame with 15 rows and 6 columns
# sample read_count_as_reported_by_sra reads_downloaded paired_end mapped_read_count auc
# <character> <integer> <integer> <logical> <integer> <numeric>
# SRR039611 SRS059014 9277378 9277378 FALSE 8065356 179937063
# SRR039612 SRS059015 15656640 15656640 FALSE 12701639 283726804
# SRR039614 SRS059017 9431188 9431188 FALSE 8606523 191313032
# SRR039615 SRS059018 9288038 9288038 FALSE 7868109 173004663
# SRR039616 SRS059019 10206748 10206748 FALSE 6755063 150508814
# ... ... ... ... ... ... ...
# SRR039624 SRS059027 16427267 16427267 FALSE 13070019 302682401
# SRR191578 SRS190638 500705 500705 FALSE 385932 8663519
# SRR1044498 SRS512177 3127806 3127806 FALSE 1722674 39041365
# SRR1044598 SRS512276 1290214 1290214 FALSE 1167335 26490287
# SRR1044599 SRS512277 1789104 1789104 FALSE 1637205 37754577
colData(rse_scaled_sra)$mapped_read_count[i] / colData(rse_scaled_sra)$reads_downloaded[i] * 100
summary(colData(rse_scaled_sra)$mapped_read_count[i] / colData(rse_scaled_sra)$reads_downloaded[i] * 100)
# > colData(rse_scaled_sra)$mapped_read_count[i] / colData(rse_scaled_sra)$reads_downloaded[i] * 100
# [1] 86.93573 81.12621 91.25598 84.71228 66.18232 69.58729 72.36412 67.26457 79.95167 82.68239 79.56295 77.07772
# [13] 55.07611 90.47608 91.50977
# > summary(colData(rse_scaled_sra)$mapped_read_count[i] / colData(rse_scaled_sra)$reads_downloaded[i] * 100)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 55.08 70.98 79.95 78.38 85.82 91.51
nread_sra_noov2 <- colSums(assays(rse_scaled_sra)$counts[ov_gene_by_exon == 1, ]) / 1e6
table(nread_sra_noov2 > 40)
summary(nread_sra_noov2)
summary(nread_sra_noov2[nread_sra_noov2 > 40])
# > table(nread_sra_noov2 > 40)
#
# FALSE TRUE
# 49188 469
# > summary(nread_sra_noov2)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 0.04019 20.59794 28.42783 25.46419 31.25098 78.40682
# > summary(nread_sra_noov2[nread_sra_noov2 > 40])
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 40.00 41.48 45.38 49.80 51.86 78.41
## Reproducibility information
print('Reproducibility information:')
Sys.time()
proc.time()
options(width = 120)
session_info()
# Session info ----------------------------------------------------------------------------------------------------------
# setting value
# version R version 3.5.0 Patched (2018-04-30 r74679)
# system x86_64, linux-gnu
# ui X11
# language (EN)
# collate en_US.UTF-8
# tz US/Eastern
# date 2018-09-10
#
# Packages --------------------------------------------------------------------------------------------------------------
# package * version date source
# acepack 1.4.1 2016-10-29 CRAN (R 3.5.0)
# AnnotationDbi * 1.42.1 2018-05-17 Bioconductor
# assertthat 0.2.0 2017-04-11 CRAN (R 3.5.0)
# backports 1.1.2 2017-12-13 CRAN (R 3.5.0)
# base * 3.5.0 2018-05-02 local
# base64enc 0.1-3 2015-07-28 CRAN (R 3.5.0)
# bibtex 0.4.2 2017-06-30 CRAN (R 3.5.0)
# bindr 0.1.1 2018-03-13 CRAN (R 3.5.0)
# bindrcpp 0.2.2 2018-03-29 CRAN (R 3.5.0)
# Biobase * 2.40.0 2018-05-02 Bioconductor
# BiocGenerics * 0.26.0 2018-05-03 Bioconductor
# BiocParallel * 1.14.2 2018-07-08 Bioconductor
# biomaRt 2.36.1 2018-06-13 Bioconductor
# Biostrings 2.48.0 2018-05-03 Bioconductor
# bit 1.1-14 2018-05-29 CRAN (R 3.5.0)
# bit64 0.9-7 2017-05-08 CRAN (R 3.5.0)
# bitops 1.0-6 2013-08-17 CRAN (R 3.5.0)
# blob 1.1.1 2018-03-25 CRAN (R 3.5.0)
# BSgenome 1.48.0 2018-05-03 Bioconductor
# bumphunter 1.22.0 2018-05-03 Bioconductor
# checkmate 1.8.5 2017-10-24 CRAN (R 3.5.0)
# cluster 2.0.7-1 2018-04-13 CRAN (R 3.5.0)
# codetools 0.2-15 2016-10-05 CRAN (R 3.5.0)
# colorout * 1.2-0 2018-05-02 Github (jalvesaq/colorout@c42088d)
# colorspace 1.3-2 2016-12-14 CRAN (R 3.5.0)
# compiler 3.5.0 2018-05-02 local
# crayon 1.3.4 2017-09-16 CRAN (R 3.5.0)
# data.table 1.11.4 2018-05-27 cran (@1.11.4)
# datasets * 3.5.0 2018-05-02 local
# DBI 1.0.0 2018-05-02 CRAN (R 3.5.0)
# DelayedArray * 0.6.2 2018-07-23 Bioconductor
# derfinder 1.14.0 2018-05-03 Bioconductor
# derfinderHelper 1.14.0 2018-05-03 Bioconductor
# devtools * 1.13.6 2018-06-27 CRAN (R 3.5.0)
# digest 0.6.15 2018-01-28 CRAN (R 3.5.0)
# doRNG 1.7.1 2018-06-22 CRAN (R 3.5.0)
# downloader 0.4 2015-07-09 CRAN (R 3.5.0)
# dplyr 0.7.6 2018-06-29 CRAN (R 3.5.0)
# foreach 1.4.4 2017-12-12 CRAN (R 3.5.0)
# foreign 0.8-70 2017-11-28 CRAN (R 3.5.0)
# Formula 1.2-3 2018-05-03 CRAN (R 3.5.0)
# GenomeInfoDb * 1.16.0 2018-05-03 Bioconductor
# GenomeInfoDbData 1.1.0 2018-04-17 Bioconductor
# GenomicAlignments 1.16.0 2018-05-03 Bioconductor
# GenomicFeatures * 1.32.0 2018-05-03 Bioconductor
# GenomicFiles 1.16.0 2018-05-03 Bioconductor
# GenomicRanges * 1.32.6 2018-07-20 Bioconductor
# GEOquery 2.48.0 2018-05-02 Bioconductor
# ggplot2 3.0.0 2018-07-03 CRAN (R 3.5.0)
# glue 1.3.0 2018-07-17 CRAN (R 3.5.0)
# graphics * 3.5.0 2018-05-02 local
# grDevices * 3.5.0 2018-05-02 local
# grid 3.5.0 2018-05-02 local
# gridExtra 2.3 2017-09-09 CRAN (R 3.5.0)
# gtable 0.2.0 2016-02-26 CRAN (R 3.5.0)
# Hmisc 4.1-1 2018-01-03 CRAN (R 3.5.0)
# hms 0.4.2 2018-03-10 CRAN (R 3.5.0)
# htmlTable 1.12 2018-05-26 CRAN (R 3.5.0)
# htmltools 0.3.6 2017-04-28 CRAN (R 3.5.0)
# htmlwidgets 1.2 2018-04-19 CRAN (R 3.5.0)
# httpuv 1.4.5 2018-07-19 CRAN (R 3.5.0)
# httr 1.3.1 2017-08-20 CRAN (R 3.5.0)
# IRanges * 2.14.10 2018-05-17 Bioconductor
# iterators 1.0.10 2018-07-13 CRAN (R 3.5.0)
# jsonlite 1.5 2017-06-01 CRAN (R 3.5.0)
# knitr 1.20 2018-02-20 CRAN (R 3.5.0)
# later 0.7.3 2018-06-08 CRAN (R 3.5.0)
# lattice 0.20-35 2017-03-25 CRAN (R 3.5.0)
# latticeExtra 0.6-28 2016-02-09 CRAN (R 3.5.0)
# lazyeval 0.2.1 2017-10-29 CRAN (R 3.5.0)
# limma 3.36.2 2018-06-21 Bioconductor
# locfit 1.5-9.1 2013-04-20 CRAN (R 3.5.0)
# magrittr 1.5 2014-11-22 CRAN (R 3.5.0)
# Matrix 1.2-14 2018-04-13 CRAN (R 3.5.0)
# matrixStats * 0.54.0 2018-07-23 CRAN (R 3.5.0)
# memoise 1.1.0 2017-04-21 CRAN (R 3.5.0)
# methods * 3.5.0 2018-05-02 local
# munsell 0.5.0 2018-06-12 CRAN (R 3.5.0)
# nnet 7.3-12 2016-02-02 CRAN (R 3.5.0)
# org.Hs.eg.db * 3.6.0 2018-05-03 Bioconductor
# parallel * 3.5.0 2018-05-02 local
# pillar 1.3.0 2018-07-14 CRAN (R 3.5.0)
# pkgconfig 2.0.1 2017-03-21 CRAN (R 3.5.0)
# pkgmaker 0.27 2018-05-25 CRAN (R 3.5.0)
# plyr 1.8.4 2016-06-08 CRAN (R 3.5.0)
# png 0.1-7 2013-12-03 CRAN (R 3.5.0)
# prettyunits 1.0.2 2015-07-13 CRAN (R 3.5.0)
# progress 1.2.0 2018-06-14 CRAN (R 3.5.0)
# promises 1.0.1 2018-04-13 CRAN (R 3.5.0)
# purrr 0.2.5 2018-05-29 CRAN (R 3.5.0)
# qvalue 2.12.0 2018-05-03 Bioconductor
# R6 2.2.2 2017-06-17 CRAN (R 3.5.0)
# RColorBrewer 1.1-2 2014-12-07 CRAN (R 3.5.0)
# Rcpp 0.12.18 2018-07-23 CRAN (R 3.5.0)
# RCurl 1.95-4.11 2018-07-15 CRAN (R 3.5.0)
# readr 1.1.1 2017-05-16 CRAN (R 3.5.0)
# recount * 1.6.2 2018-05-17 Bioconductor
# registry 0.5 2017-12-03 CRAN (R 3.5.0)
# rentrez 1.2.1 2018-03-05 CRAN (R 3.5.0)
# reshape2 1.4.3 2017-12-11 CRAN (R 3.5.0)
# rlang 0.2.1 2018-05-30 cran (@0.2.1)
# rmote * 0.3.4 2018-05-02 deltarho (R 3.5.0)
# rngtools 1.3.1 2018-05-15 CRAN (R 3.5.0)
# rpart 4.1-13 2018-02-23 CRAN (R 3.5.0)
# Rsamtools 1.32.2 2018-07-03 Bioconductor
# RSQLite 2.1.1 2018-05-06 CRAN (R 3.5.0)
# rstudioapi 0.7 2017-09-07 CRAN (R 3.5.0)
# rtracklayer 1.40.3 2018-06-13 Bioconductor
# S4Vectors * 0.18.3 2018-06-13 Bioconductor
# scales 0.5.0 2017-08-24 CRAN (R 3.5.0)
# servr 0.10 2018-05-30 CRAN (R 3.5.0)
# splines 3.5.0 2018-05-02 local
# stats * 3.5.0 2018-05-02 local
# stats4 * 3.5.0 2018-05-02 local
# stringi 1.2.4 2018-07-20 CRAN (R 3.5.0)
# stringr 1.3.1 2018-05-10 CRAN (R 3.5.0)
# SummarizedExperiment * 1.10.1 2018-05-17 Bioconductor
# survival 2.42-3 2018-04-16 CRAN (R 3.5.0)
# tibble 1.4.2 2018-01-22 CRAN (R 3.5.0)
# tidyr 0.8.1 2018-05-18 CRAN (R 3.5.0)
# tidyselect 0.2.4 2018-02-26 CRAN (R 3.5.0)
# tools 3.5.0 2018-05-02 local
# utils * 3.5.0 2018-05-02 local
# VariantAnnotation 1.26.1 2018-07-04 Bioconductor
# withr 2.1.2 2018-03-15 CRAN (R 3.5.0)
# xfun 0.3 2018-07-06 CRAN (R 3.5.0)
# XML 3.98-1.12 2018-07-15 CRAN (R 3.5.0)
# xml2 1.2.0 2018-01-24 CRAN (R 3.5.0)
# xtable 1.8-2 2016-02-05 CRAN (R 3.5.0)
# XVector 0.20.0 2018-05-03 Bioconductor
# zlibbioc 1.26.0 2018-05-02 Bioconductor
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment