Skip to content

Instantly share code, notes, and snippets.

@lcolladotor
Last active August 29, 2015 14:15
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save lcolladotor/76f2efdd5d5108772696 to your computer and use it in GitHub Desktop.
Save lcolladotor/76f2efdd5d5108772696 to your computer and use it in GitHub Desktop.
> library('GenomicRanges')
Loading required package: BiocGenerics
Loading required package: parallel
Attaching package: ‘BiocGenerics’
The following objects are masked from ‘package:parallel’:
clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
clusterExport, clusterMap, parApply, parCapply, parLapply,
parLapplyLB, parRapply, parSapply, parSapplyLB
The following object is masked from ‘package:stats’:
xtabs
The following objects are masked from ‘package:base’:
anyDuplicated, append, as.data.frame, as.vector, cbind, colnames,
do.call, duplicated, eval, evalq, Filter, Find, get, intersect,
is.unsorted, lapply, Map, mapply, match, mget, order, paste, pmax,
pmax.int, pmin, pmin.int, Position, rank, rbind, Reduce, rep.int,
rownames, sapply, setdiff, sort, table, tapply, union, unique,
unlist, unsplit
Loading required package: S4Vectors
Loading required package: stats4
Loading required package: IRanges
Loading required package: GenomeInfoDb
> library('GenomicAlignments')
Loading required package: Biostrings
Loading required package: XVector
Loading required package: Rsamtools
>
> ## Basically http://www.ebi.ac.uk/arrayexpress/files/E-GEUV-6/NA06985_accepted_hits.bam
> ## You have to create the bam.bai file
> bam <- BamFile('geuvadis/tophat/G/NA06985/accepted_hits.bam')
>
> ## Info from a given gene (CD83 from hg19)
> gr <- GRanges(seqnames = 'chr6', IRanges(start = c(14117865, 14118181, 14131751, 14133880, 14135342, 14117487, 14118181, 14131751, 14133880, 14135339, 14117865, 14118181, 14131751, 14133880, 14135339), end = c(14118079, 14118296, 14131979, 14133986, 14137148, 14117530, 14118296, 14131979, 14133986, 14137148, 14118079, 14118296, 14131979, 14133986, 14137148)), strand = '+')
>
> ## Read alignments
> aln <- readGAlignments(bam, param = ScanBamParam(which = gr, what = 'qname'))
>
> ## Should be getting similar if I used the range, no?
> range(gr)
GRanges object with 1 range and 0 metadata columns:
seqnames ranges strand
<Rle> <IRanges> <Rle>
[1] chr6 [14117487, 14137148] +
-------
seqinfo: 1 sequence from an unspecified genome; no seqlengths
> aln_range <- readGAlignments(bam, param = ScanBamParam(which = range(gr), what = 'qname'))
>
> ## Or even a bit more if I add some padding for reads that map a long distance away
> aln_range_resize <- readGAlignments(bam, param = ScanBamParam(which = resize(range(gr), width = width(range(gr)) + 2e6, fix = 'center'), what = 'qname'))
>
> length(aln)
[1] 25965
> length(aln_range)
[1] 8135
> ## Padding helps, but why was it not necessary with the individual ranges?
> length(aln_range_resize)
[1] 24399
>
> ## Maximum coverage changes
> max(coverage(aln)[['chr6']][start(range(gr)):end(range(gr))])
[1] 1557
> max(coverage(aln_range)[['chr6']][start(range(gr)):end(range(gr))])
[1] 434
> ## Max is the same with padding
> max(coverage(aln_range_resize)[['chr6']][start(range(gr)):end(range(gr))])
[1] 434
>
> ## Mean?
> mean(coverage(aln)[['chr6']][start(range(gr)):end(range(gr))])
[1] 99.03367
> mean(coverage(aln_range)[['chr6']][start(range(gr)):end(range(gr))])
[1] 31.02899
> ## So is the mean.. hm..
> mean(coverage(aln_range_resize)[['chr6']][start(range(gr)):end(range(gr))])
[1] 31.02899
>
> ## QNAME?
> max(table(mcols(aln)$qname))
[1] 12
> max(table(mcols(aln_range)$qname))
[1] 2
> max(table(mcols(aln_range_resize)$qname))
[1] 10
>
> ## One range at a time
> one <- lapply(gr, function(x) {
+ coverage(readGAlignments(bam, param = ScanBamParam(which = x)))[['chr6']][start(range(gr)):end(range(gr))]
+ })
> ## Compare between coverage one at a time versus using which = gr
> Reduce('+', one) - coverage(aln)[['chr6']][start(range(gr)):end(range(gr))]
integer-Rle of length 19662 with 1 run
Lengths: 19662
Values : 0
>
> ## Aha! Reads are being duplicated!
>
> ## It's because some ranges overlap
> countOverlaps(gr)
[1] 2 3 3 3 3 1 3 3 3 3 2 3 3 3 3
>
> devtools::session_info()
Session info -------------------------------------------------------------------
setting value
version R version 3.1.1 Patched (2014-10-16 r66782)
system x86_64, linux-gnu
ui X11
language (EN)
collate en_US.UTF-8
tz <NA>
Packages -----------------------------------------------------------------------
package * version date source
base64enc * 0.1-2 2014-06-26 CRAN (R 3.1.0)
BatchJobs * 1.5 2014-10-30 CRAN (R 3.1.1)
BBmisc * 1.9 2015-02-03 CRAN (R 3.1.1)
BiocGenerics 0.12.1 2014-11-15 Bioconductor
BiocParallel * 1.0.3 2015-02-09 Bioconductor
Biostrings 2.34.1 2014-12-13 Bioconductor
bitops * 1.0-6 2013-08-17 CRAN (R 3.1.0)
brew * 1.0-6 2011-04-13 CRAN (R 3.1.0)
checkmate * 1.5.1 2014-12-14 CRAN (R 3.1.1)
codetools * 0.2-9 2014-08-21 CRAN (R 3.1.1)
colorout 1.0-2 2014-11-04 local
DBI * 0.3.1 2014-09-24 CRAN (R 3.1.1)
devtools * 1.7.0 2015-01-17 CRAN (R 3.1.1)
digest * 0.6.8 2014-12-31 CRAN (R 3.1.1)
fail * 1.2 2013-09-19 CRAN (R 3.1.0)
foreach * 1.4.2 2014-04-11 CRAN (R 3.1.0)
GenomeInfoDb 1.2.4 2014-12-20 Bioconductor
GenomicAlignments 1.2.1 2014-11-05 Bioconductor
GenomicRanges 1.18.4 2015-01-08 Bioconductor
IRanges 2.0.1 2014-12-13 Bioconductor
iterators * 1.0.7 2014-04-11 CRAN (R 3.1.0)
Rsamtools 1.18.2 2014-11-12 Bioconductor
RSQLite * 1.0.0 2014-10-25 CRAN (R 3.1.1)
rstudioapi * 0.2 2014-12-31 CRAN (R 3.1.1)
S4Vectors 0.4.0 2014-10-15 Bioconductor
sendmailR * 1.2-1 2014-09-21 CRAN (R 3.1.1)
stringr * 0.6.2 2012-12-06 CRAN (R 3.1.0)
XVector 0.6.0 2014-10-15 Bioconductor
zlibbioc * 1.12.0 2014-10-15 Bioconductor
>
library('GenomicRanges')
library('GenomicAlignments')
## Basically http://www.ebi.ac.uk/arrayexpress/files/E-GEUV-6/NA06985_accepted_hits.bam
## You have to create the bam.bai file
bam <- BamFile('geuvadis/tophat/G/NA06985/accepted_hits.bam')
## Info from a given gene (CD83 from hg19)
gr <- GRanges(seqnames = 'chr6', IRanges(start = c(14117865, 14118181, 14131751, 14133880, 14135342, 14117487, 14118181, 14131751, 14133880, 14135339, 14117865, 14118181, 14131751, 14133880, 14135339), end = c(14118079, 14118296, 14131979, 14133986, 14137148, 14117530, 14118296, 14131979, 14133986, 14137148, 14118079, 14118296, 14131979, 14133986, 14137148)), strand = '+')
## Read alignments
aln <- readGAlignments(bam, param = ScanBamParam(which = gr, what = 'qname'))
## Should be getting similar if I used the range, no?
range(gr)
aln_range <- readGAlignments(bam, param = ScanBamParam(which = range(gr), what = 'qname'))
## Or even a bit more if I add some padding for reads that map a long distance away
aln_range_resize <- readGAlignments(bam, param = ScanBamParam(which = resize(range(gr), width = width(range(gr)) + 2e6, fix = 'center'), what = 'qname'))
length(aln)
length(aln_range)
## Padding helps, but why was it not necessary with the individual ranges?
length(aln_range_resize)
## Maximum coverage changes
max(coverage(aln)[['chr6']][start(range(gr)):end(range(gr))])
max(coverage(aln_range)[['chr6']][start(range(gr)):end(range(gr))])
## Max is the same with padding
max(coverage(aln_range_resize)[['chr6']][start(range(gr)):end(range(gr))])
## Mean?
mean(coverage(aln)[['chr6']][start(range(gr)):end(range(gr))])
mean(coverage(aln_range)[['chr6']][start(range(gr)):end(range(gr))])
## So is the mean.. hm..
mean(coverage(aln_range_resize)[['chr6']][start(range(gr)):end(range(gr))])
## QNAME?
max(table(mcols(aln)$qname))
max(table(mcols(aln_range)$qname))
max(table(mcols(aln_range_resize)$qname))
## One range at a time
one <- lapply(gr, function(x) {
coverage(readGAlignments(bam, param = ScanBamParam(which = x)))[['chr6']][start(range(gr)):end(range(gr))]
})
## Compare between coverage one at a time versus using which = gr
Reduce('+', one) - coverage(aln)[['chr6']][start(range(gr)):end(range(gr))]
## Aha! Reads are being duplicated!
## It's because some ranges overlap
countOverlaps(gr)
devtools::session_info()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment