Last active
August 29, 2015 14:15
-
-
Save lcolladotor/76f2efdd5d5108772696 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
> library('GenomicRanges') | |
Loading required package: BiocGenerics | |
Loading required package: parallel | |
Attaching package: ‘BiocGenerics’ | |
The following objects are masked from ‘package:parallel’: | |
clusterApply, clusterApplyLB, clusterCall, clusterEvalQ, | |
clusterExport, clusterMap, parApply, parCapply, parLapply, | |
parLapplyLB, parRapply, parSapply, parSapplyLB | |
The following object is masked from ‘package:stats’: | |
xtabs | |
The following objects are masked from ‘package:base’: | |
anyDuplicated, append, as.data.frame, as.vector, cbind, colnames, | |
do.call, duplicated, eval, evalq, Filter, Find, get, intersect, | |
is.unsorted, lapply, Map, mapply, match, mget, order, paste, pmax, | |
pmax.int, pmin, pmin.int, Position, rank, rbind, Reduce, rep.int, | |
rownames, sapply, setdiff, sort, table, tapply, union, unique, | |
unlist, unsplit | |
Loading required package: S4Vectors | |
Loading required package: stats4 | |
Loading required package: IRanges | |
Loading required package: GenomeInfoDb | |
> library('GenomicAlignments') | |
Loading required package: Biostrings | |
Loading required package: XVector | |
Loading required package: Rsamtools | |
> | |
> ## Basically http://www.ebi.ac.uk/arrayexpress/files/E-GEUV-6/NA06985_accepted_hits.bam | |
> ## You have to create the bam.bai file | |
> bam <- BamFile('geuvadis/tophat/G/NA06985/accepted_hits.bam') | |
> | |
> ## Info from a given gene (CD83 from hg19) | |
> gr <- GRanges(seqnames = 'chr6', IRanges(start = c(14117865, 14118181, 14131751, 14133880, 14135342, 14117487, 14118181, 14131751, 14133880, 14135339, 14117865, 14118181, 14131751, 14133880, 14135339), end = c(14118079, 14118296, 14131979, 14133986, 14137148, 14117530, 14118296, 14131979, 14133986, 14137148, 14118079, 14118296, 14131979, 14133986, 14137148)), strand = '+') | |
> | |
> ## Read alignments | |
> aln <- readGAlignments(bam, param = ScanBamParam(which = gr, what = 'qname')) | |
> | |
> ## Should be getting similar if I used the range, no? | |
> range(gr) | |
GRanges object with 1 range and 0 metadata columns: | |
seqnames ranges strand | |
<Rle> <IRanges> <Rle> | |
[1] chr6 [14117487, 14137148] + | |
------- | |
seqinfo: 1 sequence from an unspecified genome; no seqlengths | |
> aln_range <- readGAlignments(bam, param = ScanBamParam(which = range(gr), what = 'qname')) | |
> | |
> ## Or even a bit more if I add some padding for reads that map a long distance away | |
> aln_range_resize <- readGAlignments(bam, param = ScanBamParam(which = resize(range(gr), width = width(range(gr)) + 2e6, fix = 'center'), what = 'qname')) | |
> | |
> length(aln) | |
[1] 25965 | |
> length(aln_range) | |
[1] 8135 | |
> ## Padding helps, but why was it not necessary with the individual ranges? | |
> length(aln_range_resize) | |
[1] 24399 | |
> | |
> ## Maximum coverage changes | |
> max(coverage(aln)[['chr6']][start(range(gr)):end(range(gr))]) | |
[1] 1557 | |
> max(coverage(aln_range)[['chr6']][start(range(gr)):end(range(gr))]) | |
[1] 434 | |
> ## Max is the same with padding | |
> max(coverage(aln_range_resize)[['chr6']][start(range(gr)):end(range(gr))]) | |
[1] 434 | |
> | |
> ## Mean? | |
> mean(coverage(aln)[['chr6']][start(range(gr)):end(range(gr))]) | |
[1] 99.03367 | |
> mean(coverage(aln_range)[['chr6']][start(range(gr)):end(range(gr))]) | |
[1] 31.02899 | |
> ## So is the mean.. hm.. | |
> mean(coverage(aln_range_resize)[['chr6']][start(range(gr)):end(range(gr))]) | |
[1] 31.02899 | |
> | |
> ## QNAME? | |
> max(table(mcols(aln)$qname)) | |
[1] 12 | |
> max(table(mcols(aln_range)$qname)) | |
[1] 2 | |
> max(table(mcols(aln_range_resize)$qname)) | |
[1] 10 | |
> | |
> ## One range at a time | |
> one <- lapply(gr, function(x) { | |
+ coverage(readGAlignments(bam, param = ScanBamParam(which = x)))[['chr6']][start(range(gr)):end(range(gr))] | |
+ }) | |
> ## Compare between coverage one at a time versus using which = gr | |
> Reduce('+', one) - coverage(aln)[['chr6']][start(range(gr)):end(range(gr))] | |
integer-Rle of length 19662 with 1 run | |
Lengths: 19662 | |
Values : 0 | |
> | |
> ## Aha! Reads are being duplicated! | |
> | |
> ## It's because some ranges overlap | |
> countOverlaps(gr) | |
[1] 2 3 3 3 3 1 3 3 3 3 2 3 3 3 3 | |
> | |
> devtools::session_info() | |
Session info ------------------------------------------------------------------- | |
setting value | |
version R version 3.1.1 Patched (2014-10-16 r66782) | |
system x86_64, linux-gnu | |
ui X11 | |
language (EN) | |
collate en_US.UTF-8 | |
tz <NA> | |
Packages ----------------------------------------------------------------------- | |
package * version date source | |
base64enc * 0.1-2 2014-06-26 CRAN (R 3.1.0) | |
BatchJobs * 1.5 2014-10-30 CRAN (R 3.1.1) | |
BBmisc * 1.9 2015-02-03 CRAN (R 3.1.1) | |
BiocGenerics 0.12.1 2014-11-15 Bioconductor | |
BiocParallel * 1.0.3 2015-02-09 Bioconductor | |
Biostrings 2.34.1 2014-12-13 Bioconductor | |
bitops * 1.0-6 2013-08-17 CRAN (R 3.1.0) | |
brew * 1.0-6 2011-04-13 CRAN (R 3.1.0) | |
checkmate * 1.5.1 2014-12-14 CRAN (R 3.1.1) | |
codetools * 0.2-9 2014-08-21 CRAN (R 3.1.1) | |
colorout 1.0-2 2014-11-04 local | |
DBI * 0.3.1 2014-09-24 CRAN (R 3.1.1) | |
devtools * 1.7.0 2015-01-17 CRAN (R 3.1.1) | |
digest * 0.6.8 2014-12-31 CRAN (R 3.1.1) | |
fail * 1.2 2013-09-19 CRAN (R 3.1.0) | |
foreach * 1.4.2 2014-04-11 CRAN (R 3.1.0) | |
GenomeInfoDb 1.2.4 2014-12-20 Bioconductor | |
GenomicAlignments 1.2.1 2014-11-05 Bioconductor | |
GenomicRanges 1.18.4 2015-01-08 Bioconductor | |
IRanges 2.0.1 2014-12-13 Bioconductor | |
iterators * 1.0.7 2014-04-11 CRAN (R 3.1.0) | |
Rsamtools 1.18.2 2014-11-12 Bioconductor | |
RSQLite * 1.0.0 2014-10-25 CRAN (R 3.1.1) | |
rstudioapi * 0.2 2014-12-31 CRAN (R 3.1.1) | |
S4Vectors 0.4.0 2014-10-15 Bioconductor | |
sendmailR * 1.2-1 2014-09-21 CRAN (R 3.1.1) | |
stringr * 0.6.2 2012-12-06 CRAN (R 3.1.0) | |
XVector 0.6.0 2014-10-15 Bioconductor | |
zlibbioc * 1.12.0 2014-10-15 Bioconductor | |
> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library('GenomicRanges') | |
library('GenomicAlignments') | |
## Basically http://www.ebi.ac.uk/arrayexpress/files/E-GEUV-6/NA06985_accepted_hits.bam | |
## You have to create the bam.bai file | |
bam <- BamFile('geuvadis/tophat/G/NA06985/accepted_hits.bam') | |
## Info from a given gene (CD83 from hg19) | |
gr <- GRanges(seqnames = 'chr6', IRanges(start = c(14117865, 14118181, 14131751, 14133880, 14135342, 14117487, 14118181, 14131751, 14133880, 14135339, 14117865, 14118181, 14131751, 14133880, 14135339), end = c(14118079, 14118296, 14131979, 14133986, 14137148, 14117530, 14118296, 14131979, 14133986, 14137148, 14118079, 14118296, 14131979, 14133986, 14137148)), strand = '+') | |
## Read alignments | |
aln <- readGAlignments(bam, param = ScanBamParam(which = gr, what = 'qname')) | |
## Should be getting similar if I used the range, no? | |
range(gr) | |
aln_range <- readGAlignments(bam, param = ScanBamParam(which = range(gr), what = 'qname')) | |
## Or even a bit more if I add some padding for reads that map a long distance away | |
aln_range_resize <- readGAlignments(bam, param = ScanBamParam(which = resize(range(gr), width = width(range(gr)) + 2e6, fix = 'center'), what = 'qname')) | |
length(aln) | |
length(aln_range) | |
## Padding helps, but why was it not necessary with the individual ranges? | |
length(aln_range_resize) | |
## Maximum coverage changes | |
max(coverage(aln)[['chr6']][start(range(gr)):end(range(gr))]) | |
max(coverage(aln_range)[['chr6']][start(range(gr)):end(range(gr))]) | |
## Max is the same with padding | |
max(coverage(aln_range_resize)[['chr6']][start(range(gr)):end(range(gr))]) | |
## Mean? | |
mean(coverage(aln)[['chr6']][start(range(gr)):end(range(gr))]) | |
mean(coverage(aln_range)[['chr6']][start(range(gr)):end(range(gr))]) | |
## So is the mean.. hm.. | |
mean(coverage(aln_range_resize)[['chr6']][start(range(gr)):end(range(gr))]) | |
## QNAME? | |
max(table(mcols(aln)$qname)) | |
max(table(mcols(aln_range)$qname)) | |
max(table(mcols(aln_range_resize)$qname)) | |
## One range at a time | |
one <- lapply(gr, function(x) { | |
coverage(readGAlignments(bam, param = ScanBamParam(which = x)))[['chr6']][start(range(gr)):end(range(gr))] | |
}) | |
## Compare between coverage one at a time versus using which = gr | |
Reduce('+', one) - coverage(aln)[['chr6']][start(range(gr)):end(range(gr))] | |
## Aha! Reads are being duplicated! | |
## It's because some ranges overlap | |
countOverlaps(gr) | |
devtools::session_info() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment