Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Represent aligned DNA sequences as a DNAStringSet based on position and CIGAR
library(Rsamtools)
.cigarAlignInput <-
function(file, param, what)
{
result <- readBamGappedAlignments(file, param=param)
names(mcols(result))[names(mcols(result)) == what] <- "what"
result
}
.cigarAlignGrouped <-
function(x, pad)
{
if (length(pad)) {
start <- start(pad)
width <- end(pad) - start + 1L
} else {
start <- min(start(x))
width <- max(end(x)) - start + 1L
}
pos <- start(x) - start + 1L
## group into disjoint bins
caln <- cigarToIRangesListByAlignment(cigar(x), pos,
drop.D.ranges=TRUE,
drop.empty.ranges=TRUE)
caln <- restrict(caln, 1L, width, keep.all.ranges=TRUE)
lvl0 <- disjointBins(unlist(range(caln)))
lvl <- factor(lvl0, levels=seq_along(unique(lvl0)))
## views on unlist(x$what), what=="seq", "qual"
vpos <- cumsum(width(mcols(x)$what)) - (width(mcols(x)$what)[1] - 1L)
valn <- cigarToIRangesListByAlignment(cigar(x), vpos,
drop.D.ranges=TRUE,
drop.empty.ranges=TRUE)
## gap sequences
galn <- cigarToIRangesListByRName(cigar(x), lvl, pos,
drop.D.ranges=TRUE,
drop.empty.ranges=TRUE)
gaps <- gaps(galn, start=1L, end=width)
## group valn and galn
alllvl <- factor(c(rep(lvl, elementLengths(caln)),
rep(levels(lvl), elementLengths(gaps))))
allpos <- unname(c(unlist(start(caln)), unlist(start(gaps))))
o <- order(alllvl, allpos)
dnaLength <- sum(width(mcols(x)$what))
ugaps <- unlist(gaps)
nDash <- max(width(ugaps))
vseq <- c(unlist(valn), shift(ugaps, dnaLength - start(ugaps) + 1L))[o]
if (is(mcols(x)$what, "DNAStringSet")) {
WhatString <- DNAString
WhatStringSet <- DNAStringSet
} else {
WhatString <- BString
WhatStringSet <- BStringSet
}
dna <- c(unlist(mcols(x)$what),
WhatString(paste(rep("-", nDash), collapse="")))
dnaString <- unlist(WhatStringSet(Views(dna, vseq)))
rng <- successiveIRanges(rep(length(dnaString) / nlevels(lvl), nlevels(lvl)))
v <- Views(dnaString, rng)
metadata(v) <- list(start=start)
v
}
cigarAlign <-
function(file, param, ..., pad=FALSE)
{
what0 <- c("qual", "seq")
idx <- what0 %in% bamWhat(param)
if (sum(idx) != 1L)
stop("'what' must contain exactly one of ",
paste(sQuote(what0), collapse=", "))
what <- what0[idx]
len <-elementLengths(bamWhich(param))
if (sum(len != 0L) != 1L)
stop("'which' must have exactly 1 non-zero range")
x <- .cigarAlignInput(file, param, what)
if (pad)
pad <- unlist(bamWhich(param))
else pad <- IRanges()
.cigarAlignGrouped(x, pad)
}
## fls <- dir(file.path(wd, "bam"), "bam$", full=TRUE)
## bfl <- BamFileList(setNames(fls, sub(".bam$", "", basename(fls))))
## which <- GRanges("5", IRanges(1248187, 1300264))
## param <- ScanBamParam(what="seq", which=which)
## aln <- cigarAlign(bfl[[1]], param=param, pad=TRUE)
library(RColorBrewer)
.dna_variant_pal <- brewer.pal(4, "Paired")[c(1, 3, 4, 2)]
variantplot1 <-
function(aln, ...)
{
## dot plot of nucleotide frequencies
m <- consensusMatrix(aln)[1:4,]
matplot(t(m) / colSums(m), pch=".",
col=.dna_variant_pal, ...)
}
## variantplot1(aln, cex=2)
variantplot2 <-
function(aln)
{
## pileup
alf <- alphabet(aln)
map <- setNames(seq_along(alf), alf)
m <- matrix(map[unlist(strsplit(as.character(aln), ""))],
nrow=length(aln), byrow=TRUE)
m[m > 4] <- NA
n <- rowSums(!is.na(m))
o <- order(n, decreasing=TRUE)
o <- o[n[o] != 0L]
image(t(m[o,]), col=.dna_variant_pal)
}
## variantplot2(subviews(aln, 1, 1000))
variantplot3 <-
function(aln)
{
## pileup, same order as variantplot2
alf <- alphabet(aln)
map <- setNames(seq_along(alf), alf)
m <- matrix(map[unlist(strsplit(as.character(aln), ""))],
nrow=length(aln), byrow=TRUE)
m[m > 4] <- NA
n <- rowSums(!is.na(m))
o <- order(n, decreasing=TRUE)
o <- o[n[o] != 0]
x <- consensusMatrix(aln)[1:4,]
consensus <- apply(x, 2, which.max)
m[t(t(m) == consensus)] <- NA
image(t(m[o,]), col=.dna_variant_pal)
}
## variantplot3(subviews(aln, 1, 1000))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.