Skip to content

Instantly share code, notes, and snippets.

@mtmorgan
Created March 15, 2016 14:44
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mtmorgan/f752e1bb7da25a4753d7 to your computer and use it in GitHub Desktop.
Save mtmorgan/f752e1bb7da25a4753d7 to your computer and use it in GitHub Desktop.
Translate integer or ASCII character fastq phred score encodings to integer offsets useful in Rsamtools
.ascii_offset <- function()
setNames(33:126 - 33L, strsplit(rawToChar(as.raw(33:126)), "")[[1]])
.phred2ascii_int <-
function(x, scheme)
{
## See https://en.wikipedia.org/wiki/FASTQ_format#Encoding
ascii <- .ascii_offset()
switch(scheme, "Illumina 1.8+" = {
## L - Illumina 1.8+ Phred+33, raw reads typically (0, 41)
stopifnot(all(x >= 0), all(x <= 41))
ascii[x + 1L]
}, "Sanger" = {
## S - Sanger Phred+33, raw reads typically (0, 40)
stopifnot(all(x >= 0), all(x <= 40))
ascii[x + 1L]
}, "Solexa" = {
## X - Solexa Solexa+64, raw reads typically (-5, 40)
stopifnot(all(x >= -5), all(x <= 40))
ascii[x + 32L]
}, "Illumina 1.3+" = {
## I - Illumina 1.3+ Phred+64, raw reads typically (0, 40)
stopifnot(all(x >= 0), all(x <= 40))
ascii[x + 32L]
}, "Illumina 1.5+" = {
## J - Illumina 1.5+ Phred+64, raw reads typically (3, 40)
## with 0=unused, 1=unused, 2=Read Segment Quality Control Indicator (bold)
## (Note: See discussion above).
stopifnot(all(x >= 3), all(x <= 40))
ascii[x + 32L]
}, default = stop("unknown scheme '", scheme, "'"))
}
.phred2ascii_char <-
function(x)
{
ascii <- .ascii_offset()
stopifnot(all(x %in% names(ascii)))
ascii[x]
}
phred2ASCIIOffset <-
function(phred=integer(),scheme= c("Illumina 1.8+", "Sanger", "Solexa",
"Illumina 1.3+", "Illumina 1.5+"))
{
if (is.numeric(phred)) {
stopifnot(missing(scheme) || (length(scheme) == 1L), !anyNA(phred))
scheme <- match.arg(scheme)
phred <- as.integer(phred)
.phred2ascii_int(phred, scheme)
} else if (is.character(phred)) {
if (!missing(scheme))
message("'scheme' ignored; does not influence ASCII offset")
if (length(phred) == 1L && nchar(phred) > 1L)
phred <- strsplit(phred, "")[[1]]
stopifnot(all(nchar(phred) == 1L))
.phred2ascii_char(phred)
} else
stop("'phred' must be numeric (coerced to integer) or character")
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment