Skip to content

Instantly share code, notes, and snippets.

@lcolladotor
Created October 22, 2014 03:37
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save lcolladotor/3e6f0e2e2bb67ee910da to your computer and use it in GitHub Desktop.
Save lcolladotor/3e6f0e2e2bb67ee910da to your computer and use it in GitHub Desktop.
#' Change naming style for a set of sequence names
#'
#' If available, use the information from GenomeInfoDb for your species of
#' interest to map the sequence names from the style currently used to another
#' valid style. For example, for Homo sapiens map '2' (NCBI style) to 'chr2'
#' (UCSC style). If the information from GenomeInfoDb is not available, the
#' original sequence names will be returned.
#'
#' @param seqnames A character vector with the sequence names.
#' @param style A single character vector specifying the naming style to use
#' for renaming the sequence names.
#' @param species A single character vector with the species of interest: it has
#' to match the valid species names supported in GenomeInfoDb. See
#' \code{names(GenomeInfoDb::genomeStyles())}. If \code{species = NULL},
#' a guess will be made using the available information in GenomeInfoDb.
#' @param currentStyle A single character vector with the currently used
#' naming style. If \code{NULL}, a guess will be made from the naming styles
#' supported by \code{species}.
#' @param ... Arguments passed to other methods and/or advanced arguments.
#'
#' @return A vector of sequence names using the specified naming \code{style}.
#'
#' @author L. Collado-Torres
#' @export
#' @aliases extended_map_seqlevels
#'
#' @details This function is inspired from \link[GenomeInfoDb]{mapSeqlevels}
#' with the difference that it will return the original sequence names if
#' the species, current naming style or target naming style are not supported
#' in GenomeInfoDb.
#'
#' @examples
#'
#'
#' ## Without guessing any information
#' extendedMapSeqlevels('2', 'UCSC', 'Homo sapiens', 'NCBI')
#'
#' ## Guessing the current naming style
#' extendedMapSeqlevels('2', 'UCSC', 'Homo sapiens')
#'
#' ## Guess species and current style
#' extendedMapSeqlevels('2', 'NCBI')
#'
#' ## Guess species while supplying the current style.
#' ## Probably an uncommon use-case
#' extendedMapSeqlevels('2', 'NCBI', currentStyle = 'TAIR10')
#'
#' ## Sequence names are unchanged when using an unsupported species
#' extendedMapSeqlevels('seq2', 'NCBI', 'toyOrganism')
#'
#' \dontrun{
#' ## Set global species and style option
#' options('chrsStyle' = 'UCSC')
#' options('species' = 'homo_sapiens')
#'
#' ## Run using global options
#' extendedMapSeqlevels('2')
#' }
extendedMapSeqlevels <- function(seqnames, style = getOption('chrsStyle',
'UCSC'), species = getOption('species', 'homo_sapiens'),
currentStyle = NULL, verbose = getOption('verbose', TRUE)) {
if(is.null(style)) {
## If style is NULL return seqnames un-changed
return(seqnames)
}
## Check inputs
if(!is.character(seqnames))
seqnames <- as.character(seqnames)
stopifnot(is.character(style) & length(style) == 1)
if(!is.null(species)) stopifnot(is.character(species) & length(species) == 1)
if(!is.null(currentStyle)) stopifnot(is.character(currentStyle) & length(currentStyle) == 1)
guessed <- is.null(species) | is.null(currentStyle)
## Guess species
if(is.null(species)) {
species <- GenomeInfoDb:::.guessSpeciesStyle(seqnames)[1]
if(is.na(species)) {
if(verbose)
message("extendedMapSeqlevels: the 'seqnames' you supplied are currently not supported in GenomeInfoDb. Consider adding your genome by following the information at http://www.bioconductor.org/packages/release/bioc/vignettes/GenomeInfoDb/inst/doc/Accept-organism-for-GenomeInfoDb.pdf")
return(seqnames)
}
}
## Format species name
species <- tolower(gsub(" ", "_", species))
## Extract valid mappings
supported <- GenomeInfoDb:::.supportedSeqnameMappings()
## Check species is supported
i <- which(tolower(names(supported)) == species)
if(length(i) != 1){
if(verbose)
message(paste("extendedMapSeqlevels: the 'species'", species, "is currently not supported in GenomeInfoDb. Check valid 'species' by running names(GenomeInfoDb::genomeStyles()). If it's not present, consider adding your genome by following the information at http://www.bioconductor.org/packages/release/bioc/vignettes/GenomeInfoDb/inst/doc/Accept-organism-for-GenomeInfoDb.pdf"))
return(seqnames)
}
## Valid mapping for the species
mapping <- supported[[i]]
## Check style is supported
j <- which(tolower(colnames(mapping)) == tolower(style))
if(length(j) != 1) {
if(verbose)
message(paste("extendedMapSeqlevels: the 'style'", style, "is currently not supported for the 'species'", species, "in GenomeInfoDb. Check valid naming styles by running GenomeInfoDb::genomeStyles(species). If it's not present, consider adding your genome by following the information at http://www.bioconductor.org/packages/release/bioc/vignettes/GenomeInfoDb/inst/doc/Accept-organism-for-GenomeInfoDb.pdf"))
return(seqnames)
}
## Guess current style for the current species
if(is.null(currentStyle)) {
if(ncol(mapping) < 2) {
stop("extendedMapSeqlevels: the mapping for your specified 'species' is incomplete: there should be at least 2 different styles.")
}
## Proceed as if using mapSeqlevels(best.only = TRUE)
possible <- sapply(mapping, function(styleNames) {
sum(tolower(seqnames) %in% tolower(styleNames)) })
if(max(possible) == 0) {
if(verbose)
message("extendedMapSeqlevels: the current naming style could not be guessed. Consider supplying 'currentStyle'.")
return(seqnames)
} else if (max(possible) < length(seqnames)) {
warning("extendedMapSeqlevels: not all the 'seqnames' match the best current naming style guess.")
}
currentStyle <- colnames(mapping)[ which.max(possible) ]
}
## Find current naming map
k <- which(tolower(colnames(mapping)) == tolower(currentStyle))
## Check currentStyle if it was supplied
if(!is.null(currentStyle)) {
if(length(k) != 1) {
if(verbose)
message(paste("extendedMapSeqlevels: the 'currentStyle'", currentStyle, "is currently not supported for the 'species'", species, "in GenomeInfoDb. Check valid naming styles by running GenomeInfoDb::genomeStyles(species). If it's not present, consider adding your genome by following the information at http://www.bioconductor.org/packages/release/bioc/vignettes/GenomeInfoDb/inst/doc/Accept-organism-for-GenomeInfoDb.pdf"))
return(seqnames)
}
}
## Is the target style the same as the current style?
if(tolower(style) == tolower(currentStyle)) {
return(seqnames)
}
## Make the map
seqnames <- mapping[match(seqnames, mapping[, k]), j]
if(verbose & guessed)
message(paste('extendedMapSeqlevels: sequence names mapped from', currentStyle, 'to', style, 'for species', species))
return(seqnames)
}
#' @export
extended_map_seqlevels <- extendedMapSeqlevels
## Tests with testthat
library('testthat')
test_that('Mapping levels', {
expect_that(extendedMapSeqlevels('seq2', 'UCSC', verbose = FALSE), equals('seq2'))
expect_that(extendedMapSeqlevels('seq2', 'UCSC', NULL, verbose = TRUE), shows_message("extendedMapSeqlevels: the 'seqnames' you supplied are currently not supported in GenomeInfoDb. Consider adding your genome by following the information at http://www.bioconductor.org/packages/release/bioc/vignettes/GenomeInfoDb/inst/doc/Accept-organism-for-GenomeInfoDb.pdf"))
expect_that(extendedMapSeqlevels('seq2', 'UCSC', 'toy', verbose = FALSE), equals('seq2'))
expect_that(extendedMapSeqlevels('seq2', 'UCSC', 'toy', verbose = TRUE), shows_message())
expect_that(extendedMapSeqlevels('2', 'toyStyle', 'Arabidopsis thaliana', verbose = FALSE), equals('2'))
expect_that(extendedMapSeqlevels('2', 'toyStyle', 'Arabidopsis thaliana', verbose = TRUE), shows_message())
expect_that(extendedMapSeqlevels('2', 'NCBI', 'Arabidopsis thaliana', 'toyStyle', verbose = FALSE), equals('2'))
expect_that(extendedMapSeqlevels('2', 'NCBI', 'Arabidopsis thaliana', 'toyStyle', verbose = TRUE), shows_message(""))
expect_that(extendedMapSeqlevels('2', 'UCSC', 'Homo sapiens', 'NCBI'), equals('chr2'))
expect_that(extendedMapSeqlevels('2', 'UCSC', 'Homo sapiens', verbose = TRUE), shows_message("extendedMapSeqlevels: sequence names mapped from NCBI to UCSC for species homo_sapiens"))
expect_that(extendedMapSeqlevels('2', 'UCSC', 'Homo sapiens', 'NCBI'), equals(extendedMapSeqlevels('2', 'UCSC', 'Homo sapiens', verbose = FALSE)))
expect_that(extendedMapSeqlevels('2', 'UCSC', 'Homo sapiens', 'NCBI'), equals(extendedMapSeqlevels('2', 'UCSC', 'homo_saPiens', 'NcBI')))
expect_that(extendedMapSeqlevels('28', NULL), equals('28'))
})
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment