-
-
Save leonardonormando/241fc527addfd10a61c231bab3088268 to your computer and use it in GitHub Desktop.
Parsing .biom files with SILVA formatted annotations using phyloseq
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#' @examples | |
#' taxvec1 = c("Root", "k__Bacteria", "p__Firmicutes", "c__Bacilli", "o__Bacillales", "f__Staphylococcaceae") | |
#' parse_taxonomy_default(taxvec1) | |
parse_taxonomy_default = function(char.vec){ | |
# Remove any leading empty space | |
char.vec = gsub("^[[:space:]]{1,}", "", char.vec) | |
# Remove any trailing space | |
char.vec = gsub("[[:space:]]{1,}$", "", char.vec) | |
if( length(char.vec) > 0 ){ | |
# Add dummy element (rank) name | |
names(char.vec) = paste("Rank", 1:length(char.vec), sep="") | |
} else { | |
warning("Empty taxonomy vector encountered.") | |
} | |
return(char.vec) | |
} | |
#' @examples | |
#' taxvec3 = c("D_0__Bacteria", "D_1__Firmicutes", "D_2__Bacilli", "D_3__Staphylococcaceae") | |
#' parse_taxonomy_silva(taxvec3) | |
parse_taxonomy_silva_128 <- function(char.vec){ | |
# Use default to assign names to elements in case problem with greengenes prefix | |
char.vec = parse_taxonomy_default(char.vec) | |
# Check for unassigned taxa | |
if (char.vec["Rank1"] == "Unassigned") { | |
char.vec <- c(Rank1="D_0__Unassigned", Rank2="D_1__Unassigned", Rank3="D_2__Unassigned", Rank4="D_3__Unassigned", | |
Rank5="D_4__Unassigned", Rank6="D_5__Unassigned", Rank7="D_6__Unassigned") | |
} | |
# Define the meaning of each prefix according to SILVA taxonomy | |
Tranks = c(D_0="Domain", D_1="Phylum", D_2="Class", D_3="Order", D_4="Family", D_5="Genus", D_6="Species") | |
# Check for prefix using regexp, warn if there were none. trim indices, ti | |
ti = grep("[[:alpha:]]\\_[[:digit:]]{1}\\_\\_", char.vec) | |
if( length(ti) == 0L ){ | |
warning( | |
"No silva prefixes were found. \n", | |
"Consider using parse_taxonomy_delfault() instead if true for all OTUs. \n", | |
"Dummy ranks may be included among taxonomic ranks now." | |
) | |
# Will want to return without further modifying char.vec | |
taxvec = char.vec | |
# Replace names of taxvec according to prefix, if any present... | |
} else { | |
# Format character vectors for Ambiguous taxa | |
if( length(ti) < 7 ){ | |
for (key in names(char.vec)) { | |
if ( char.vec[key] == "Ambiguous_taxa" ) { | |
tax_no <- (as.numeric(substr(key, 5, 5)) - 1) | |
char.vec[key] = sprintf("D_%s__Ambiguous_taxa", tax_no) | |
} | |
} | |
# Reset the trimmed indicies if Ambiguous taxa | |
ti = grep("[[:alpha:]]\\_[[:digit:]]{1}\\_\\_", char.vec) | |
} | |
# Remove prefix using sub-"" regexp, call result taxvec | |
taxvec = gsub("[[:alpha:]]\\_[[:digit:]]{1}\\_\\_", "", char.vec) | |
# Define the ranks that will be replaced | |
repranks = Tranks[substr(char.vec[ti], 1, 3)] | |
# Replace, being sure to avoid prefixes notK present in Tranks | |
names(taxvec)[ti[!is.na(repranks)]] = repranks[!is.na(repranks)] | |
} | |
return(taxvec) | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This functionality has been tested and a PR has been pulled with phyloseq here: | |
# https://github.com/joey711/phyloseq/pull/854 | |
# | |
# While the function has been vetted, the maintainers are very busy and the PR has not | |
# yet been added to the main package. Below i've added some detail to explain how to parse | |
# your silva data. It's quite easy.... | |
# Save the R script in a directory and source it | |
source("parse_silva_taxonomy_128") | |
# import biom data | |
silva_biom <- system.file("extdata", "SILVA_OTU_table.biom", package="phyloseq") | |
# Create phyloseq object using silva parseing function | |
silva_phyloseq <- import_biom(BIOMfilename = silva_biom, parseFunction = parse_taxonomy_silva_128) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment