leonardonormando/SILVA_walkthrough.R

## parse_silva_taxonomy_128.R
#' @examples
#'  taxvec1 = c("Root", "k__Bacteria", "p__Firmicutes", "c__Bacilli", "o__Bacillales", "f__Staphylococcaceae")
#'  parse_taxonomy_default(taxvec1)
parse_taxonomy_default = function(char.vec){
	# Remove any leading empty space
	char.vec = gsub("^[[:space:]]{1,}", "", char.vec)
	# Remove any trailing space
	char.vec = gsub("[[:space:]]{1,}$", "", char.vec)
	if( length(char.vec) > 0 ){
		# Add dummy element (rank) name
		names(char.vec) = paste("Rank", 1:length(char.vec), sep="")
	} else {
		warning("Empty taxonomy vector encountered.")
	}
	return(char.vec)
}

#' @examples
#' taxvec3 = c("D_0__Bacteria", "D_1__Firmicutes", "D_2__Bacilli", "D_3__Staphylococcaceae")
#' parse_taxonomy_silva(taxvec3)
parse_taxonomy_silva_128 <- function(char.vec){
  # Use default to assign names to elements in case problem with greengenes prefix
  char.vec = parse_taxonomy_default(char.vec)
  # Check for unassigned taxa
  if (char.vec["Rank1"] == "Unassigned") {
    char.vec <- c(Rank1="D_0__Unassigned", Rank2="D_1__Unassigned", Rank3="D_2__Unassigned", Rank4="D_3__Unassigned",
                  Rank5="D_4__Unassigned", Rank6="D_5__Unassigned", Rank7="D_6__Unassigned")
  }
  # Define the meaning of each prefix according to SILVA taxonomy
  Tranks = c(D_0="Domain", D_1="Phylum", D_2="Class", D_3="Order", D_4="Family", D_5="Genus", D_6="Species")
  # Check for prefix using regexp, warn if there were none. trim indices, ti
  ti = grep("[[:alpha:]]\\_[[:digit:]]{1}\\_\\_", char.vec)
  if( length(ti) == 0L ){
    warning(
      "No silva prefixes were found. \n",
      "Consider using parse_taxonomy_delfault() instead if true for all OTUs. \n",
      "Dummy ranks may be included among taxonomic ranks now."
    )
    # Will want to return without further modifying char.vec
    taxvec = char.vec
    # Replace names of taxvec according to prefix, if any present...
  } else {
    # Format character vectors for Ambiguous taxa
    if( length(ti) < 7 ){
        for (key in names(char.vec)) {
          if ( char.vec[key] == "Ambiguous_taxa" ) {
            tax_no <- (as.numeric(substr(key, 5, 5)) - 1)
            char.vec[key] = sprintf("D_%s__Ambiguous_taxa", tax_no)
          }
        }
      # Reset the trimmed indicies if Ambiguous taxa
      ti = grep("[[:alpha:]]\\_[[:digit:]]{1}\\_\\_", char.vec)
    }
    # Remove prefix using sub-"" regexp, call result taxvec
    taxvec = gsub("[[:alpha:]]\\_[[:digit:]]{1}\\_\\_", "", char.vec)
    # Define the ranks that will be replaced
    repranks = Tranks[substr(char.vec[ti], 1, 3)]
    # Replace, being sure to avoid prefixes notK present in Tranks
    names(taxvec)[ti[!is.na(repranks)]] = repranks[!is.na(repranks)]
  }
  return(taxvec)
}

## SILVA_walkthrough.R
# This functionality has been tested and a PR has been pulled with phyloseq here:
# https://github.com/joey711/phyloseq/pull/854
#
# While the function has been vetted, the maintainers are very busy and the PR has not
# yet been added to the main package.  Below i've added some detail to explain how to parse
# your silva data.  It's quite easy....

# Save the R script in a directory and source it

source("parse_silva_taxonomy_128")

# import biom data
silva_biom <- system.file("extdata", "SILVA_OTU_table.biom", package="phyloseq")
# Create phyloseq object using silva parseing function
silva_phyloseq <- import_biom(BIOMfilename = silva_biom, parseFunction = parse_taxonomy_silva_128)
	#' @examples
	#' taxvec1 = c("Root", "k__Bacteria", "p__Firmicutes", "c__Bacilli", "o__Bacillales", "f__Staphylococcaceae")
	#' parse_taxonomy_default(taxvec1)
	parse_taxonomy_default = function(char.vec){
	# Remove any leading empty space
	char.vec = gsub("^[[:space:]]{1,}", "", char.vec)
	# Remove any trailing space
	char.vec = gsub("[[:space:]]{1,}$", "", char.vec)
	if( length(char.vec) > 0 ){
	# Add dummy element (rank) name
	names(char.vec) = paste("Rank", 1:length(char.vec), sep="")
	} else {
	warning("Empty taxonomy vector encountered.")
	}
	return(char.vec)
	}

	#' @examples
	#' taxvec3 = c("D_0__Bacteria", "D_1__Firmicutes", "D_2__Bacilli", "D_3__Staphylococcaceae")
	#' parse_taxonomy_silva(taxvec3)
	parse_taxonomy_silva_128 <- function(char.vec){
	# Use default to assign names to elements in case problem with greengenes prefix
	char.vec = parse_taxonomy_default(char.vec)
	# Check for unassigned taxa
	if (char.vec["Rank1"] == "Unassigned") {
	char.vec <- c(Rank1="D_0__Unassigned", Rank2="D_1__Unassigned", Rank3="D_2__Unassigned", Rank4="D_3__Unassigned",
	Rank5="D_4__Unassigned", Rank6="D_5__Unassigned", Rank7="D_6__Unassigned")
	}
	# Define the meaning of each prefix according to SILVA taxonomy
	Tranks = c(D_0="Domain", D_1="Phylum", D_2="Class", D_3="Order", D_4="Family", D_5="Genus", D_6="Species")
	# Check for prefix using regexp, warn if there were none. trim indices, ti
	ti = grep("[[:alpha:]]\\_[[:digit:]]{1}\\_\\_", char.vec)
	if( length(ti) == 0L ){
	warning(
	"No silva prefixes were found. \n",
	"Consider using parse_taxonomy_delfault() instead if true for all OTUs. \n",
	"Dummy ranks may be included among taxonomic ranks now."
	)
	# Will want to return without further modifying char.vec
	taxvec = char.vec
	# Replace names of taxvec according to prefix, if any present...
	} else {
	# Format character vectors for Ambiguous taxa
	if( length(ti) < 7 ){
	for (key in names(char.vec)) {
	if ( char.vec[key] == "Ambiguous_taxa" ) {
	tax_no <- (as.numeric(substr(key, 5, 5)) - 1)
	char.vec[key] = sprintf("D_%s__Ambiguous_taxa", tax_no)
	}
	}
	# Reset the trimmed indicies if Ambiguous taxa
	ti = grep("[[:alpha:]]\\_[[:digit:]]{1}\\_\\_", char.vec)
	}
	# Remove prefix using sub-"" regexp, call result taxvec
	taxvec = gsub("[[:alpha:]]\\_[[:digit:]]{1}\\_\\_", "", char.vec)
	# Define the ranks that will be replaced
	repranks = Tranks[substr(char.vec[ti], 1, 3)]
	# Replace, being sure to avoid prefixes notK present in Tranks
	names(taxvec)[ti[!is.na(repranks)]] = repranks[!is.na(repranks)]
	}
	return(taxvec)
	}
	# This functionality has been tested and a PR has been pulled with phyloseq here:
	# https://github.com/joey711/phyloseq/pull/854
	#
	# While the function has been vetted, the maintainers are very busy and the PR has not
	# yet been added to the main package. Below i've added some detail to explain how to parse
	# your silva data. It's quite easy....

	# Save the R script in a directory and source it

	source("parse_silva_taxonomy_128")

	# import biom data
	silva_biom <- system.file("extdata", "SILVA_OTU_table.biom", package="phyloseq")
	# Create phyloseq object using silva parseing function
	silva_phyloseq <- import_biom(BIOMfilename = silva_biom, parseFunction = parse_taxonomy_silva_128)