joelnitta/parse_ncbi_tax_record.R

## parse_ncbi_tax_record.R
library(tidyverse)
library(assertr)

# Define custom parsing function

#' Parse a single record from the NCBI taxonomy database
#'
#' The NCBI taxonomy database contains names in the `names.dmp` file.
#' A single record (corresponding to one `taxid`) looks like this:
#'
#' # A tibble: 4 × 3
#'   taxid  name                                               class
#'   <chr>  <chr>                                              <chr>
#' 1 857989 Alansmia glandulifera (A.Rojas) Moguel & M.Kessler authority
#' 2 857989 Alansmia glandulifera                              scientific name
#' 3 857989 Terpsichore glandulifera A.Rojas                   authority
#' 4 857989 Terpsichore glandulifera                           synonym
#'
#' @param record Tibble (dataframe); a single record of NCBI taxonomy data
#'
#' @return Tibble; parsed data with columns "species", "accepted", and
#'   "scientific_name"
#' @examples
#' record <- tribble(
#'   ~taxid, ~name, ~class,
#'   "857989", "Alansmia glandulifera (A.Rojas) Moguel & M.Kessler", "authority", #nolint
#'   "857989", "Alansmia glandulifera", "scientific name",
#'   "857989", "Terpsichore glandulifera A.Rojas", "authority",
#'   "857989", "Terpsichore glandulifera", "synonym"
#' )
#' parse_ncbi_tax_record(record)
parse_ncbi_tax_record <- function(record) {

   # Each record should have exactly 1 accepted species name
   accepted_species <- record %>%
      filter(class == "scientific name") %>%
      pull(name)
   assertthat::assert_that(
      length(accepted_species) == 1,
      msg = "Not exactly 1 accepted scientific name detected")
   # Confusingly, "synonyms" may sometimes contain the accepted name :/
   synonyms <- record %>%
      filter(class == "synonym") %>%
      pull(name)
   # Scientific names (with author) have the class "authority"
   sci_names <- record %>%
      filter(class == "authority") %>%
      pull(name)

   # The results should always have at least taxon ID and species
   species_dat <- tibble(species = accepted_species, accepted = TRUE)

   # Create empty tibbles to hold other name data
   acc_sci_names_dat <- tibble()
   syn_sci_names_dat <- tibble()

   # If other sci names are given, one of them should be the species name
   if (length(sci_names) > 0)
      acc_sci_names_dat <- tibble(scientific_name = sci_names) %>%
      mutate(species = str_extract(
         scientific_name, stringr::fixed(accepted_species))) %>%
      filter(!is.na(species)) %>%
      mutate(accepted = TRUE)

   # If "synonym" and other sci names are given, one (or more) of them are
   # the synonym
   if (length(synonyms) > 0 && length(sci_names) > 0)
      syn_sci_names_dat <- tibble(scientific_name = sci_names) %>%
      # Each sci name should only correspond to max. one synonym
      mutate(species = str_extract_first_target(scientific_name, synonyms)) %>%
      filter(!is.na(species)) %>%
      mutate(accepted = FALSE)

   # If "synonym" is present but no other sci names are given, "synonym" is
   # actually the scientific name of the species
   if (length(synonyms) > 0 && length(sci_names) == 0)
      acc_sci_names_dat <- tibble(scientific_name = synonyms) %>%
      mutate(species = str_extract(
         scientific_name, stringr::fixed(accepted_species))) %>%
      filter(!is.na(species)) %>%
      mutate(accepted = TRUE)

   # Combine scientific names of synonyms and accepted names
   combined_sci_names_dat <- bind_rows(syn_sci_names_dat, acc_sci_names_dat)

   # Join to accepted species with taxon ID
   if (nrow(combined_sci_names_dat) > 0)
      species_dat <- full_join(
         species_dat, combined_sci_names_dat, by = c("species", "accepted"))

   species_dat
}

# To use parse_ncbi_tax_record(), first need to do some pre-processing

# First download taxdump file (old format) from NCBI FTP site
# https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/
# e.g.,
# https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz

taxdump_zip_file <- "path_to_taxdump.tar.gz"

# Unzip names.dmp to a temporary directory
temp_dir <- tempdir(check = TRUE)

utils::unzip(
  taxdump_zip_file, files = "names.dmp",
  overwrite = TRUE, junkpaths = TRUE, exdir = temp_dir)

# Load raw NCBI data
ncbi_raw <-
  fs::path(temp_dir, "names.dmp") %>%
  readr::read_delim(
    delim = "\t|\t", col_names = FALSE,
    col_types = cols(.default = col_character())
  )

# Delete temporary unzipped file
 fs::file_delete(fs::path(temp_dir, "names.dmp"))

ncbi_names <-
  ncbi_raw %>%
  # Select only needed columns
  transmute(
    taxid = as.character(X1),
    name = X2,
    class = X4) %>%
  # Make sure there are no hidden fields in `class`
    verify(all(str_count(class, "\\|") == 1)) %>%
  # Drop field separators in `class`
    mutate(class = str_remove_all(class, "\\\t\\|")) %>%
  # Only keep useful names: exclude common names,
  # alternative spellings (`equivalent name`), type material,
  # temporary names with 'sp.' (`includes`)
    filter(class %in% c("authority", "scientific name", "synonym"))

# parse names
ncbi_names_parsed <-
  ncbi_names %>%
  group_by(taxid) %>%
  nest() %>%
  ungroup() %>%
  mutate(
    data = map(data, parse_ncbi_tax_record)
  ) %>%
  unnest(data)
	library(tidyverse)
	library(assertr)

	# Define custom parsing function

	#' Parse a single record from the NCBI taxonomy database
	#'
	#' The NCBI taxonomy database contains names in the `names.dmp` file.
	#' A single record (corresponding to one `taxid`) looks like this:
	#'
	#' # A tibble: 4 × 3
	#' taxid name class
	#' <chr> <chr> <chr>
	#' 1 857989 Alansmia glandulifera (A.Rojas) Moguel & M.Kessler authority
	#' 2 857989 Alansmia glandulifera scientific name
	#' 3 857989 Terpsichore glandulifera A.Rojas authority
	#' 4 857989 Terpsichore glandulifera synonym
	#'
	#' @param record Tibble (dataframe); a single record of NCBI taxonomy data
	#'
	#' @return Tibble; parsed data with columns "species", "accepted", and
	#' "scientific_name"
	#' @examples
	#' record <- tribble(
	#' ~taxid, ~name, ~class,
	#' "857989", "Alansmia glandulifera (A.Rojas) Moguel & M.Kessler", "authority", #nolint
	#' "857989", "Alansmia glandulifera", "scientific name",
	#' "857989", "Terpsichore glandulifera A.Rojas", "authority",
	#' "857989", "Terpsichore glandulifera", "synonym"
	#' )
	#' parse_ncbi_tax_record(record)
	parse_ncbi_tax_record <- function(record) {

	# Each record should have exactly 1 accepted species name
	accepted_species <- record %>%
	filter(class == "scientific name") %>%
	pull(name)
	assertthat::assert_that(
	length(accepted_species) == 1,
	msg = "Not exactly 1 accepted scientific name detected")
	# Confusingly, "synonyms" may sometimes contain the accepted name :/
	synonyms <- record %>%
	filter(class == "synonym") %>%
	pull(name)
	# Scientific names (with author) have the class "authority"
	sci_names <- record %>%
	filter(class == "authority") %>%
	pull(name)

	# The results should always have at least taxon ID and species
	species_dat <- tibble(species = accepted_species, accepted = TRUE)

	# Create empty tibbles to hold other name data
	acc_sci_names_dat <- tibble()
	syn_sci_names_dat <- tibble()

	# If other sci names are given, one of them should be the species name
	if (length(sci_names) > 0)
	acc_sci_names_dat <- tibble(scientific_name = sci_names) %>%
	mutate(species = str_extract(
	scientific_name, stringr::fixed(accepted_species))) %>%
	filter(!is.na(species)) %>%
	mutate(accepted = TRUE)

	# If "synonym" and other sci names are given, one (or more) of them are
	# the synonym
	if (length(synonyms) > 0 && length(sci_names) > 0)
	syn_sci_names_dat <- tibble(scientific_name = sci_names) %>%
	# Each sci name should only correspond to max. one synonym
	mutate(species = str_extract_first_target(scientific_name, synonyms)) %>%
	filter(!is.na(species)) %>%
	mutate(accepted = FALSE)

	# If "synonym" is present but no other sci names are given, "synonym" is
	# actually the scientific name of the species
	if (length(synonyms) > 0 && length(sci_names) == 0)
	acc_sci_names_dat <- tibble(scientific_name = synonyms) %>%
	mutate(species = str_extract(
	scientific_name, stringr::fixed(accepted_species))) %>%
	filter(!is.na(species)) %>%
	mutate(accepted = TRUE)

	# Combine scientific names of synonyms and accepted names
	combined_sci_names_dat <- bind_rows(syn_sci_names_dat, acc_sci_names_dat)

	# Join to accepted species with taxon ID
	if (nrow(combined_sci_names_dat) > 0)
	species_dat <- full_join(
	species_dat, combined_sci_names_dat, by = c("species", "accepted"))

	species_dat
	}

	# To use parse_ncbi_tax_record(), first need to do some pre-processing

	# First download taxdump file (old format) from NCBI FTP site
	# https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/
	# e.g.,
	# https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz

	taxdump_zip_file <- "path_to_taxdump.tar.gz"

	# Unzip names.dmp to a temporary directory
	temp_dir <- tempdir(check = TRUE)

	utils::unzip(
	taxdump_zip_file, files = "names.dmp",
	overwrite = TRUE, junkpaths = TRUE, exdir = temp_dir)

	# Load raw NCBI data
	ncbi_raw <-
	fs::path(temp_dir, "names.dmp") %>%
	readr::read_delim(
	delim = "\t\|\t", col_names = FALSE,
	col_types = cols(.default = col_character())
	)

	# Delete temporary unzipped file
	fs::file_delete(fs::path(temp_dir, "names.dmp"))

	ncbi_names <-
	ncbi_raw %>%
	# Select only needed columns
	transmute(
	taxid = as.character(X1),
	name = X2,
	class = X4) %>%
	# Make sure there are no hidden fields in `class`
	verify(all(str_count(class, "\\\|") == 1)) %>%
	# Drop field separators in `class`
	mutate(class = str_remove_all(class, "\\\t\\\|")) %>%
	# Only keep useful names: exclude common names,
	# alternative spellings (`equivalent name`), type material,
	# temporary names with 'sp.' (`includes`)
	filter(class %in% c("authority", "scientific name", "synonym"))

	# parse names
	ncbi_names_parsed <-
	ncbi_names %>%
	group_by(taxid) %>%
	nest() %>%
	ungroup() %>%
	mutate(
	data = map(data, parse_ncbi_tax_record)
	) %>%
	unnest(data)