Skip to content

Instantly share code, notes, and snippets.

@joelnitta
Last active October 25, 2022 11:22
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save joelnitta/fcd248ce3ba87aa264eac5fcdac9287b to your computer and use it in GitHub Desktop.
Save joelnitta/fcd248ce3ba87aa264eac5fcdac9287b to your computer and use it in GitHub Desktop.
R code for extracting NCBI taxonomy data
library(tidyverse)
library(assertr)
# Define custom parsing function
#' Parse a single record from the NCBI taxonomy database
#'
#' The NCBI taxonomy database contains names in the `names.dmp` file.
#' A single record (corresponding to one `taxid`) looks like this:
#'
#' # A tibble: 4 × 3
#' taxid name class
#' <chr> <chr> <chr>
#' 1 857989 Alansmia glandulifera (A.Rojas) Moguel & M.Kessler authority
#' 2 857989 Alansmia glandulifera scientific name
#' 3 857989 Terpsichore glandulifera A.Rojas authority
#' 4 857989 Terpsichore glandulifera synonym
#'
#' @param record Tibble (dataframe); a single record of NCBI taxonomy data
#'
#' @return Tibble; parsed data with columns "species", "accepted", and
#' "scientific_name"
#' @examples
#' record <- tribble(
#' ~taxid, ~name, ~class,
#' "857989", "Alansmia glandulifera (A.Rojas) Moguel & M.Kessler", "authority", #nolint
#' "857989", "Alansmia glandulifera", "scientific name",
#' "857989", "Terpsichore glandulifera A.Rojas", "authority",
#' "857989", "Terpsichore glandulifera", "synonym"
#' )
#' parse_ncbi_tax_record(record)
parse_ncbi_tax_record <- function(record) {
# Each record should have exactly 1 accepted species name
accepted_species <- record %>%
filter(class == "scientific name") %>%
pull(name)
assertthat::assert_that(
length(accepted_species) == 1,
msg = "Not exactly 1 accepted scientific name detected")
# Confusingly, "synonyms" may sometimes contain the accepted name :/
synonyms <- record %>%
filter(class == "synonym") %>%
pull(name)
# Scientific names (with author) have the class "authority"
sci_names <- record %>%
filter(class == "authority") %>%
pull(name)
# The results should always have at least taxon ID and species
species_dat <- tibble(species = accepted_species, accepted = TRUE)
# Create empty tibbles to hold other name data
acc_sci_names_dat <- tibble()
syn_sci_names_dat <- tibble()
# If other sci names are given, one of them should be the species name
if (length(sci_names) > 0)
acc_sci_names_dat <- tibble(scientific_name = sci_names) %>%
mutate(species = str_extract(
scientific_name, stringr::fixed(accepted_species))) %>%
filter(!is.na(species)) %>%
mutate(accepted = TRUE)
# If "synonym" and other sci names are given, one (or more) of them are
# the synonym
if (length(synonyms) > 0 && length(sci_names) > 0)
syn_sci_names_dat <- tibble(scientific_name = sci_names) %>%
# Each sci name should only correspond to max. one synonym
mutate(species = str_extract_first_target(scientific_name, synonyms)) %>%
filter(!is.na(species)) %>%
mutate(accepted = FALSE)
# If "synonym" is present but no other sci names are given, "synonym" is
# actually the scientific name of the species
if (length(synonyms) > 0 && length(sci_names) == 0)
acc_sci_names_dat <- tibble(scientific_name = synonyms) %>%
mutate(species = str_extract(
scientific_name, stringr::fixed(accepted_species))) %>%
filter(!is.na(species)) %>%
mutate(accepted = TRUE)
# Combine scientific names of synonyms and accepted names
combined_sci_names_dat <- bind_rows(syn_sci_names_dat, acc_sci_names_dat)
# Join to accepted species with taxon ID
if (nrow(combined_sci_names_dat) > 0)
species_dat <- full_join(
species_dat, combined_sci_names_dat, by = c("species", "accepted"))
species_dat
}
# To use parse_ncbi_tax_record(), first need to do some pre-processing
# First download taxdump file (old format) from NCBI FTP site
# https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/
# e.g.,
# https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz
taxdump_zip_file <- "path_to_taxdump.tar.gz"
# Unzip names.dmp to a temporary directory
temp_dir <- tempdir(check = TRUE)
utils::unzip(
taxdump_zip_file, files = "names.dmp",
overwrite = TRUE, junkpaths = TRUE, exdir = temp_dir)
# Load raw NCBI data
ncbi_raw <-
fs::path(temp_dir, "names.dmp") %>%
readr::read_delim(
delim = "\t|\t", col_names = FALSE,
col_types = cols(.default = col_character())
)
# Delete temporary unzipped file
fs::file_delete(fs::path(temp_dir, "names.dmp"))
ncbi_names <-
ncbi_raw %>%
# Select only needed columns
transmute(
taxid = as.character(X1),
name = X2,
class = X4) %>%
# Make sure there are no hidden fields in `class`
verify(all(str_count(class, "\\|") == 1)) %>%
# Drop field separators in `class`
mutate(class = str_remove_all(class, "\\\t\\|")) %>%
# Only keep useful names: exclude common names,
# alternative spellings (`equivalent name`), type material,
# temporary names with 'sp.' (`includes`)
filter(class %in% c("authority", "scientific name", "synonym"))
# parse names
ncbi_names_parsed <-
ncbi_names %>%
group_by(taxid) %>%
nest() %>%
ungroup() %>%
mutate(
data = map(data, parse_ncbi_tax_record)
) %>%
unnest(data)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment