Skip to content

Instantly share code, notes, and snippets.

@MarinaGolivets
Last active October 1, 2023 15:59
Show Gist options
  • Save MarinaGolivets/a126ab6111c7f815c4ed9ae24fde8730 to your computer and use it in GitHub Desktop.
Save MarinaGolivets/a126ab6111c7f815c4ed9ae24fde8730 to your computer and use it in GitHub Desktop.
An R function for standardising plant taxon names against the GBIF taxonomic backbone
# as input, provide a vector of verbatim taxon names (preferably with authorship)
# and a vector of existing local identifiers for those names
# load R packages
library(dplyr)
library(tidyr)
#' Standardise plant taxon names against the GBIF taxonomic backbone
#'
#' Every name is looked up with `rgbif::name_backbone_verbose()` (kingdom
#' "plants", strict matching). The best match is used unless its match type is
#' "NONE" or "HIGHERRANK"; for those names the alternative matches are used
#' instead, restricted to the phylum Tracheophyta. For each identifier,
#' accepted matches are kept if there are any; synonym and doubtful matches are
#' kept only when the identifier has no accepted match. Within an identifier
#' only the rows with the highest confidence are returned.
#'
#' Relies on dplyr being attached (`%>%`, `filter()`, `group_by()`, ...).
#'
#' @param taxon_name Vector of verbatim taxon names (preferably with
#'   authorship).
#' @param taxon_id Vector of local identifiers, one per element of
#'   `taxon_name`.
#' @param include_genus Logical. If `FALSE` (the default), matches whose rank
#'   is "GENUS" are dropped.
#' @return A tibble holding the retained GBIF matches, with `taxon_name` and
#'   `taxon_id` as the first two columns. Names without a usable match do not
#'   appear in the result.
match_to_gbif.fn <- function(taxon_name, taxon_id, include_genus = FALSE) {
  # names and identifiers are paired element-wise further down, so silent
  # recycling of a shorter vector would attach wrong identifiers
  if (length(taxon_name) != length(taxon_id)) {
    stop(
      "`taxon_name` and `taxon_id` must have the same length.",
      call. = FALSE
    )
  }
  if (length(taxon_name) == 0) {
    return(tibble(taxon_name = taxon_name, taxon_id = taxon_id))
  }

  # perform initial matching in parallel
  no_cores <- parallel::detectCores()
  if (is.na(no_cores)) no_cores <- 1L # detectCores() may return NA
  cl <- parallel::makeCluster(no_cores)
  all_matches <- tryCatch(
    pbapply::pblapply(
      taxon_name,
      rgbif::name_backbone_verbose,
      kingdom = "plants", strict = TRUE, cl = cl
    ),
    # release the workers even when a lookup fails
    finally = parallel::stopCluster(cl)
  )

  # attach the input name and identifier to each per-name table and stack them
  add_input_columns <- function(match_tables) {
    mapply(
      cbind, match_tables,
      taxon_name = taxon_name, taxon_id = taxon_id,
      stringsAsFactors = FALSE, SIMPLIFY = FALSE
    ) %>%
      data.table::rbindlist(fill = TRUE)
  }

  # retrieve alternative matches
  alternative_matches <- lapply(all_matches, function(x) {
    alternatives <- x$alternatives
    if (is.null(alternatives) || nrow(alternatives) == 0) {
      # placeholder row so the name still binds; removed by the NA filter below
      data.frame(usageKey = NA)
    } else {
      alternatives
    }
  }) %>%
    add_input_columns() %>%
    filter(!is.na(usageKey)) %>%
    distinct()

  # retrieve best matches
  best_matches <- lapply(all_matches, function(x) x$data) %>%
    add_input_columns() %>%
    distinct()

  matched <- best_matches %>%
    filter(!(matchType %in% c("NONE", "HIGHERRANK")))
  nonmatched <- best_matches %>%
    filter(matchType %in% c("NONE", "HIGHERRANK"))

  # alternatives are only usable when GBIF returned at least one of them;
  # otherwise the table has no `phylum`/`confidence` columns to filter on
  taxon_list <- matched
  if (all(c("phylum", "confidence") %in% names(alternative_matches))) {
    matched_alternative <- alternative_matches %>%
      filter(phylum == "Tracheophyta") %>% # use only vascular plants
      filter(confidence >= 0) %>%
      filter(taxon_id %in% nonmatched$taxon_id)
    taxon_list <- bind_rows(matched, matched_alternative)
  }

  # nothing matched at all: the columns used below may not even exist
  if (nrow(taxon_list) == 0) {
    return(
      taxon_list %>%
        as_tibble() %>%
        relocate(taxon_name, taxon_id)
    )
  }

  if (!include_genus) {
    taxon_list <- taxon_list %>%
      filter(rank != "GENUS")
  }

  # get names that were matched as accepted
  accepted <- taxon_list %>%
    group_by(taxon_id) %>%
    filter(status == "ACCEPTED") %>%
    filter(confidence == max(confidence)) %>%
    ungroup()

  # matches of identifiers that have no accepted match at all
  without_accepted <- taxon_list %>%
    group_by(taxon_id) %>%
    summarise(has_accepted = any(status == "ACCEPTED", na.rm = TRUE)) %>%
    full_join(taxon_list, by = "taxon_id") %>%
    filter(!has_accepted)

  # get names that were matched as synonyms only
  synonyms <- without_accepted %>%
    filter(status == "SYNONYM") %>%
    group_by(taxon_id) %>%
    filter(confidence == max(confidence)) %>%
    ungroup()

  # get names that were matched as doubtful only
  doubtful <- without_accepted %>%
    filter(status == "DOUBTFUL") %>%
    group_by(taxon_id) %>%
    filter(confidence == max(confidence)) %>%
    ungroup()

  # combine all names
  bind_rows(accepted, synonyms, doubtful) %>%
    group_by(taxon_id) %>%
    filter(confidence == max(confidence)) %>%
    filter(status != "NONE") %>% # exclude non-matched names
    dplyr::select(-has_accepted) %>%
    ungroup() %>%
    relocate(taxon_name, taxon_id)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment