Skip to content

Instantly share code, notes, and snippets.

@ibartomeus
Last active January 23, 2017 08:34
Show Gist options
  • Save ibartomeus/b61be59e317d0a1e213814b2f1a1c776 to your computer and use it in GitHub Desktop.
Save ibartomeus/b61be59e317d0a1e213814b2f1a1c776 to your computer and use it in GitHub Desktop.
Cleaning species taxonomy using taxize. I want to correct synonyms and typos and drop incomplete cases.
# I have >1000 bee names to check, so I want to automate taxize for:
# - fixing misspellings when possible
# - updating synonyms to accepted names
# - keeping ONLY accepted species (fully resolved at species level)
# This uses taxize > 0.7.6.9157. If you are using an older version (e.g. what is now on CRAN), see the history of this file.
library(taxize)
library(dplyr)
# Example input covering the cases the cleaner has to deal with: a good name,
# a synonym, a typo, a nonexistent name and a genus-only entry.
species <- paste(
  "Osmia",
  c("rufa", "bicornis", "ruffa", "wikifluqie", "sp.")
)
# Clean a vector of species names with taxize.
#
# Three passes are made over the distinct, non-missing input names:
#   1. gnr_resolve() proposes a canonical spelling (misspellings),
#   2. synonyms(db = "itis") swaps a synonym for the accepted name it reports,
#   3. tax_name(get = "species") keeps only names returned at species rank.
#
# Args:
#   species: character vector (or factor) of names; duplicates and NA allowed.
#   verbose: forwarded to tax_name() to reduce its printed output.
#
# Returns:
#   A data.frame with one row per element of `species`, sorted by `species`,
#   with character columns:
#     species       - the name as supplied,
#     matched_name2 - spelling returned by gnr_resolve() (NA if none),
#     synonym_names - accepted name, or matched_name2 when no accepted name
#                     was reported,
#     final_names   - synonym_names when tax_name() returned it as a species,
#                     NA otherwise.
clean_species <- function(species, verbose = FALSE) {
  species <- as.character(species)
  # Query each distinct, non-missing name only once; the per-name results are
  # expanded back onto the full input with match() at the end.
  unique_names <- unique(species[!is.na(species)])

  # 1. Misspellings -----------------------------------------------------------
  matched <- rep(NA_character_, length(unique_names))
  if (length(unique_names) > 0) {
    resolved <- gnr_resolve(unique_names, best_match_only = TRUE,
                            canonical = TRUE)
    # Guard against a result lacking the expected columns (e.g. when nothing
    # at all was matched) instead of failing on column selection.
    if (all(c("user_supplied_name", "matched_name2") %in% names(resolved))) {
      hit <- match(unique_names, resolved[["user_supplied_name"]])
      matched <- as.character(resolved[["matched_name2"]])[hit]
    }
  }

  # 2. Synonyms ---------------------------------------------------------------
  resolved_names <- unique(matched[!is.na(matched)])
  accepted <- resolved_names
  if (length(resolved_names) > 0) {
    # Assumes synonyms() returns one entry per queried name, in query order
    # -- TODO confirm against the installed taxize version.
    syn <- synonyms(resolved_names, db = "itis")
    accepted_hit <- vapply(seq_along(resolved_names), function(i) {
      entry <- if (i <= length(syn)) syn[[i]] else NULL
      # Only entries carrying an `acc_name` column are treated as synonyms.
      if (!is.data.frame(entry) || !"acc_name" %in% names(entry)) {
        return(NA_character_)
      }
      candidates <- as.character(entry[["acc_name"]])
      candidates <- candidates[!is.na(candidates) & nzchar(candidates)]
      if (length(candidates) > 0) candidates[1] else NA_character_
    }, character(1))
    # Each synonym gets ITS OWN accepted name (not the first one found for
    # the whole batch).
    is_synonym <- !is.na(accepted_hit)
    accepted[is_synonym] <- accepted_hit[is_synonym]
  }

  # 3. Keep only names confirmed at species level -----------------------------
  candidate_names <- unique(accepted)
  confirmed <- character(0)
  if (length(candidate_names) > 0) {
    ranks <- tax_name(candidate_names, get = "species", db = "both",
                      pref = "itis", verbose = verbose)
    confirmed <- unique(as.character(ranks[["species"]]))
  }
  final <- accepted
  final[!accepted %in% confirmed] <- NA_character_

  # 4. Output -----------------------------------------------------------------
  by_resolved <- match(matched, resolved_names)
  key <- data.frame(
    species = unique_names,
    matched_name2 = matched,
    synonym_names = accepted[by_resolved],
    final_names = final[by_resolved],
    stringsAsFactors = FALSE
  )
  out <- key[match(species, unique_names), , drop = FALSE]
  out$species <- species
  # Sorted by the supplied name, one row per input element.
  out <- out[order(out$species), , drop = FALSE]
  rownames(out) <- NULL
  out
}
# Run the cleaner on the example names defined above.
clean_species(species = species)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment