Last active
January 23, 2017 08:34
-
-
Save ibartomeus/b61be59e317d0a1e213814b2f1a1c776 to your computer and use it in GitHub Desktop.
Cleaning species taxonomy using taxize. I want to correct synonyms and typos, and drop incomplete cases.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# I have >1000 bee names to check, so I want to automate taxize for:
# fixing misspellings when possible
# updating synonyms to accepted names
# keeping ONLY accepted species (fully resolved at species level)
# This uses taxize > 0.7.6.9157. If you are using an older version (e.g. what is now on CRAN), see the history of this file.
library(taxize) | |
library(dplyr) | |
# Example input, one name per case the cleaner has to handle:
# a good name, a synonym, a typo, a non-existent name, and a genus-only name.
species <- c(
  "Osmia rufa", "Osmia bicornis", "Osmia ruffa",
  "Osmia wikifluqie", "Osmia sp."
)
#' Clean a vector of species names with taxize
#'
#' Runs three lookups in sequence: (1) `gnr_resolve()` to correct
#' misspellings, (2) `synonyms(db = "itis")` to replace synonyms with their
#' accepted name, and (3) `tax_name(get = "species")` to keep only names that
#' resolve at species level. Each step is run on the unique, non-`NA` names
#' left by the previous step and merged back onto the input.
#'
#' @param species Character vector of names to clean. Duplicates are allowed
#'   and each occurrence produces a row in the result.
#' @param verbose Passed to `tax_name()` to control its printed output.
#' @return A data.frame with columns `species` (the input), `matched_name2`
#'   (spelling-corrected name), `synonym_names` (accepted name after synonym
#'   replacement) and `final_names` (`NA` unless resolved at species level).
#'   Rows come from `merge()` on `species`, so they are not in input order.
#'   Columns are character (built with `stringsAsFactors = FALSE`).
clean_species <- function(species, verbose = FALSE) {
  out_cols <- c("species", "matched_name2", "synonym_names", "final_names")

  # Nothing to look up: return an empty table of the usual shape instead of
  # sending empty queries to the web services.
  if (length(species) == 0L) {
    empty <- data.frame(
      species = character(0), matched_name2 = character(0),
      synonym_names = character(0), final_names = character(0),
      stringsAsFactors = FALSE
    )
    return(empty)
  }

  # Misspellings ----
  species2 <- unique(species)
  resolved <- as.data.frame(
    gnr_resolve(species2, best_match_only = TRUE, canonical = TRUE),
    stringsAsFactors = FALSE
  )
  resolve_cols <- c("user_supplied_name", "matched_name2")
  if (!all(resolve_cols %in% names(resolved))) {
    # No usable matches came back; treat every name as unresolved.
    resolved <- data.frame(
      user_supplied_name = character(0), matched_name2 = character(0),
      stringsAsFactors = FALSE
    )
  }
  dat <- merge(
    data.frame(species2 = species2, stringsAsFactors = FALSE),
    resolved[, resolve_cols, drop = FALSE],
    by.x = "species2", by.y = "user_supplied_name", all.x = TRUE
  )

  # Synonyms ----
  # Save time by querying only unique, non-NA corrected names.
  species3 <- unique(as.character(dat$matched_name2))
  species3 <- species3[!is.na(species3)]
  synonym_names <- species3
  if (length(species3) > 0L) {
    # Assumes synonyms() returns one element per queried name, in query
    # order -- TODO confirm for the installed taxize version.
    syn <- synonyms(species3, db = "itis")
    # An element is treated as "this name is a synonym" when it is a table
    # carrying an `acc_name` column; take that element's own accepted name.
    # (Previously every synonym received the first synonym's accepted name.)
    accepted <- vapply(
      syn,
      function(x) {
        if (is.data.frame(x) && nrow(x) > 0L && "acc_name" %in% names(x)) {
          as.character(x$acc_name[1])
        } else {
          NA_character_
        }
      },
      character(1),
      USE.NAMES = FALSE
    )
    is_synonym <- which(!is.na(accepted))
    synonym_names[is_synonym] <- accepted[is_synonym]
  }
  key <- data.frame(
    species3 = species3, synonym_names = synonym_names,
    stringsAsFactors = FALSE
  )
  dat <- merge(
    dat, key,
    by.x = "matched_name2", by.y = "species3", all.x = TRUE
  )

  # Drop names not resolved at species level ----
  species4 <- unique(dat$synonym_names)
  species4 <- species4[!is.na(species4)]
  final_names <- species4
  if (length(species4) > 0L) {
    out2 <- tax_name(
      species4,
      get = "species", db = "both", pref = "itis", verbose = verbose
    )
    final_names[!species4 %in% unique(out2$species)] <- NA
  }
  key2 <- data.frame(
    species4 = species4, final_names = final_names,
    stringsAsFactors = FALSE
  )
  dat <- merge(
    dat, key2,
    by.x = "synonym_names", by.y = "species4", all.x = TRUE
  )

  # Output ----
  # Re-expand to one row per input name and select columns by name rather
  # than by position, so the result does not depend on merge column order.
  dat <- merge(
    data.frame(species = species, stringsAsFactors = FALSE), dat,
    by.x = "species", by.y = "species2", all.x = TRUE
  )
  dat[, out_cols]
}
# Run the cleaner on the example names (queries the taxize web services).
clean_species(species)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment