ibartomeus/clean_species

## clean_species
# I have >1000 bee names to check, so I want to automate taxize for:
# - fixing misspellings when possible
# - updating synonyms to accepted names
# - keeping ONLY accepted species (fully resolved at species level)

# This uses taxize > 0.7.6.9157. If you are using an older version (e.g. what is now on CRAN), see the history of this file.
library(taxize)
library(dplyr)

# Example input covering these cases: an accepted name, a synonym, a typo,
# a non-existent name, and a genus-only entry.
species <- c(
  "Osmia rufa",
  "Osmia bicornis",
  "Osmia ruffa",
  "Osmia wikifluqie",
  "Osmia sp."
)

# Clean a vector of species names with taxize in three passes:
#   1. fix misspellings      (gnr_resolve, best match only, canonical form),
#   2. replace synonyms with their accepted name (synonyms, db = "itis"),
#   3. keep only the names that tax_name() returns at species level.
#
# Args:
#   species: vector of scientific names. Duplicates are allowed; each distinct
#            name is looked up only once.
#   verbose: forwarded to tax_name() as its `verbose` argument.
#
# Returns:
#   A data.frame with the columns `species` (the input), `matched_name2`
#   (resolved spelling), `synonym_names` (accepted name when the resolved name
#   is a synonym, otherwise the resolved name) and `final_names` (NA unless the
#   name was found at species level). Names that could not be resolved carry NA
#   in the last three columns. Rows come out in merge() order, not input order.
clean_species <- function(species, verbose = FALSE) {
  out_cols <- c("species", "matched_name2", "synonym_names", "final_names")
  if (length(species) == 0) {
    # Nothing to look up: return an empty table with the usual columns.
    empty <- character(0)
    return(data.frame(species = empty, matched_name2 = empty,
                      synonym_names = empty, final_names = empty,
                      stringsAsFactors = FALSE))
  }
  # 1. Misspellings. Query each distinct name once; the final merge maps the
  # results back onto the full (possibly duplicated) input vector.
  species2 <- unique(species)
  resolved <- gnr_resolve(species2, best_match_only = TRUE, canonical = TRUE)
  match_cols <- c("user_supplied_name", "matched_name2")
  if (is.data.frame(resolved) && all(match_cols %in% names(resolved))) {
    resolved <- resolved[, match_cols, drop = FALSE]
  } else {
    # Defensive: if the resolver result lacks the expected columns, treat
    # every name as unresolved instead of failing on the column subset.
    resolved <- data.frame(user_supplied_name = character(0),
                           matched_name2 = character(0),
                           stringsAsFactors = FALSE)
  }
  dat <- merge(data.frame(species2 = species2, stringsAsFactors = FALSE),
               resolved,
               by.x = "species2", by.y = "user_supplied_name", all.x = TRUE)
  # 2. Synonyms. Work on the distinct, non-missing resolved names only.
  species3 <- as.character(unique(dat$matched_name2))
  species3 <- species3[!is.na(species3)]
  synonym_names <- species3
  if (length(species3) > 0) {
    syn <- synonyms(species3, db = "itis")
    # An element with an `acc_name` column marks the queried name as a
    # synonym. Take the accepted name of *that* element, so that each synonym
    # gets its own accepted name (previously every synonym received the first
    # accepted name found).
    # NOTE(review): assumes synonyms() returns one element per queried name,
    # in query order -- confirm against the installed taxize version.
    accepted <- vapply(syn, function(x) {
      if (is.data.frame(x) && "acc_name" %in% names(x)) {
        as.character(x[["acc_name"]][1])
      } else {
        NA_character_
      }
    }, character(1), USE.NAMES = FALSE)
    is_synonym <- which(!is.na(accepted))
    synonym_names[is_synonym] <- accepted[is_synonym]
  }
  key <- data.frame(species3 = species3, synonym_names = synonym_names,
                    stringsAsFactors = FALSE)
  dat <- merge(dat, key,
               by.x = "matched_name2", by.y = "species3", all.x = TRUE)
  # 3. Drop names that are not found at species level.
  species4 <- unique(dat$synonym_names)
  species4 <- species4[!is.na(species4)]
  final_names <- species4
  if (length(species4) > 0) {
    out2 <- tax_name(species4, get = "species", db = "both", pref = "itis",
                     verbose = verbose)
    final_names[!species4 %in% unique(out2$species)] <- NA
  }
  key2 <- data.frame(species4 = species4, final_names = final_names,
                     stringsAsFactors = FALSE)
  dat <- merge(dat, key2,
               by.x = "synonym_names", by.y = "species4", all.x = TRUE)
  # Output: map the per-name results back onto the original input vector and
  # select the columns by name rather than by position.
  dat <- merge(data.frame(species = species, stringsAsFactors = FALSE), dat,
               by.x = "species", by.y = "species2", all.x = TRUE)
  dat[, out_cols]
}


# Run the cleaner on the example vector defined above.
clean_species(species)
	# I have >1000 bee names to check, so I want to automate taxize for:
	# - fixing misspellings when possible
	# - updating synonyms to accepted names
	# - keeping ONLY accepted species (fully resolved at species level)

	# This uses taxize > 0.7.6.9157. If you are using an older version (e.g. what is now on CRAN), see the history of this file.
	library(taxize)
	library(dplyr)

	# Example input covering these cases: an accepted name, a synonym, a typo,
	# a non-existent name, and a genus-only entry.
	species <- c(
	  "Osmia rufa",
	  "Osmia bicornis",
	  "Osmia ruffa",
	  "Osmia wikifluqie",
	  "Osmia sp."
	)

	# Clean a vector of species names with taxize in three passes:
	#   1. fix misspellings      (gnr_resolve, best match only, canonical form),
	#   2. replace synonyms with their accepted name (synonyms, db = "itis"),
	#   3. keep only the names that tax_name() returns at species level.
	#
	# Args:
	#   species: vector of scientific names. Duplicates are allowed; each distinct
	#            name is looked up only once.
	#   verbose: forwarded to tax_name() as its `verbose` argument.
	#
	# Returns:
	#   A data.frame with the columns `species` (the input), `matched_name2`
	#   (resolved spelling), `synonym_names` (accepted name when the resolved name
	#   is a synonym, otherwise the resolved name) and `final_names` (NA unless the
	#   name was found at species level). Names that could not be resolved carry NA
	#   in the last three columns. Rows come out in merge() order, not input order.
	clean_species <- function(species, verbose = FALSE) {
	  out_cols <- c("species", "matched_name2", "synonym_names", "final_names")
	  if (length(species) == 0) {
	    # Nothing to look up: return an empty table with the usual columns.
	    empty <- character(0)
	    return(data.frame(species = empty, matched_name2 = empty,
	                      synonym_names = empty, final_names = empty,
	                      stringsAsFactors = FALSE))
	  }
	  # 1. Misspellings. Query each distinct name once; the final merge maps the
	  # results back onto the full (possibly duplicated) input vector.
	  species2 <- unique(species)
	  resolved <- gnr_resolve(species2, best_match_only = TRUE, canonical = TRUE)
	  match_cols <- c("user_supplied_name", "matched_name2")
	  if (is.data.frame(resolved) && all(match_cols %in% names(resolved))) {
	    resolved <- resolved[, match_cols, drop = FALSE]
	  } else {
	    # Defensive: if the resolver result lacks the expected columns, treat
	    # every name as unresolved instead of failing on the column subset.
	    resolved <- data.frame(user_supplied_name = character(0),
	                           matched_name2 = character(0),
	                           stringsAsFactors = FALSE)
	  }
	  dat <- merge(data.frame(species2 = species2, stringsAsFactors = FALSE),
	               resolved,
	               by.x = "species2", by.y = "user_supplied_name", all.x = TRUE)
	  # 2. Synonyms. Work on the distinct, non-missing resolved names only.
	  species3 <- as.character(unique(dat$matched_name2))
	  species3 <- species3[!is.na(species3)]
	  synonym_names <- species3
	  if (length(species3) > 0) {
	    syn <- synonyms(species3, db = "itis")
	    # An element with an `acc_name` column marks the queried name as a
	    # synonym. Take the accepted name of *that* element, so that each synonym
	    # gets its own accepted name (previously every synonym received the first
	    # accepted name found).
	    # NOTE(review): assumes synonyms() returns one element per queried name,
	    # in query order -- confirm against the installed taxize version.
	    accepted <- vapply(syn, function(x) {
	      if (is.data.frame(x) && "acc_name" %in% names(x)) {
	        as.character(x[["acc_name"]][1])
	      } else {
	        NA_character_
	      }
	    }, character(1), USE.NAMES = FALSE)
	    is_synonym <- which(!is.na(accepted))
	    synonym_names[is_synonym] <- accepted[is_synonym]
	  }
	  key <- data.frame(species3 = species3, synonym_names = synonym_names,
	                    stringsAsFactors = FALSE)
	  dat <- merge(dat, key,
	               by.x = "matched_name2", by.y = "species3", all.x = TRUE)
	  # 3. Drop names that are not found at species level.
	  species4 <- unique(dat$synonym_names)
	  species4 <- species4[!is.na(species4)]
	  final_names <- species4
	  if (length(species4) > 0) {
	    out2 <- tax_name(species4, get = "species", db = "both", pref = "itis",
	                     verbose = verbose)
	    final_names[!species4 %in% unique(out2$species)] <- NA
	  }
	  key2 <- data.frame(species4 = species4, final_names = final_names,
	                     stringsAsFactors = FALSE)
	  dat <- merge(dat, key2,
	               by.x = "synonym_names", by.y = "species4", all.x = TRUE)
	  # Output: map the per-name results back onto the original input vector and
	  # select the columns by name rather than by position.
	  dat <- merge(data.frame(species = species, stringsAsFactors = FALSE), dat,
	               by.x = "species", by.y = "species2", all.x = TRUE)
	  dat[, out_cols]
	}


	# Run the cleaner on the example vector defined above.
	clean_species(species)