Created
December 30, 2014 04:42
-
-
Save daijiang/640e493015a1e1b41e32 to your computer and use it in GitHub Desktop.
Codes in this file can be used to download species informations from the [Herbarium of UWSP](http://wisplants.uwsp.edu/index.html) and check your species names.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Working with datasets with inconsistent species names is annoying. For example, I am working on resurvey Pine Barrens in central Wisconsin that had been sampled in 1958. However, some species names have been changed overtime. Thus, it is necessary to sync them before data analysis. There is an excellent R package `[taxize](http://ropensci.org/tutorials/taxize_tutorial.html)` for this. However, it is global based thus sometimes can be overwhelming. And I just want to sync my species names with names from the Wisconsin State Herbarium. | |
Until today, I checked species names manually, i.e. search website one by one. This will be fine if I only have one dataset to deal with. Last week, I was trying to build a functional trait dataset from different sources, e.g. carbon and nitrogen before 2013 in one spreadsheet and from this year in another; leaf area and SLA in another; plant height and flower height in another, etc. In addition, it is no clear that whether these spreadsheet have consistent species names! Plus, it is almost unavoidable that you will find typos in these species names. What should I do? As I said, `taxize` is overwhelming... So I decide to write some code to search each species names to: | |
1. check for spelling, and correct it; | |
2. check for synonyms and use the latest name. | |
```r | |
# pkgs needed | |
library(stringr) | |
library(reshape2) | |
library(plyr) | |
library(dplyr) | |
# give a web url for one species, this function will get info and synonyms for this species. | |
get_sp_detail = function(sp.url){ | |
spurl = readLines(sp.url) | |
# sp name and code | |
spname = str_replace(pattern = ".*<B>([&()-. A-Za-z]*)</B>.*", replacement = "\\1", | |
string = spurl[grep(pattern = "Familygenera.asp\\?Family", x = spurl)[1]+1]) | |
# synonyms | |
if(any(str_detect(string = spurl, pattern = "\"-1\""))){ | |
syn.names = str_replace(pattern = ".*>([&()-. A-Za-z]*)</FONT><BR>$", replacement = "\\1", | |
string = spurl[str_detect(string = spurl, pattern = "\"-1\"")])} | |
else{syn.names = NA} | |
# status | |
if(any(str_detect(string = spurl, pattern = "Status:"))){ | |
x1 = str_replace(pattern = ".*ARIAL, HELVETICA\">(.*)</FONT>.*$", replacement = "\\1", | |
string = spurl[str_detect(string = spurl, pattern = "Status:")])} else{ | |
x1 = NA | |
} | |
# plant descrip | |
if(any(str_detect(string = spurl, pattern = "Plant:"))){ | |
x2 = str_replace(pattern = ".*ARIAL, HELVETICA\">(.*)</FONT>.*$", replacement = "\\1", | |
string = spurl[str_detect(string = spurl, pattern = "Plant:")])} else( | |
x2 = NA) | |
# Flower | |
if(any(str_detect(string = spurl, pattern = "Flower:"))){ | |
x3 = str_replace(pattern = ".*ARIAL, HELVETICA\">(.*)</FONT>.*$", replacement = "\\1", | |
string = spurl[str_detect(string = spurl, pattern = "Flower:")])} else{ | |
x3 = NA | |
} | |
# Fruit | |
if(any(str_detect(string = spurl, pattern = "Fruit:"))){ | |
x4 = str_replace(pattern = ".*ARIAL, HELVETICA\">(.*)</FONT>.*$", replacement = "\\1", | |
string = spurl[str_detect(string = spurl, pattern = "Fruit:")])} else{ | |
x4 = NA | |
} | |
# Leaf | |
if(any(str_detect(string = spurl, pattern = "Leaf:"))){ | |
x5 = str_replace(pattern = ".*ARIAL, HELVETICA\">(.*)</FONT>.*$", replacement = "\\1", | |
string = spurl[str_detect(string = spurl, pattern = "Leaf:")])} else{ | |
x5 = NA | |
} | |
# Habitat | |
if(any(str_detect(string = spurl, pattern = "Habitat:"))){ | |
x6 = str_replace(pattern = ".*ARIAL, HELVETICA\">(.*)</FONT>.*$", replacement = "\\1", | |
string = spurl[str_detect(string = spurl, pattern = "Habitat:")])} else{ | |
x6 = NA | |
} | |
list(sp_descrip = data.frame(sp = spname, status = x1, descrip = x2, flower = x3, | |
fruit = x4, leaf = x5, habitat = x6), | |
sp_synonyms = list(sp = spname, spcode = sp.url, synonyms = syn.names)) | |
} | |
uwsp.herbarium = "http://wisplants.uwsp.edu/Families.html" | |
# the page with a list of all families | |
families = readLines(uwsp.herbarium) | |
str(families) | |
families = families[str_detect(string = families, pattern = "(Familygenera.asp\\?Family=[A-Za-z]*)")] | |
families = str_extract(string = families, pattern = "Familygenera.asp\\?Family=[A-Za-z]+") | |
head(families); tail(families) | |
fam.url = paste("http://wisplants.uwsp.edu/scripts/", families, sep = "") | |
head(fam.url) | |
length(fam.url) | |
uwsp.herb.details = llply(fam.url, .progress = "text", function(f.url){ | |
# get genus url for each family | |
genus.page = readLines(f.url) | |
genus.name = na.omit(str_extract(string = genus.page, pattern = "SearchResults.asp\\?Genus=[0-9A-Za-z]+")) | |
genus.url = paste("http://wisplants.uwsp.edu/scripts/", genus.name, sep = "") | |
# get species url for each genus | |
llply(genus.url, function(g.url){ | |
sp.page = readLines(g.url) | |
sp.name = na.omit(str_extract(string = sp.page, pattern = "detail.asp\\?SpCode=[0-9A-Za-z]+")) | |
sp.url = paste("http://wisplants.uwsp.edu/scripts/", sp.name, sep = "") | |
# get details info for each species | |
llply(sp.url, function(s.url){ | |
get_sp_detail(s.url) | |
}) | |
}) | |
}) | |
# sp details in data frame | |
sp.details.uwsp = ldply(uwsp.herb.details, function(x){ | |
ldply(x, function(xx) ldply(xx, function(xxx) xxx$sp_descrip)) | |
}) | |
sp.synon.uwsp = ldply(uwsp.herb.details, function(x){ | |
ldply(x, function(xx) ldply(xx, function(xxx) { | |
yy = xxx$sp_synonyms | |
yyy = data.frame(syn = yy$synonyms) | |
yyy$sp = yy$sp | |
yyy$spcode = yy$spcode | |
select(yyy, sp, spcode, syn) | |
})) | |
}) | |
# clean results, since I do not know all formats of the info on webpage | |
sp.synon.uwsp$spcode = str_replace(string = sp.synon.uwsp$spcode, pattern = ".*=(.*)$", "\\1") | |
sp.synon.uwsp$sp = str_replace(sp.synon.uwsp$sp, pattern = ".*<B>(.*)</B>.*$", "\\1") | |
unique(sp.synon.uwsp$sp) | |
sp.synon.uwsp$syn[str_detect(sp.synon.uwsp$syn, pattern = "\"-1\"")]=NA | |
sp.synon.uwsp$syn[str_detect(sp.synon.uwsp$syn, pattern = "^[)]$")]=NA | |
sp.synon.uwsp$syn = str_replace(string = sp.synon.uwsp$syn, | |
pattern = "^X (.*)$", | |
replacement = "\\1") | |
sp.synon.uwsp.simple = sp.synon.uwsp | |
sp.synon.uwsp.simple$sp = str_replace(string = sp.synon.uwsp.simple$sp, | |
pattern = "^([A-Za-z]* X? ?[-a-z]*) .*$", | |
replacement = "\\1") | |
sp.synon.uwsp.simple$syn = str_replace(string = sp.synon.uwsp.simple$syn, | |
pattern = "^([A-Za-z]* X? ?[-a-z]*) .*$", | |
replacement = "\\1") | |
sp.synon.uwsp.simple$syn = str_replace(string = sp.synon.uwsp.simple$syn, | |
pattern = "^X (.*)$", | |
replacement = "\\1") | |
sp.synon.uwsp.simple = unique(sp.synon.uwsp.simple) | |
sp.synon.uwsp.simple = arrange(sp.synon.uwsp.simple, sp) | |
# put sp name in syn, so easy to check sp names | |
sp.synon.uwsp.simple = ddply(sp.synon.uwsp.simple, .(sp), function(x){ | |
if(unique(x$sp) %in% x$syn) {x} else{ | |
y = data.frame(sp = unique(x$sp), spcode = NA, syn = unique(x$sp)) | |
rbind(x, y) | |
} | |
}) | |
sp.details.uwsp$sp = str_replace(sp.details.uwsp$sp, pattern = ".*<B>(.*)</B>.*$", "\\1") | |
sp.details.uwsp$status = str_replace(sp.details.uwsp$status, | |
pattern = "^<B>.*>(.*)</FONT></B>.*$", | |
"\\1") | |
sp.details.uwsp$sp.simple = str_replace(sp.details.uwsp$sp, | |
pattern = "^([A-Za-z]* X? ?[-a-z]*) .*$", | |
"\\1") | |
names(sp.details.uwsp) | |
sp.details.uwsp = sp.details.uwsp[, c(1, 8, 2:7)] | |
for(i in 4:8){ | |
sp.details.uwsp[, i] = str_replace_all(sp.details.uwsp[,i], | |
pattern = "<b>|</b>", | |
"") | |
} | |
sp.details.uwsp$fruit = str_replace_all(sp.details.uwsp$fruit, | |
"<IMG.*>", | |
"") | |
# save results | |
write.table(x = sp.details.uwsp, file = "raw_data/trait/sp_detail_uwsp.csv", | |
row.names=F, sep = "\t") | |
write.table(x = sp.synon.uwsp.simple, file = "raw_data/trait/sp_syn_uwsp.csv", | |
row.names=F, sep = "\t") | |
saveRDS(sp.details.uwsp, file = "raw_data/trait/sp_detail_uwsp.rds") | |
saveRDS(sp.synon.uwsp.simple, file = "raw_data/trait/sp_syn_uwsp.rds") | |
# sp name check ---- | |
check_names = function(sp.to.check, sp.dataset = sp.synon.uwsp.simple){ | |
sp.dataset.list = unique(na.omit(sp.dataset$syn)) | |
#most close spelling in the dataset | |
sp.spell.checked = sapply(sp.to.check, function(x){ | |
# igonre xxx.spp | |
if(any(str_detect(pattern = ".*sp[0-9]?$|.*spp[0-9]?$|.*sp\\.$", string = x))){ | |
x | |
} else{ | |
sp.dataset.list[which.min(adist(x, sp.dataset.list))]} | |
}) | |
#sp names used as main in webpages of UWSP herbarium | |
sp.syc = sapply(sp.spell.checked, function(x){ | |
if(any(str_detect(pattern = ".*sp[0-9]?$|.*spp[0-9]?$|.*sp\\.$", string = x))){ | |
x | |
} else{ | |
y = unique(na.omit(sp.dataset$sp[sp.dataset$syn == x])) | |
if(length(y > 1)){ | |
y = y[which.min(adist(x, y))] | |
} | |
y} | |
}) | |
data.frame(sp.origin = sp.to.check, sp.spell = sp.spell.checked, | |
sp.final = sp.syc, | |
spell.correct = sp.to.check == sp.spell.checked, | |
syc.correct = sp.spell.checked == sp.syc) | |
} | |
``` | |
```r | |
test = check_names(mass_sla_mean$sp) | |
head(filter(test, spell.correct == FALSE | syc.correct == FALSE)) | |
``` | |
``` | |
## sp.origin sp.spell sp.final spell.correct syc.correct | |
## 1 Andropogon gerardi Andropogon gerardii Andropogon gerardii FALSE TRUE | |
## 2 Anemone canadenses Anemone canadensis Anemone canadensis FALSE TRUE | |
## 3 Aster azureus Aster azureus Aster oolentangiensis TRUE FALSE | |
## 4 Brickellia eupatorioides Brickellia eupatorioides Kuhnia eupatorioides TRUE FALSE | |
## 5 Cardamine diphilla Cardamine diphylla Cardamine diphylla FALSE TRUE | |
## 6 Carex eburna Carex eburnea Carex eburnea FALSE TRUE | |
``` | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment