Skip to content

Instantly share code, notes, and snippets.

@daijiang
Created December 30, 2014 04:42
Show Gist options
  • Save daijiang/640e493015a1e1b41e32 to your computer and use it in GitHub Desktop.
Save daijiang/640e493015a1e1b41e32 to your computer and use it in GitHub Desktop.
Codes in this file can be used to download species informations from the [Herbarium of UWSP](http://wisplants.uwsp.edu/index.html) and check your species names.
Working with datasets with inconsistent species names is annoying. For example, I am working on resurvey Pine Barrens in central Wisconsin that had been sampled in 1958. However, some species names have been changed overtime. Thus, it is necessary to sync them before data analysis. There is an excellent R package `[taxize](http://ropensci.org/tutorials/taxize_tutorial.html)` for this. However, it is global based thus sometimes can be overwhelming. And I just want to sync my species names with names from the Wisconsin State Herbarium.
Until today, I checked species names manually, i.e. search website one by one. This will be fine if I only have one dataset to deal with. Last week, I was trying to build a functional trait dataset from different sources, e.g. carbon and nitrogen before 2013 in one spreadsheet and from this year in another; leaf area and SLA in another; plant height and flower height in another, etc. In addition, it is no clear that whether these spreadsheet have consistent species names! Plus, it is almost unavoidable that you will find typos in these species names. What should I do? As I said, `taxize` is overwhelming... So I decide to write some code to search each species names to:
1. check for spelling, and correct it;
2. check for synonyms and use the latest name.
```r
# pkgs needed
library(stringr)
library(reshape2)
library(plyr)
library(dplyr)
# give a web url for one species, this function will get info and synonyms for this species.
get_sp_detail = function(sp.url){
spurl = readLines(sp.url)
# sp name and code
spname = str_replace(pattern = ".*<B>([&()-. A-Za-z]*)</B>.*", replacement = "\\1",
string = spurl[grep(pattern = "Familygenera.asp\\?Family", x = spurl)[1]+1])
# synonyms
if(any(str_detect(string = spurl, pattern = "\"-1\""))){
syn.names = str_replace(pattern = ".*>([&()-. A-Za-z]*)</FONT><BR>$", replacement = "\\1",
string = spurl[str_detect(string = spurl, pattern = "\"-1\"")])}
else{syn.names = NA}
# status
if(any(str_detect(string = spurl, pattern = "Status:"))){
x1 = str_replace(pattern = ".*ARIAL, HELVETICA\">(.*)</FONT>.*$", replacement = "\\1",
string = spurl[str_detect(string = spurl, pattern = "Status:")])} else{
x1 = NA
}
# plant descrip
if(any(str_detect(string = spurl, pattern = "Plant:"))){
x2 = str_replace(pattern = ".*ARIAL, HELVETICA\">(.*)</FONT>.*$", replacement = "\\1",
string = spurl[str_detect(string = spurl, pattern = "Plant:")])} else(
x2 = NA)
# Flower
if(any(str_detect(string = spurl, pattern = "Flower:"))){
x3 = str_replace(pattern = ".*ARIAL, HELVETICA\">(.*)</FONT>.*$", replacement = "\\1",
string = spurl[str_detect(string = spurl, pattern = "Flower:")])} else{
x3 = NA
}
# Fruit
if(any(str_detect(string = spurl, pattern = "Fruit:"))){
x4 = str_replace(pattern = ".*ARIAL, HELVETICA\">(.*)</FONT>.*$", replacement = "\\1",
string = spurl[str_detect(string = spurl, pattern = "Fruit:")])} else{
x4 = NA
}
# Leaf
if(any(str_detect(string = spurl, pattern = "Leaf:"))){
x5 = str_replace(pattern = ".*ARIAL, HELVETICA\">(.*)</FONT>.*$", replacement = "\\1",
string = spurl[str_detect(string = spurl, pattern = "Leaf:")])} else{
x5 = NA
}
# Habitat
if(any(str_detect(string = spurl, pattern = "Habitat:"))){
x6 = str_replace(pattern = ".*ARIAL, HELVETICA\">(.*)</FONT>.*$", replacement = "\\1",
string = spurl[str_detect(string = spurl, pattern = "Habitat:")])} else{
x6 = NA
}
list(sp_descrip = data.frame(sp = spname, status = x1, descrip = x2, flower = x3,
fruit = x4, leaf = x5, habitat = x6),
sp_synonyms = list(sp = spname, spcode = sp.url, synonyms = syn.names))
}
uwsp.herbarium = "http://wisplants.uwsp.edu/Families.html"
# the page with a list of all families
families = readLines(uwsp.herbarium)
str(families)
families = families[str_detect(string = families, pattern = "(Familygenera.asp\\?Family=[A-Za-z]*)")]
families = str_extract(string = families, pattern = "Familygenera.asp\\?Family=[A-Za-z]+")
head(families); tail(families)
fam.url = paste("http://wisplants.uwsp.edu/scripts/", families, sep = "")
head(fam.url)
length(fam.url)
uwsp.herb.details = llply(fam.url, .progress = "text", function(f.url){
# get genus url for each family
genus.page = readLines(f.url)
genus.name = na.omit(str_extract(string = genus.page, pattern = "SearchResults.asp\\?Genus=[0-9A-Za-z]+"))
genus.url = paste("http://wisplants.uwsp.edu/scripts/", genus.name, sep = "")
# get species url for each genus
llply(genus.url, function(g.url){
sp.page = readLines(g.url)
sp.name = na.omit(str_extract(string = sp.page, pattern = "detail.asp\\?SpCode=[0-9A-Za-z]+"))
sp.url = paste("http://wisplants.uwsp.edu/scripts/", sp.name, sep = "")
# get details info for each species
llply(sp.url, function(s.url){
get_sp_detail(s.url)
})
})
})
# sp details in data frame
sp.details.uwsp = ldply(uwsp.herb.details, function(x){
ldply(x, function(xx) ldply(xx, function(xxx) xxx$sp_descrip))
})
sp.synon.uwsp = ldply(uwsp.herb.details, function(x){
ldply(x, function(xx) ldply(xx, function(xxx) {
yy = xxx$sp_synonyms
yyy = data.frame(syn = yy$synonyms)
yyy$sp = yy$sp
yyy$spcode = yy$spcode
select(yyy, sp, spcode, syn)
}))
})
# clean results, since I do not know all formats of the info on webpage
sp.synon.uwsp$spcode = str_replace(string = sp.synon.uwsp$spcode, pattern = ".*=(.*)$", "\\1")
sp.synon.uwsp$sp = str_replace(sp.synon.uwsp$sp, pattern = ".*<B>(.*)</B>.*$", "\\1")
unique(sp.synon.uwsp$sp)
sp.synon.uwsp$syn[str_detect(sp.synon.uwsp$syn, pattern = "\"-1\"")]=NA
sp.synon.uwsp$syn[str_detect(sp.synon.uwsp$syn, pattern = "^[)]$")]=NA
sp.synon.uwsp$syn = str_replace(string = sp.synon.uwsp$syn,
pattern = "^X (.*)$",
replacement = "\\1")
sp.synon.uwsp.simple = sp.synon.uwsp
sp.synon.uwsp.simple$sp = str_replace(string = sp.synon.uwsp.simple$sp,
pattern = "^([A-Za-z]* X? ?[-a-z]*) .*$",
replacement = "\\1")
sp.synon.uwsp.simple$syn = str_replace(string = sp.synon.uwsp.simple$syn,
pattern = "^([A-Za-z]* X? ?[-a-z]*) .*$",
replacement = "\\1")
sp.synon.uwsp.simple$syn = str_replace(string = sp.synon.uwsp.simple$syn,
pattern = "^X (.*)$",
replacement = "\\1")
sp.synon.uwsp.simple = unique(sp.synon.uwsp.simple)
sp.synon.uwsp.simple = arrange(sp.synon.uwsp.simple, sp)
# put sp name in syn, so easy to check sp names
sp.synon.uwsp.simple = ddply(sp.synon.uwsp.simple, .(sp), function(x){
if(unique(x$sp) %in% x$syn) {x} else{
y = data.frame(sp = unique(x$sp), spcode = NA, syn = unique(x$sp))
rbind(x, y)
}
})
sp.details.uwsp$sp = str_replace(sp.details.uwsp$sp, pattern = ".*<B>(.*)</B>.*$", "\\1")
sp.details.uwsp$status = str_replace(sp.details.uwsp$status,
pattern = "^<B>.*>(.*)</FONT></B>.*$",
"\\1")
sp.details.uwsp$sp.simple = str_replace(sp.details.uwsp$sp,
pattern = "^([A-Za-z]* X? ?[-a-z]*) .*$",
"\\1")
names(sp.details.uwsp)
sp.details.uwsp = sp.details.uwsp[, c(1, 8, 2:7)]
for(i in 4:8){
sp.details.uwsp[, i] = str_replace_all(sp.details.uwsp[,i],
pattern = "<b>|</b>",
"")
}
sp.details.uwsp$fruit = str_replace_all(sp.details.uwsp$fruit,
"<IMG.*>",
"")
# save results
write.table(x = sp.details.uwsp, file = "raw_data/trait/sp_detail_uwsp.csv",
row.names=F, sep = "\t")
write.table(x = sp.synon.uwsp.simple, file = "raw_data/trait/sp_syn_uwsp.csv",
row.names=F, sep = "\t")
saveRDS(sp.details.uwsp, file = "raw_data/trait/sp_detail_uwsp.rds")
saveRDS(sp.synon.uwsp.simple, file = "raw_data/trait/sp_syn_uwsp.rds")
# sp name check ----
check_names = function(sp.to.check, sp.dataset = sp.synon.uwsp.simple){
sp.dataset.list = unique(na.omit(sp.dataset$syn))
#most close spelling in the dataset
sp.spell.checked = sapply(sp.to.check, function(x){
# igonre xxx.spp
if(any(str_detect(pattern = ".*sp[0-9]?$|.*spp[0-9]?$|.*sp\\.$", string = x))){
x
} else{
sp.dataset.list[which.min(adist(x, sp.dataset.list))]}
})
#sp names used as main in webpages of UWSP herbarium
sp.syc = sapply(sp.spell.checked, function(x){
if(any(str_detect(pattern = ".*sp[0-9]?$|.*spp[0-9]?$|.*sp\\.$", string = x))){
x
} else{
y = unique(na.omit(sp.dataset$sp[sp.dataset$syn == x]))
if(length(y > 1)){
y = y[which.min(adist(x, y))]
}
y}
})
data.frame(sp.origin = sp.to.check, sp.spell = sp.spell.checked,
sp.final = sp.syc,
spell.correct = sp.to.check == sp.spell.checked,
syc.correct = sp.spell.checked == sp.syc)
}
```
```r
test = check_names(mass_sla_mean$sp)
head(filter(test, spell.correct == FALSE | syc.correct == FALSE))
```
```
## sp.origin sp.spell sp.final spell.correct syc.correct
## 1 Andropogon gerardi Andropogon gerardii Andropogon gerardii FALSE TRUE
## 2 Anemone canadenses Anemone canadensis Anemone canadensis FALSE TRUE
## 3 Aster azureus Aster azureus Aster oolentangiensis TRUE FALSE
## 4 Brickellia eupatorioides Brickellia eupatorioides Kuhnia eupatorioides TRUE FALSE
## 5 Cardamine diphilla Cardamine diphylla Cardamine diphylla FALSE TRUE
## 6 Carex eburna Carex eburnea Carex eburnea FALSE TRUE
```
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment