daijiang/sp_name_check

## sp_name_check
Working with datasets with inconsistent species names is annoying. For example, I am working on resurvey Pine Barrens in central Wisconsin that had been sampled in 1958. However, some species names have been changed overtime. Thus, it is necessary to sync them before data analysis. There is an excellent R package `[taxize](http://ropensci.org/tutorials/taxize_tutorial.html)` for this. However, it is global based thus sometimes can be overwhelming. And I just want to sync my species names with names from the Wisconsin State Herbarium.

Until today, I checked species names manually, i.e. search website one by one. This will be fine if I only have one dataset to deal with. Last week, I was trying to build a functional trait dataset from different sources, e.g. carbon and nitrogen before 2013 in one spreadsheet and from this year in another; leaf area and SLA in another; plant height and flower height in another, etc. In addition, it is no clear that whether these spreadsheet have consistent species names! Plus, it is almost unavoidable that you will find typos in these species names. What should I do? As I said, `taxize` is overwhelming... So I decide to write some code to search each species names to:

1. check for spelling, and correct it;
2. check for synonyms and use the latest name.


```r
# pkgs needed
library(stringr)
library(reshape2)
library(plyr)
library(dplyr)

# give a web url for one species, this function will get info and synonyms for this species.
get_sp_detail = function(sp.url){
  spurl = readLines(sp.url)
  # sp name and code
  spname = str_replace(pattern = ".*<B>([&()-. A-Za-z]*)</B>.*", replacement = "\\1",
                       string = spurl[grep(pattern = "Familygenera.asp\\?Family", x = spurl)[1]+1])

  # synonyms
  if(any(str_detect(string = spurl, pattern = "\"-1\""))){
    syn.names = str_replace(pattern = ".*>([&()-. A-Za-z]*)</FONT><BR>$", replacement = "\\1",
                            string = spurl[str_detect(string = spurl, pattern = "\"-1\"")])}
  else{syn.names = NA}
  # status

  if(any(str_detect(string = spurl, pattern = "Status:"))){
    x1 = str_replace(pattern = ".*ARIAL, HELVETICA\">(.*)</FONT>.*$", replacement = "\\1",
                     string = spurl[str_detect(string = spurl, pattern = "Status:")])} else{
                       x1 = NA
                     }
  # plant descrip
  if(any(str_detect(string = spurl, pattern = "Plant:"))){
    x2 = str_replace(pattern = ".*ARIAL, HELVETICA\">(.*)</FONT>.*$", replacement = "\\1",
                     string = spurl[str_detect(string = spurl, pattern = "Plant:")])} else(
                       x2 = NA)
  # Flower
  if(any(str_detect(string = spurl, pattern = "Flower:"))){
    x3 = str_replace(pattern = ".*ARIAL, HELVETICA\">(.*)</FONT>.*$", replacement = "\\1",
                     string = spurl[str_detect(string = spurl, pattern = "Flower:")])} else{
                       x3 = NA
                     }
  # Fruit
  if(any(str_detect(string = spurl, pattern = "Fruit:"))){
    x4 = str_replace(pattern = ".*ARIAL, HELVETICA\">(.*)</FONT>.*$", replacement = "\\1",
                     string = spurl[str_detect(string = spurl, pattern = "Fruit:")])} else{
                       x4 = NA
                     }
  # Leaf
  if(any(str_detect(string = spurl, pattern = "Leaf:"))){
    x5 = str_replace(pattern = ".*ARIAL, HELVETICA\">(.*)</FONT>.*$", replacement = "\\1",
                     string = spurl[str_detect(string = spurl, pattern = "Leaf:")])} else{
                       x5 = NA
                     }
  # Habitat
  if(any(str_detect(string = spurl, pattern = "Habitat:"))){
    x6 = str_replace(pattern = ".*ARIAL, HELVETICA\">(.*)</FONT>.*$", replacement = "\\1",
                     string = spurl[str_detect(string = spurl, pattern = "Habitat:")])} else{
                       x6 = NA
                     }

  list(sp_descrip = data.frame(sp = spname, status = x1, descrip = x2, flower = x3,
                               fruit = x4, leaf = x5, habitat = x6),
       sp_synonyms = list(sp = spname, spcode = sp.url, synonyms = syn.names))
}


uwsp.herbarium = "http://wisplants.uwsp.edu/Families.html"
# the page with a list of all families
families = readLines(uwsp.herbarium)
str(families)
families = families[str_detect(string = families, pattern = "(Familygenera.asp\\?Family=[A-Za-z]*)")]
families = str_extract(string = families, pattern = "Familygenera.asp\\?Family=[A-Za-z]+")
head(families); tail(families)
fam.url = paste("http://wisplants.uwsp.edu/scripts/", families, sep = "")
head(fam.url)
length(fam.url)

uwsp.herb.details = llply(fam.url, .progress = "text", function(f.url){
  # get genus url for each family
  genus.page = readLines(f.url)
  genus.name = na.omit(str_extract(string = genus.page, pattern = "SearchResults.asp\\?Genus=[0-9A-Za-z]+"))
  genus.url = paste("http://wisplants.uwsp.edu/scripts/", genus.name, sep = "")
  # get species url for each genus
  llply(genus.url, function(g.url){
    sp.page = readLines(g.url)
    sp.name = na.omit(str_extract(string = sp.page, pattern = "detail.asp\\?SpCode=[0-9A-Za-z]+"))
    sp.url = paste("http://wisplants.uwsp.edu/scripts/", sp.name, sep = "")
    # get details info for each species
    llply(sp.url, function(s.url){
      get_sp_detail(s.url)
    })
  })
})

# sp details in data frame
sp.details.uwsp = ldply(uwsp.herb.details, function(x){
  ldply(x, function(xx) ldply(xx, function(xxx) xxx$sp_descrip))
})

sp.synon.uwsp = ldply(uwsp.herb.details, function(x){
  ldply(x, function(xx) ldply(xx, function(xxx) {
    yy = xxx$sp_synonyms
    yyy = data.frame(syn = yy$synonyms)
    yyy$sp = yy$sp
    yyy$spcode = yy$spcode
    select(yyy, sp, spcode, syn)
    }))
})

# clean results, since I do not know all formats of the info on webpage
sp.synon.uwsp$spcode = str_replace(string = sp.synon.uwsp$spcode, pattern = ".*=(.*)$", "\\1")
sp.synon.uwsp$sp = str_replace(sp.synon.uwsp$sp, pattern = ".*<B>(.*)</B>.*$", "\\1")
unique(sp.synon.uwsp$sp)
sp.synon.uwsp$syn[str_detect(sp.synon.uwsp$syn, pattern = "\"-1\"")]=NA
sp.synon.uwsp$syn[str_detect(sp.synon.uwsp$syn, pattern = "^[)]$")]=NA
sp.synon.uwsp$syn = str_replace(string = sp.synon.uwsp$syn,
                                       pattern = "^X (.*)$",
                                       replacement = "\\1")

sp.synon.uwsp.simple = sp.synon.uwsp
sp.synon.uwsp.simple$sp = str_replace(string = sp.synon.uwsp.simple$sp,
              pattern = "^([A-Za-z]* X? ?[-a-z]*) .*$",
              replacement = "\\1")
sp.synon.uwsp.simple$syn = str_replace(string = sp.synon.uwsp.simple$syn,
                                      pattern = "^([A-Za-z]* X? ?[-a-z]*) .*$",
                                      replacement = "\\1")
sp.synon.uwsp.simple$syn = str_replace(string = sp.synon.uwsp.simple$syn,
                                       pattern = "^X (.*)$",
                                       replacement = "\\1")
sp.synon.uwsp.simple = unique(sp.synon.uwsp.simple)
sp.synon.uwsp.simple = arrange(sp.synon.uwsp.simple, sp)
# put sp name in syn, so easy to check sp names
sp.synon.uwsp.simple = ddply(sp.synon.uwsp.simple, .(sp), function(x){
  if(unique(x$sp) %in% x$syn) {x} else{
    y = data.frame(sp = unique(x$sp), spcode = NA, syn = unique(x$sp))
    rbind(x, y)
  }
})

sp.details.uwsp$sp = str_replace(sp.details.uwsp$sp, pattern = ".*<B>(.*)</B>.*$", "\\1")
sp.details.uwsp$status = str_replace(sp.details.uwsp$status,
                                     pattern = "^<B>.*>(.*)</FONT></B>.*$",
                                     "\\1")
sp.details.uwsp$sp.simple = str_replace(sp.details.uwsp$sp,
                                        pattern = "^([A-Za-z]* X? ?[-a-z]*) .*$",
                                        "\\1")
names(sp.details.uwsp)
sp.details.uwsp = sp.details.uwsp[, c(1, 8, 2:7)]
for(i in 4:8){
  sp.details.uwsp[, i] = str_replace_all(sp.details.uwsp[,i],
                                     pattern = "<b>|</b>",
                                     "")
}
sp.details.uwsp$fruit = str_replace_all(sp.details.uwsp$fruit,
                                        "<IMG.*>",
                                        "")

# save results
write.table(x = sp.details.uwsp, file = "raw_data/trait/sp_detail_uwsp.csv",
            row.names=F, sep = "\t")
write.table(x = sp.synon.uwsp.simple, file = "raw_data/trait/sp_syn_uwsp.csv",
            row.names=F, sep = "\t")
saveRDS(sp.details.uwsp, file = "raw_data/trait/sp_detail_uwsp.rds")
saveRDS(sp.synon.uwsp.simple, file = "raw_data/trait/sp_syn_uwsp.rds")

# sp name check ----
check_names = function(sp.to.check, sp.dataset = sp.synon.uwsp.simple){
  sp.dataset.list = unique(na.omit(sp.dataset$syn))
  #most close spelling in the dataset
  sp.spell.checked = sapply(sp.to.check, function(x){
    # igonre xxx.spp
    if(any(str_detect(pattern = ".*sp[0-9]?$|.*spp[0-9]?$|.*sp\\.$", string = x))){
      x
    } else{
      sp.dataset.list[which.min(adist(x, sp.dataset.list))]}
  })
  #sp names used as main in webpages of UWSP herbarium
  sp.syc = sapply(sp.spell.checked, function(x){
    if(any(str_detect(pattern = ".*sp[0-9]?$|.*spp[0-9]?$|.*sp\\.$", string = x))){
      x
    } else{
      y = unique(na.omit(sp.dataset$sp[sp.dataset$syn == x]))
      if(length(y > 1)){
        y = y[which.min(adist(x, y))]
      }
      y}
  })
  data.frame(sp.origin = sp.to.check, sp.spell = sp.spell.checked,
             sp.final = sp.syc,
             spell.correct = sp.to.check == sp.spell.checked,
             syc.correct = sp.spell.checked == sp.syc)
}
```

```r
test = check_names(mass_sla_mean$sp)
head(filter(test, spell.correct == FALSE | syc.correct == FALSE))
```

```
##                       sp.origin                 sp.spell                 sp.final spell.correct syc.correct
## 1            Andropogon gerardi      Andropogon gerardii      Andropogon gerardii         FALSE        TRUE
## 2            Anemone canadenses       Anemone canadensis       Anemone canadensis         FALSE        TRUE
## 3                 Aster azureus            Aster azureus    Aster oolentangiensis          TRUE       FALSE
## 4      Brickellia eupatorioides Brickellia eupatorioides     Kuhnia eupatorioides          TRUE       FALSE
## 5            Cardamine diphilla       Cardamine diphylla       Cardamine diphylla         FALSE        TRUE
## 6                  Carex eburna            Carex eburnea            Carex eburnea         FALSE        TRUE
```
	Working with datasets with inconsistent species names is annoying. For example, I am working on resurvey Pine Barrens in central Wisconsin that had been sampled in 1958. However, some species names have been changed overtime. Thus, it is necessary to sync them before data analysis. There is an excellent R package `[taxize](http://ropensci.org/tutorials/taxize_tutorial.html)` for this. However, it is global based thus sometimes can be overwhelming. And I just want to sync my species names with names from the Wisconsin State Herbarium.

	Until today, I checked species names manually, i.e. search website one by one. This will be fine if I only have one dataset to deal with. Last week, I was trying to build a functional trait dataset from different sources, e.g. carbon and nitrogen before 2013 in one spreadsheet and from this year in another; leaf area and SLA in another; plant height and flower height in another, etc. In addition, it is no clear that whether these spreadsheet have consistent species names! Plus, it is almost unavoidable that you will find typos in these species names. What should I do? As I said, `taxize` is overwhelming... So I decide to write some code to search each species names to:

	1. check for spelling, and correct it;
	2. check for synonyms and use the latest name.


	```r
	# pkgs needed
	library(stringr)
	library(reshape2)
	library(plyr)
	library(dplyr)

	# give a web url for one species, this function will get info and synonyms for this species.
	get_sp_detail = function(sp.url){
	spurl = readLines(sp.url)
	# sp name and code
	spname = str_replace(pattern = ".<B>([&()-. A-Za-z])</B>.*", replacement = "\\1",
	string = spurl[grep(pattern = "Familygenera.asp\\?Family", x = spurl)[1]+1])

	# synonyms
	if(any(str_detect(string = spurl, pattern = "\"-1\""))){
	syn.names = str_replace(pattern = ".>([&()-. A-Za-z])</FONT><BR>$", replacement = "\\1",
	string = spurl[str_detect(string = spurl, pattern = "\"-1\"")])}
	else{syn.names = NA}
	# status

	if(any(str_detect(string = spurl, pattern = "Status:"))){
	x1 = str_replace(pattern = ".ARIAL, HELVETICA\">(.)</FONT>.*$", replacement = "\\1",
	string = spurl[str_detect(string = spurl, pattern = "Status:")])} else{
	x1 = NA
	}
	# plant descrip
	if(any(str_detect(string = spurl, pattern = "Plant:"))){
	x2 = str_replace(pattern = ".ARIAL, HELVETICA\">(.)</FONT>.*$", replacement = "\\1",
	string = spurl[str_detect(string = spurl, pattern = "Plant:")])} else(
	x2 = NA)
	# Flower
	if(any(str_detect(string = spurl, pattern = "Flower:"))){
	x3 = str_replace(pattern = ".ARIAL, HELVETICA\">(.)</FONT>.*$", replacement = "\\1",
	string = spurl[str_detect(string = spurl, pattern = "Flower:")])} else{
	x3 = NA
	}
	# Fruit
	if(any(str_detect(string = spurl, pattern = "Fruit:"))){
	x4 = str_replace(pattern = ".ARIAL, HELVETICA\">(.)</FONT>.*$", replacement = "\\1",
	string = spurl[str_detect(string = spurl, pattern = "Fruit:")])} else{
	x4 = NA
	}
	# Leaf
	if(any(str_detect(string = spurl, pattern = "Leaf:"))){
	x5 = str_replace(pattern = ".ARIAL, HELVETICA\">(.)</FONT>.*$", replacement = "\\1",
	string = spurl[str_detect(string = spurl, pattern = "Leaf:")])} else{
	x5 = NA
	}
	# Habitat
	if(any(str_detect(string = spurl, pattern = "Habitat:"))){
	x6 = str_replace(pattern = ".ARIAL, HELVETICA\">(.)</FONT>.*$", replacement = "\\1",
	string = spurl[str_detect(string = spurl, pattern = "Habitat:")])} else{
	x6 = NA
	}

	list(sp_descrip = data.frame(sp = spname, status = x1, descrip = x2, flower = x3,
	fruit = x4, leaf = x5, habitat = x6),
	sp_synonyms = list(sp = spname, spcode = sp.url, synonyms = syn.names))
	}


	uwsp.herbarium = "http://wisplants.uwsp.edu/Families.html"
	# the page with a list of all families
	families = readLines(uwsp.herbarium)
	str(families)
	families = families[str_detect(string = families, pattern = "(Familygenera.asp\\?Family=[A-Za-z]*)")]
	families = str_extract(string = families, pattern = "Familygenera.asp\\?Family=[A-Za-z]+")
	head(families); tail(families)
	fam.url = paste("http://wisplants.uwsp.edu/scripts/", families, sep = "")
	head(fam.url)
	length(fam.url)

	uwsp.herb.details = llply(fam.url, .progress = "text", function(f.url){
	# get genus url for each family
	genus.page = readLines(f.url)
	genus.name = na.omit(str_extract(string = genus.page, pattern = "SearchResults.asp\\?Genus=[0-9A-Za-z]+"))
	genus.url = paste("http://wisplants.uwsp.edu/scripts/", genus.name, sep = "")
	# get species url for each genus
	llply(genus.url, function(g.url){
	sp.page = readLines(g.url)
	sp.name = na.omit(str_extract(string = sp.page, pattern = "detail.asp\\?SpCode=[0-9A-Za-z]+"))
	sp.url = paste("http://wisplants.uwsp.edu/scripts/", sp.name, sep = "")
	# get details info for each species
	llply(sp.url, function(s.url){
	get_sp_detail(s.url)
	})
	})
	})

	# sp details in data frame
	sp.details.uwsp = ldply(uwsp.herb.details, function(x){
	ldply(x, function(xx) ldply(xx, function(xxx) xxx$sp_descrip))
	})

	sp.synon.uwsp = ldply(uwsp.herb.details, function(x){
	ldply(x, function(xx) ldply(xx, function(xxx) {
	yy = xxx$sp_synonyms
	yyy = data.frame(syn = yy$synonyms)
	yyy$sp = yy$sp
	yyy$spcode = yy$spcode
	select(yyy, sp, spcode, syn)
	}))
	})

	# clean results, since I do not know all formats of the info on webpage
	sp.synon.uwsp$spcode = str_replace(string = sp.synon.uwsp$spcode, pattern = ".=(.)$", "\\1")
	sp.synon.uwsp$sp = str_replace(sp.synon.uwsp$sp, pattern = ".<B>(.)</B>.*$", "\\1")
	unique(sp.synon.uwsp$sp)
	sp.synon.uwsp$syn[str_detect(sp.synon.uwsp$syn, pattern = "\"-1\"")]=NA
	sp.synon.uwsp$syn[str_detect(sp.synon.uwsp$syn, pattern = "^[)]$")]=NA
	sp.synon.uwsp$syn = str_replace(string = sp.synon.uwsp$syn,
	pattern = "^X (.*)$",
	replacement = "\\1")

	sp.synon.uwsp.simple = sp.synon.uwsp
	sp.synon.uwsp.simple$sp = str_replace(string = sp.synon.uwsp.simple$sp,
	pattern = "^([A-Za-z]* X? ?[-a-z]) .$",
	replacement = "\\1")
	sp.synon.uwsp.simple$syn = str_replace(string = sp.synon.uwsp.simple$syn,
	pattern = "^([A-Za-z]* X? ?[-a-z]) .$",
	replacement = "\\1")
	sp.synon.uwsp.simple$syn = str_replace(string = sp.synon.uwsp.simple$syn,
	pattern = "^X (.*)$",
	replacement = "\\1")
	sp.synon.uwsp.simple = unique(sp.synon.uwsp.simple)
	sp.synon.uwsp.simple = arrange(sp.synon.uwsp.simple, sp)
	# put sp name in syn, so easy to check sp names
	sp.synon.uwsp.simple = ddply(sp.synon.uwsp.simple, .(sp), function(x){
	if(unique(x$sp) %in% x$syn) {x} else{
	y = data.frame(sp = unique(x$sp), spcode = NA, syn = unique(x$sp))
	rbind(x, y)
	}
	})

	sp.details.uwsp$sp = str_replace(sp.details.uwsp$sp, pattern = ".<B>(.)</B>.*$", "\\1")
	sp.details.uwsp$status = str_replace(sp.details.uwsp$status,
	pattern = "^<B>.>(.)</FONT></B>.*$",
	"\\1")
	sp.details.uwsp$sp.simple = str_replace(sp.details.uwsp$sp,
	pattern = "^([A-Za-z]* X? ?[-a-z]) .$",
	"\\1")
	names(sp.details.uwsp)
	sp.details.uwsp = sp.details.uwsp[, c(1, 8, 2:7)]
	for(i in 4:8){
	sp.details.uwsp[, i] = str_replace_all(sp.details.uwsp[,i],
	pattern = "<b>\|</b>",
	"")
	}
	sp.details.uwsp$fruit = str_replace_all(sp.details.uwsp$fruit,
	"<IMG.*>",
	"")

	# save results
	write.table(x = sp.details.uwsp, file = "raw_data/trait/sp_detail_uwsp.csv",
	row.names=F, sep = "\t")
	write.table(x = sp.synon.uwsp.simple, file = "raw_data/trait/sp_syn_uwsp.csv",
	row.names=F, sep = "\t")
	saveRDS(sp.details.uwsp, file = "raw_data/trait/sp_detail_uwsp.rds")
	saveRDS(sp.synon.uwsp.simple, file = "raw_data/trait/sp_syn_uwsp.rds")

	# sp name check ----
	check_names = function(sp.to.check, sp.dataset = sp.synon.uwsp.simple){
	sp.dataset.list = unique(na.omit(sp.dataset$syn))
	#most close spelling in the dataset
	sp.spell.checked = sapply(sp.to.check, function(x){
	# igonre xxx.spp
	if(any(str_detect(pattern = ".sp[0-9]?$\|.spp[0-9]?$\|.*sp\\.$", string = x))){
	x
	} else{
	sp.dataset.list[which.min(adist(x, sp.dataset.list))]}
	})
	#sp names used as main in webpages of UWSP herbarium
	sp.syc = sapply(sp.spell.checked, function(x){
	if(any(str_detect(pattern = ".sp[0-9]?$\|.spp[0-9]?$\|.*sp\\.$", string = x))){
	x
	} else{
	y = unique(na.omit(sp.dataset$sp[sp.dataset$syn == x]))
	if(length(y > 1)){
	y = y[which.min(adist(x, y))]
	}
	y}
	})
	data.frame(sp.origin = sp.to.check, sp.spell = sp.spell.checked,
	sp.final = sp.syc,
	spell.correct = sp.to.check == sp.spell.checked,
	syc.correct = sp.spell.checked == sp.syc)
	}
	```

	```r
	test = check_names(mass_sla_mean$sp)
	head(filter(test, spell.correct == FALSE \| syc.correct == FALSE))
	```

	```
	## sp.origin sp.spell sp.final spell.correct syc.correct
	## 1 Andropogon gerardi Andropogon gerardii Andropogon gerardii FALSE TRUE
	## 2 Anemone canadenses Anemone canadensis Anemone canadensis FALSE TRUE
	## 3 Aster azureus Aster azureus Aster oolentangiensis TRUE FALSE
	## 4 Brickellia eupatorioides Brickellia eupatorioides Kuhnia eupatorioides TRUE FALSE
	## 5 Cardamine diphilla Cardamine diphylla Cardamine diphylla FALSE TRUE
	## 6 Carex eburna Carex eburnea Carex eburnea FALSE TRUE
	```