Skip to content

Instantly share code, notes, and snippets.

@khufkens
Created October 18, 2017 12:42
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
Star You must be signed in to star a gist
Save khufkens/72737910eddb6399ede7eeee0465edd7 to your computer and use it in GitHub Desktop.
Scrape Tropicos for species distribution information
#' Scrape Tropicos for species distribution information
#'
#' Runs a Tropicos name search for `species` and returns the table shown on
#' the "distribution" tab of the matching name page. If the search lands
#' directly on a single name page that page is used; if it yields a list of
#' results, the first listed name is followed instead.
#' Requires the rvest package for downloading and parsing the pages.
#'
#' @param species genus species or genus, as a single character string
#' @param quiet TRUE / FALSE; when FALSE the result is printed before it
#'   is returned
#' @return a data frame with the species distribution, or NULL if no data
#'   are present
#' @keywords Tropicos, species distribution
#' @examples
#' \dontrun{
#' # with defaults, outputting a data frame with species distribution
#' # for Clematis
#' df <- tropicos_species_distribution()
#' }
tropicos_species_distribution <- function(species = "Clematis", quiet = TRUE) {
  # check for rvest without attaching it to the caller's search path;
  # all rvest functions are called with an explicit namespace below
  if (!requireNamespace("rvest", quietly = TRUE)) {
    stop("rvest package is not available, please install 'rvest'!")
  }

  # only a single query can be turned into a single search URL
  species <- as.character(species)
  if (length(species) != 1L || is.na(species)) {
    stop("species must be a single, non-missing character string")
  }

  # id of the table holding the distribution data on a name page
  distribution_table <- paste0(
    "#ctl00_MainContentPlaceHolder_",
    "nameDistributionsControl_gvwResults"
  )

  # download the distribution tab of a name page and return whatever
  # table(s) it holds as a data frame (zero-sized if there are none)
  read_distribution <- function(name_url) {
    page <- rvest::read_html(paste0(name_url, "?tab=distribution"))
    tables <- rvest::html_table(rvest::html_nodes(page, distribution_table))
    data.frame(tables)
  }

  # build the search url, spaces are passed as "+"
  species <- gsub(" ", "+", species)
  base_url <- "http://www.tropicos.org/NameSearch.aspx?name="
  search_page <- rvest::read_html(paste0(base_url, species))

  # first check if the page lists a table of species or was redirected
  # to the only search result: only a name page carries a citation
  # footer containing the url of the name
  citation <- rvest::html_text(
    rvest::html_nodes(search_page, "#ctl00_footerControl_citationDiv")
  )
  name_url <- regmatches(citation, regexpr("https?://.*[0-9]", citation))

  if (length(name_url) > 0) {
    # single search result, use the url from the citation
    distribution <- read_distribution(name_url[1])
  } else {
    # multiple search results: pick the top of the list and follow this
    # link to the distribution data if available
    first_result <- paste0(
      "#ctl00_MainContentPlaceHolder_nameSearchControl_gridView ",
      "tr:nth-child(2) td:nth-child(3)"
    )
    links <- rvest::html_attr(
      rvest::html_nodes(rvest::html_nodes(search_page, first_result), "a"),
      "href"
    )
    links <- links[!is.na(links)]

    if (length(links) == 0) {
      # no search results at all
      distribution <- NULL
    } else {
      # strip everything up to and including the "Name/" prefix, leaving
      # the species identifier number
      name_id <- sub("^.*Name/", "", links[1])
      distribution <- read_distribution(
        paste0("http://www.tropicos.org/Name/", name_id)
      )
    }
  }

  # an empty table counts as "no data": return NULL
  if (!is.null(distribution) && prod(dim(distribution)) == 0) {
    distribution <- NULL
  }

  # return the data frame, verbose or not
  if (!quiet) {
    print(distribution)
  }
  distribution
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment