Created
October 18, 2017 12:42
Star
You must be signed in to star a gist
Scrape Tropicos for species distribution information
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#' Scrape Tropicos for species distribution information
#'
#' Queries the Tropicos name search for `species` and returns the
#' distribution table of the matching name. If the search is redirected
#' straight to a single name page that page is used; if the search lists
#' several names, the first one listed is used.
#'
#' Requires the rvest package and network access. Errors raised while
#' downloading or parsing a page are not caught and propagate to the caller.
#'
#' @param species A single character string: "genus species" or "genus".
#' @param quiet Logical. If `FALSE` the result is also printed before being
#'   returned; if `TRUE` (the default) it is only returned.
#' @return A data frame holding the distribution table, or `NULL` if no
#'   distribution data are present.
#' @keywords Tropicos, species distribution
#' @examples
#' \dontrun{
#' # with defaults, outputting a data frame with species distribution
#' # for Clematis
#' df <- tropicos_species_distribution()
#' }
tropicos_species_distribution <- function(species = "Clematis", quiet = TRUE) {
  # check the dependency without attaching it to the caller's search path
  if (!requireNamespace("rvest", quietly = TRUE)) {
    stop("rvest package is not available, please install 'rvest'!",
         call. = FALSE)
  }
  stopifnot(is.character(species), length(species) == 1L, !is.na(species))

  # id of the table holding the distribution records on a name page
  table_selector <- paste0(
    "#ctl00_MainContentPlaceHolder_",
    "nameDistributionsControl_gvwResults"
  )

  # Download the distribution tab of a name page and return its table as a
  # data frame (a data frame without rows / columns if the table is absent).
  fetch_distribution <- function(name_url) {
    page <- rvest::read_html(paste0(name_url, "?tab=distribution"))
    nodes <- rvest::html_nodes(page, table_selector)
    data.frame(rvest::html_table(nodes))
  }

  # load the search result page into memory
  query <- gsub(" ", "+", species, fixed = TRUE)
  search_url <- paste0("http://www.tropicos.org/NameSearch.aspx?name=", query)
  search_page <- rvest::read_html(search_url)

  # first check if the page lists a table of species or was redirected to
  # the only search result: in the latter case the citation footer holds
  # the URL of the name page (ending in its numeric identifier)
  citation <- rvest::html_text(
    rvest::html_nodes(search_page, "#ctl00_footerControl_citationDiv")
  )
  name_url <- regmatches(citation, regexpr("https?://.*[0-9]", citation))

  distribution <- NULL
  if (length(name_url) > 0) {
    # single search result, use the first URL found in the citation
    distribution <- fetch_distribution(name_url[1])
  } else {
    # multiple search results: pick the top of the list and follow this
    # link to the distribution data if available
    first_cell <- rvest::html_nodes(
      search_page,
      paste0(
        "#ctl00_MainContentPlaceHolder_nameSearchControl_gridView ",
        "tr:nth-child(2) td:nth-child(3)"
      )
    )
    links <- rvest::html_attr(rvest::html_nodes(first_cell, "a"), "href")
    links <- links[!is.na(links)]

    if (length(links) > 0) {
      # strip everything up to and including the "/Name/" prefix, keeping
      # only the species identifier (anchored match, not a character class)
      name_id <- sub("^.*/Name/", "", links[1])
      distribution <- fetch_distribution(
        paste0("http://www.tropicos.org/Name/", name_id)
      )
    }
  }

  # an empty table counts as "no data": return NULL
  if (!is.null(distribution) && prod(dim(distribution)) == 0) {
    distribution <- NULL
  }

  # return the data frame, verbose or not
  if (!quiet) {
    print(distribution)
  }
  return(distribution)
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment