# khufkens/tropicos_species_distribution.r

## tropicos_species_distribution.r
#' Query the Tropicos species distribution table for a taxon
#'
#' Searches www.tropicos.org for `species`. If the search is redirected
#' straight to a single name page, that page is used; otherwise the first
#' name listed in the search results is followed. The table shown on the
#' "distribution" tab of that name page is returned.
#'
#' @param species genus species or genus; spaces are replaced by "+" to
#'   build the search query
#' @param quiet TRUE / FALSE; when FALSE the result is also printed
#' @return a data frame with the species distribution, or NULL if no
#'   data are present
#' @keywords Tropicos, species distribution
#' @note requires the rvest package for post-processing
#' @examples
#' \dontrun{
#' # with defaults, outputting a data frame with species distribution
#' # for Clematis
#' df <- tropicos_species_distribution()
#' }

tropicos_species_distribution <- function(species = 'Clematis', quiet = TRUE) {

  # read the required libraries; require() is used on purpose here because
  # its logical return value lets us fail with a clear message
  if (!require(rvest)) {
    stop("rvest package is not available, please install 'rvest'!")
  }

  # CSS id of the table holding the distribution records on a name page
  distribution_table <-
    "#ctl00_MainContentPlaceHolder_nameDistributionsControl_gvwResults"

  # Download the distribution tab of a name page and return its table as a
  # data frame, or NULL when the page has no (or an empty) table.
  fetch_distribution <- function(name_url) {
    tab <- read_html(sprintf("%s%s", name_url, "?tab=distribution")) %>%
      html_nodes(distribution_table) %>%
      html_table() %>%
      data.frame()

    if (nrow(tab) == 0 || ncol(tab) == 0) {
      return(NULL)
    }
    tab
  }

  # build the search url and load the result page into memory
  query <- gsub(" ", "+", species)
  base_url <- "http://www.tropicos.org/NameSearch.aspx?name="
  search_page <- read_html(sprintf("%s%s", base_url, query))

  # first check if the page lists a table of species
  # or was redirected to the only search result presented:
  # a name page cites its own url (ending in the numeric name id)
  citation <- search_page %>%
    html_nodes("#ctl00_footerControl_citationDiv") %>%
    html_text()

  # stop at whitespace so text following the url can never be swallowed
  # by the match; accept https as well as http
  name_url <- regmatches(
    citation,
    regexpr("https?://[^[:space:]]*[0-9]", citation)
  )

  if (length(name_url) > 0) {

    # redirected to a single name page, use the cited url directly
    distribution <- fetch_distribution(name_url[1])

  } else {

    # if multiple search results are present, pick the top of the list
    # and follow this link to the distribution data if available
    hrefs <- search_page %>%
      html_nodes("#ctl00_MainContentPlaceHolder_nameSearchControl_gridView tr:nth-child(2) td:nth-child(3)") %>%
      html_nodes("a") %>%
      html_attr("href")

    # keep only links that point to a name page; this also drops NA hrefs
    name_links <- grep("Name/[0-9]+", hrefs, value = TRUE)

    if (length(name_links) == 0) {
      # nothing usable listed
      distribution <- NULL
    } else {
      # grab the numeric species identifier of the first link only;
      # an explicit capture is used instead of stripping a character
      # class, which would delete any '/', 'N', 'a', 'm' or 'e'
      name_id <- sub("^.*Name/([0-9]+).*$", "\\1", name_links[1])

      distribution <- fetch_distribution(
        sprintf("http://www.tropicos.org/Name/%s", name_id)
      )
    }
  }

  # return the data frame, verbose or not
  if (!quiet) {
    print(distribution)
  }
  distribution
}
	#' Query the Tropicos species distribution table for a taxon
	#'
	#' Searches www.tropicos.org for `species`. If the search is redirected
	#' straight to a single name page, that page is used; otherwise the first
	#' name listed in the search results is followed. The table shown on the
	#' "distribution" tab of that name page is returned.
	#'
	#' @param species genus species or genus; spaces are replaced by "+" to
	#'   build the search query
	#' @param quiet TRUE / FALSE; when FALSE the result is also printed
	#' @return a data frame with the species distribution, or NULL if no
	#'   data are present
	#' @keywords Tropicos, species distribution
	#' @note requires the rvest package for post-processing
	#' @examples
	#' \dontrun{
	#' # with defaults, outputting a data frame with species distribution
	#' # for Clematis
	#' df <- tropicos_species_distribution()
	#' }

	tropicos_species_distribution <- function(species = 'Clematis', quiet = TRUE) {

	  # read the required libraries; require() is used on purpose here because
	  # its logical return value lets us fail with a clear message
	  if (!require(rvest)) {
	    stop("rvest package is not available, please install 'rvest'!")
	  }

	  # CSS id of the table holding the distribution records on a name page
	  results_selector <-
	    "#ctl00_MainContentPlaceHolder_nameDistributionsControl_gvwResults"

	  # Download the distribution tab of a name page and return its table as a
	  # data frame, or NULL when the page has no (or an empty) table.
	  read_distribution <- function(page_url) {
	    records <- read_html(sprintf("%s%s", page_url, "?tab=distribution")) %>%
	      html_nodes(results_selector) %>%
	      html_table() %>%
	      data.frame()

	    if (nrow(records) == 0 || ncol(records) == 0) {
	      return(NULL)
	    }
	    records
	  }

	  # build the search url and load the result page into memory
	  query <- gsub(" ", "+", species)
	  base_url <- "http://www.tropicos.org/NameSearch.aspx?name="
	  html_page <- read_html(sprintf("%s%s", base_url, query))

	  # first check if the page lists a table of species
	  # or was redirected to the only search result presented:
	  # a name page cites its own url (ending in the numeric name id)
	  check <- html_page %>%
	    html_nodes("#ctl00_footerControl_citationDiv") %>%
	    html_text()

	  # stop at whitespace so text following the url can never be swallowed
	  # by the match; accept https as well as http
	  final_url <- regmatches(
	    check,
	    regexpr("https?://[^[:space:]]*[0-9]", check)
	  )

	  if (length(final_url) > 0) {

	    # redirected to a single name page, use the cited url directly
	    distribution <- read_distribution(final_url[1])

	  } else {

	    # if multiple search results are present, pick the top of the list
	    # and follow this link to the distribution data if available
	    listed_links <- html_page %>%
	      html_nodes("#ctl00_MainContentPlaceHolder_nameSearchControl_gridView tr:nth-child(2) td:nth-child(3)") %>%
	      html_nodes("a") %>%
	      html_attr("href")

	    # keep only links that point to a name page; this also drops NA hrefs
	    listed_links <- grep("Name/[0-9]+", listed_links, value = TRUE)

	    if (length(listed_links) == 0) {
	      # nothing usable listed
	      distribution <- NULL
	    } else {
	      # grab the numeric species identifier of the first link only;
	      # an explicit capture is used instead of stripping a character
	      # class, which would delete any '/', 'N', 'a', 'm' or 'e'
	      species_id <- sub("^.*Name/([0-9]+).*$", "\\1", listed_links[1])

	      distribution <- read_distribution(
	        sprintf("http://www.tropicos.org/Name/%s", species_id)
	      )
	    }
	  }

	  # return the data frame, verbose or not
	  if (!quiet) {
	    print(distribution)
	  }
	  distribution
	}