valentinitnelav/Read_from_URL_lake.R

## Read_from_URL_lake.R
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Script to read list of lake links and parse through them
# for Leana
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
library(XML)
library(stringr)
library(data.table)

#####################################################
#####################################################
# Define helper functions
#####################################################
#####################################################

# ===================================================
# Function to extract between given edging characters
# ===================================================
# my_txt <- nodes[idx]
# left <- "Pond Name: "
# right <- ""
extract_between <- function(my_txt, left, right){
    # Function to extract characters between given characters
    # __ Argumets:
    # my_txt = Character variable / line of text from where to extract
    # left   = characters - left edge to extract
    # right  = characters - right edge to extract;
    # note: use right = "" to flag that there is no right part
    # __ Return value:
    # returns character

    # if the riht part is "" (nothing)
    if (right=="")
    {
        # split the text in two, take only the last (2nd) part
        split_1_2nd_part <- strsplit(x = my_txt, split = left)[[1]][2]
        return(split_1_2nd_part)
    } else
    {
        # split the text in two, take only the last (2nd) part
        split_1_2nd_part <- strsplit(x = my_txt, split = left)[[1]][2]
        # split the 2nd part from above in two and take only the first part
        split_2_1st_part <- strsplit(x = split_1_2nd_part, split = right)[[1]][1]
        return(split_2_1st_part)
    }
}


# ===================================================
# Function to extract values from each given link
# ===================================================

get_from_html <- function(link)
    # Function to read & extract values from a given html link
    # __ Argumets:
    # link = a html link
    # example
    # link <- "http://www.adirondacklakessurvey.org/alscrpt.inc.php?alscpond=040837&pname=BALSAM%20LAKE"
    # __ Return value:
    # returns a data frame with extracted values
    # some rows are recycled (repeated), like:
    # Pond Name, Town, County, Latitude, Longitude, Elevation, Surface Area, Volume,  Max Depth, Mean Depth
    # Note: warnings like "row names were found from a short variable and have been discarded"
    # should be neglected.
{
    # ---------------------------------------------------
    # Read data from HTML structure - node identification
    # ---------------------------------------------------

    # parse the html document
    thepage_html <- htmlParse(link)
    # extract and simplify information; XPath expression "//li" is used to identify nodes
    # chaning the XPath expression can identify nodes in different ways
    nodes <- sapply(getNodeSet(doc = thepage_html, path = "//li"), xmlValue)
    # sapply(getNodeSet(doc = thepage_html, path = "//div"), xmlValue)

    search_what <- c("Pond Name", "Town", "County",
                     "Latitude", "Longitude", "Elevation",
                     "Surface Area", "Volume", "Max Depth", "Mean Depth")
    extr_1 <- sapply(search_what,
                     function(what)
                     {
                         idx <- grep(pattern = what, x = nodes, fixed = TRUE)
                         extract_between(my_txt = nodes[idx], left = ': ', right = "")
                     })

    extr_1 <- as.data.frame(extr_1)

    # NOTE - spliting nodes with //div doesn't help further
    # nodes <- sapply(getNodeSet(doc = thepage_html, path = "//div"), xmlValue)
    # search_what <- c("Name: ", "Date: ", "# caught: ")
    # extr_2 <- sapply(search_what,
    #                  function(what)
    #                  {
    #                      idx <- grep(pattern = what, x = nodes, fixed = TRUE)
    #                      extract_between(my_txt = nodes[idx], left = ': ', right = "")
    #                  })

    # ---------------------------------------------------
    # Read data from HTML structure - as plain text
    # ---------------------------------------------------

    # reads as a string (each HTML line appears as an element within a character vector)
    thepage <- readLines(link)
    # write.table(thepage, "file_example.txt")

    # ___ reads dates
    # find the indices of the lines that contain the pattern "id=\"historic_report_fish_date\">Date:"
    idx.dates <- grep(pattern = "id=\"historic_report_fish_date\">Date:",  x = thepage, fixed = TRUE)
    # from each line extract the date format;
    # lapply is used to iterate through the lines
    dates <- lapply(thepage[idx.dates],
                    function(x)
                        str_extract(string = x,
                                    pattern = "[[:digit:]]{4}-[[:digit:]]{2}-[[:digit:]]{2}"))

    # ___ read fish names
    # find the indices of the lines that contain the pattern "id=\"historic_report_fish_species\">Name:"
    idx.names <- grep(pattern = "id=\"historic_report_fish_species\">Name:", x = thepage, fixed = TRUE)
    # now extract the names that appear between the pattern ">Name: " and "</div>"
    names <- lapply(thepage[idx.names],
                    function(row)
                        extract_between(my_txt = row, left = '>Name: ', right = "</div>"))

    # ___ read caughts info
    # find the indices of the lines that contain the pattern "id=\"historic_report_fish_nocaught\"># caught:"
    idx.caughts <- grep(pattern = "id=\"historic_report_fish_nocaught\"># caught:", x = thepage, fixed = TRUE)
    caughts <- lapply(thepage[idx.caughts], function(x) str_extract(string = x, pattern = "[[:digit:]]{1,100}"))


    # ---------------------------------------------------
    # put extracted data in a data frame
    # ---------------------------------------------------

    ALS_Sampling <- data.frame(Dates   = unlist(dates),
                               Names   = unlist(names),
                               Caughts = unlist(caughts))

    extraction <- cbind(t(extr_1), ALS_Sampling)

    return(extraction)
}

#####################################################
#####################################################
# Read from links
#####################################################
#####################################################

# read the list of links as data frame
links <- read.table("links.txt")

# it takes like 25-30 min to read through all pages
system.time( links_extract <- apply(links, MARGIN = 1, FUN = get_from_html) )

# bind all data frames from the list into a big data table
links_extract_dt <- data.table::rbindlist(links_extract)

# transform all columns from factor to character
str(links_extract_dt) # check structure before conversion
links_extract_dt[, names(links_extract_dt) := lapply(.SD, as.character)]
str(links_extract_dt) # check structure after conversion

#####################################################
#####################################################
# Transform latitude and longitute from DMS format
# to decimal format and save results
#####################################################
#####################################################

# ===================================================
# Function to convert coordinates from DMS or DdM
# formats to decimal format
# ===================================================
# DMS = "degrees minutes seconds"; DdM = "degrees decimal minutes"
# more on my blog:
# https://seethedatablog.wordpress.com/2016/10/30/r-converting-coordinates-from-dms-format-to-decimal/#more-837
dg2dec <- function(varb, Dg=NA, Min=NA, Sec=NA, SW.Hemisphere="S|W") {
    # Dg=decimal, Min=minutes and Sec=seconds;
    # NOTE 1 - if the format is "degrees decimal minutes - DdM" (e.g. 40° 26.767′ N) and not
    # "degrees minutes seconds - DMS" (e.g. 40° 26′ 46″ N), then call the function only with
    # Dg and Min arguments, like dg2dec(varb, Dg="°", Min="′N").
    # Same applies when there is no seconds symbol (e.g. 45°12'7.38).
    # Note that one should not use blank spaces in Dg, Min or Sec arguments (will return NA).
    # For more details on formats see:
    # https://en.wikipedia.org/wiki/Geographic_coordinate_conversion#Coordinate_format_conversion

    # Use paste0("[", Dg, Min, Sec, "]") to build regex [] pattern
    # therefore, strsplit() will split string "varb" by what symbols you give to Dg, Min, Sec
    DMS <- sapply(strsplit(varb, paste0('[', Dg, Min, Sec, ']')), as.numeric)

    # DMS is a matrix; first row contains degrees; second - minutes; third - seconds.
    # If the format is "degrees decimal minutes" (e.g. 40° 26.767′ N) and not
    # "degrees minutes seconds" (e.g. 40° 26′ 46″ N), then the matrix has only two valid rows:
    # first row contains degrees; the second - minutes;
    # therefore, compute conversion for seconds only if there are more than 2 rows in DMS
    # and Sec is different from NA (if there are seconds in the DMS format)
    decdg <- abs(DMS[1, ]) + DMS[2, ]/60 + ifelse(dim(DMS)[1] > 2  & !is.na(Sec), DMS[3, ]/3600, 0)

    # all cordinates from Southern or Western Hemispheres become negative in their decimal format
    SW <- grepl(pattern = SW.Hemisphere, x = varb, ignore.case = TRUE)
    return(ifelse(SW, -1, 1) * decdg)
}

# convert latitude and longitute,
# will create two new columns
# careful with the DMS format, it actually looks something like this (even if not visible at first look):
#  "43°37'42\" N"
#  "074°47'50\" W"
# where seconds are identified by (\" N) or (\" W), because \ is a metacharacter in regular expression,
# R will have to deal with it in an ungly way, escaping it like \\\\,
# that is why you see below Sec='\\\\" N'
links_extract_dt[, Latitude_dec := dg2dec(Latitude, Dg='°', Min="'", Sec='\\\\" N')]
links_extract_dt[, Longitude_dec := dg2dec(Longitude, Dg='°', Min="'", Sec='\\\\" W')]

# set column order as desired
setcolorder(links_extract_dt,
            neworder = c("Pond Name", "Town", "County",
                         "Latitude", "Longitude", "Latitude_dec", "Longitude_dec", "Elevation",
                         "Surface Area", "Volume", "Max Depth", "Mean Depth",
                         "Dates", "Names", "Caughts"))

# save results to csv file
write.csv(links_extract_dt, "links_extract_dt.csv", row.names = FALSE)
	# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
	# Script to read list of lake links and parse through them
	# for Leana
	# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
	library(XML)
	library(stringr)
	library(data.table)

	#####################################################
	#####################################################
	# Define helper functions
	#####################################################
	#####################################################

	# ===================================================
	# Function to extract between given edging characters
	# ===================================================
	# my_txt <- nodes[idx]
	# left <- "Pond Name: "
	# right <- ""
	extract_between <- function(my_txt, left, right){
	# Function to extract characters between given characters
	# __ Argumets:
	# my_txt = Character variable / line of text from where to extract
	# left = characters - left edge to extract
	# right = characters - right edge to extract;
	# note: use right = "" to flag that there is no right part
	# __ Return value:
	# returns character

	# if the riht part is "" (nothing)
	if (right=="")
	{
	# split the text in two, take only the last (2nd) part
	split_1_2nd_part <- strsplit(x = my_txt, split = left)[[1]][2]
	return(split_1_2nd_part)
	} else
	{
	# split the text in two, take only the last (2nd) part
	split_1_2nd_part <- strsplit(x = my_txt, split = left)[[1]][2]
	# split the 2nd part from above in two and take only the first part
	split_2_1st_part <- strsplit(x = split_1_2nd_part, split = right)[[1]][1]
	return(split_2_1st_part)
	}
	}


	# ===================================================
	# Function to extract values from each given link
	# ===================================================

	get_from_html <- function(link)
	# Function to read & extract values from a given html link
	# __ Argumets:
	# link = a html link
	# example
	# link <- "http://www.adirondacklakessurvey.org/alscrpt.inc.php?alscpond=040837&pname=BALSAM%20LAKE"
	# __ Return value:
	# returns a data frame with extracted values
	# some rows are recycled (repeated), like:
	# Pond Name, Town, County, Latitude, Longitude, Elevation, Surface Area, Volume, Max Depth, Mean Depth
	# Note: warnings like "row names were found from a short variable and have been discarded"
	# should be neglected.
	{
	# ---------------------------------------------------
	# Read data from HTML structure - node identification
	# ---------------------------------------------------

	# parse the html document
	thepage_html <- htmlParse(link)
	# extract and simplify information; XPath expression "//li" is used to identify nodes
	# chaning the XPath expression can identify nodes in different ways
	nodes <- sapply(getNodeSet(doc = thepage_html, path = "//li"), xmlValue)
	# sapply(getNodeSet(doc = thepage_html, path = "//div"), xmlValue)

	search_what <- c("Pond Name", "Town", "County",
	"Latitude", "Longitude", "Elevation",
	"Surface Area", "Volume", "Max Depth", "Mean Depth")
	extr_1 <- sapply(search_what,
	function(what)
	{
	idx <- grep(pattern = what, x = nodes, fixed = TRUE)
	extract_between(my_txt = nodes[idx], left = ': ', right = "")
	})

	extr_1 <- as.data.frame(extr_1)

	# NOTE - spliting nodes with //div doesn't help further
	# nodes <- sapply(getNodeSet(doc = thepage_html, path = "//div"), xmlValue)
	# search_what <- c("Name: ", "Date: ", "# caught: ")
	# extr_2 <- sapply(search_what,
	# function(what)
	# {
	# idx <- grep(pattern = what, x = nodes, fixed = TRUE)
	# extract_between(my_txt = nodes[idx], left = ': ', right = "")
	# })

	# ---------------------------------------------------
	# Read data from HTML structure - as plain text
	# ---------------------------------------------------

	# reads as a string (each HTML line appears as an element within a character vector)
	thepage <- readLines(link)
	# write.table(thepage, "file_example.txt")

	# ___ reads dates
	# find the indices of the lines that contain the pattern "id=\"historic_report_fish_date\">Date:"
	idx.dates <- grep(pattern = "id=\"historic_report_fish_date\">Date:", x = thepage, fixed = TRUE)
	# from each line extract the date format;
	# lapply is used to iterate through the lines
	dates <- lapply(thepage[idx.dates],
	function(x)
	str_extract(string = x,
	pattern = "[[:digit:]]{4}-[[:digit:]]{2}-[[:digit:]]{2}"))

	# ___ read fish names
	# find the indices of the lines that contain the pattern "id=\"historic_report_fish_species\">Name:"
	idx.names <- grep(pattern = "id=\"historic_report_fish_species\">Name:", x = thepage, fixed = TRUE)
	# now extract the names that appear between the pattern ">Name: " and "</div>"
	names <- lapply(thepage[idx.names],
	function(row)
	extract_between(my_txt = row, left = '>Name: ', right = "</div>"))

	# ___ read caughts info
	# find the indices of the lines that contain the pattern "id=\"historic_report_fish_nocaught\"># caught:"
	idx.caughts <- grep(pattern = "id=\"historic_report_fish_nocaught\"># caught:", x = thepage, fixed = TRUE)
	caughts <- lapply(thepage[idx.caughts], function(x) str_extract(string = x, pattern = "[[:digit:]]{1,100}"))


	# ---------------------------------------------------
	# put extracted data in a data frame
	# ---------------------------------------------------

	ALS_Sampling <- data.frame(Dates = unlist(dates),
	Names = unlist(names),
	Caughts = unlist(caughts))

	extraction <- cbind(t(extr_1), ALS_Sampling)

	return(extraction)
	}

	#####################################################
	#####################################################
	# Read from links
	#####################################################
	#####################################################

	# read the list of links as data frame
	links <- read.table("links.txt")

	# it takes like 25-30 min to read through all pages
	system.time( links_extract <- apply(links, MARGIN = 1, FUN = get_from_html) )

	# bind all data frames from the list into a big data table
	links_extract_dt <- data.table::rbindlist(links_extract)

	# transform all columns from factor to character
	str(links_extract_dt) # check structure before conversion
	links_extract_dt[, names(links_extract_dt) := lapply(.SD, as.character)]
	str(links_extract_dt) # check structure after conversion

	#####################################################
	#####################################################
	# Transform latitude and longitute from DMS format
	# to decimal format and save results
	#####################################################
	#####################################################

	# ===================================================
	# Function to convert coordinates from DMS or DdM
	# formats to decimal format
	# ===================================================
	# DMS = "degrees minutes seconds"; DdM = "degrees decimal minutes"
	# more on my blog:
	# https://seethedatablog.wordpress.com/2016/10/30/r-converting-coordinates-from-dms-format-to-decimal/#more-837
	dg2dec <- function(varb, Dg=NA, Min=NA, Sec=NA, SW.Hemisphere="S\|W") {
	# Dg=decimal, Min=minutes and Sec=seconds;
	# NOTE 1 - if the format is "degrees decimal minutes - DdM" (e.g. 40° 26.767′ N) and not
	# "degrees minutes seconds - DMS" (e.g. 40° 26′ 46″ N), then call the function only with
	# Dg and Min arguments, like dg2dec(varb, Dg="°", Min="′N").
	# Same applies when there is no seconds symbol (e.g. 45°12'7.38).
	# Note that one should not use blank spaces in Dg, Min or Sec arguments (will return NA).
	# For more details on formats see:
	# https://en.wikipedia.org/wiki/Geographic_coordinate_conversion#Coordinate_format_conversion

	# Use paste0("[", Dg, Min, Sec, "]") to build regex [] pattern
	# therefore, strsplit() will split string "varb" by what symbols you give to Dg, Min, Sec
	DMS <- sapply(strsplit(varb, paste0('[', Dg, Min, Sec, ']')), as.numeric)

	# DMS is a matrix; first row contains degrees; second - minutes; third - seconds.
	# If the format is "degrees decimal minutes" (e.g. 40° 26.767′ N) and not
	# "degrees minutes seconds" (e.g. 40° 26′ 46″ N), then the matrix has only two valid rows:
	# first row contains degrees; the second - minutes;
	# therefore, compute conversion for seconds only if there are more than 2 rows in DMS
	# and Sec is different from NA (if there are seconds in the DMS format)
	decdg <- abs(DMS[1, ]) + DMS[2, ]/60 + ifelse(dim(DMS)[1] > 2 & !is.na(Sec), DMS[3, ]/3600, 0)

	# all cordinates from Southern or Western Hemispheres become negative in their decimal format
	SW <- grepl(pattern = SW.Hemisphere, x = varb, ignore.case = TRUE)
	return(ifelse(SW, -1, 1) * decdg)
	}

	# convert latitude and longitute,
	# will create two new columns
	# careful with the DMS format, it actually looks something like this (even if not visible at first look):
	# "43°37'42\" N"
	# "074°47'50\" W"
	# where seconds are identified by (\" N) or (\" W), because \ is a metacharacter in regular expression,
	# R will have to deal with it in an ungly way, escaping it like \\\\,
	# that is why you see below Sec='\\\\" N'
	links_extract_dt[, Latitude_dec := dg2dec(Latitude, Dg='°', Min="'", Sec='\\\\" N')]
	links_extract_dt[, Longitude_dec := dg2dec(Longitude, Dg='°', Min="'", Sec='\\\\" W')]

	# set column order as desired
	setcolorder(links_extract_dt,
	neworder = c("Pond Name", "Town", "County",
	"Latitude", "Longitude", "Latitude_dec", "Longitude_dec", "Elevation",
	"Surface Area", "Volume", "Max Depth", "Mean Depth",
	"Dates", "Names", "Caughts"))

	# save results to csv file
	write.csv(links_extract_dt, "links_extract_dt.csv", row.names = FALSE)