# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Script to read list of lake links and parse through them
# for Leana
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
library(XML)
library(stringr)
library(data.table)
#####################################################
#####################################################
# Define helper functions
#####################################################
#####################################################
# ===================================================
# Function to extract text between given delimiters
# ===================================================
# my_txt <- nodes[idx]
# left <- "Pond Name: "
# right <- ""
extract_between <- function(my_txt, left, right){
  # Function to extract the characters between two given delimiters
  # __ Arguments:
  # my_txt = character variable / line of text from which to extract
  # left   = characters forming the left edge of the extraction
  # right  = characters forming the right edge of the extraction;
  #          note: use right = "" to flag that there is no right part
  # __ Return value:
  # returns a character value
  # if the right part is "" (nothing)
  if (right == "")
  {
    # split the text in two, keep only the last (2nd) part
    split_1_2nd_part <- strsplit(x = my_txt, split = left)[[1]][2]
    return(split_1_2nd_part)
  } else
  {
    # split the text in two, keep only the last (2nd) part
    split_1_2nd_part <- strsplit(x = my_txt, split = left)[[1]][2]
    # split the 2nd part from above in two and keep only the first part
    split_2_1st_part <- strsplit(x = split_1_2nd_part, split = right)[[1]][1]
    return(split_2_1st_part)
  }
}
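# Quick illustration of extract_between() on made-up strings
# (illustrative examples only, not taken from the scraped data):
# extract_between("Pond Name: BALSAM LAKE", left = "Pond Name: ", right = "")
# # [1] "BALSAM LAKE"
# extract_between(">Name: Brook Trout</div>", left = ">Name: ", right = "</div>")
# # [1] "Brook Trout"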
# ===================================================
# Function to extract values from each given link
# ===================================================
get_from_html <- function(link)
# Function to read & extract values from a given HTML link
# __ Arguments:
# link = an HTML link
# example
# link <- "http://www.adirondacklakessurvey.org/alscrpt.inc.php?alscpond=040837&pname=BALSAM%20LAKE"
# __ Return value:
# returns a data frame with the extracted values
# some rows are recycled (repeated), like:
# Pond Name, Town, County, Latitude, Longitude, Elevation, Surface Area, Volume, Max Depth, Mean Depth
# Note: warnings like "row names were found from a short variable and have been discarded"
# can be safely ignored.
{
  # ---------------------------------------------------
  # Read data from HTML structure - node identification
  # ---------------------------------------------------
  # parse the HTML document
  thepage_html <- htmlParse(link)
  # extract and simplify information; the XPath expression "//li" is used to identify nodes
  # changing the XPath expression identifies nodes in different ways
  nodes <- sapply(getNodeSet(doc = thepage_html, path = "//li"), xmlValue)
  # sapply(getNodeSet(doc = thepage_html, path = "//div"), xmlValue)
  search_what <- c("Pond Name", "Town", "County",
                   "Latitude", "Longitude", "Elevation",
                   "Surface Area", "Volume", "Max Depth", "Mean Depth")
  extr_1 <- sapply(search_what,
                   function(what)
                   {
                     idx <- grep(pattern = what, x = nodes, fixed = TRUE)
                     extract_between(my_txt = nodes[idx], left = ': ', right = "")
                   })
  extr_1 <- as.data.frame(extr_1)
  # NOTE - splitting nodes with //div doesn't help further
  # nodes <- sapply(getNodeSet(doc = thepage_html, path = "//div"), xmlValue)
  # search_what <- c("Name: ", "Date: ", "# caught: ")
  # extr_2 <- sapply(search_what,
  #                  function(what)
  #                  {
  #                    idx <- grep(pattern = what, x = nodes, fixed = TRUE)
  #                    extract_between(my_txt = nodes[idx], left = ': ', right = "")
  #                  })
  # ---------------------------------------------------
  # Read data from HTML structure - as plain text
  # ---------------------------------------------------
  # read as strings (each HTML line becomes an element of a character vector)
  thepage <- readLines(link)
  # write.table(thepage, "file_example.txt")
  # ___ read dates
  # find the indices of the lines that contain the pattern "id=\"historic_report_fish_date\">Date:"
  idx.dates <- grep(pattern = "id=\"historic_report_fish_date\">Date:", x = thepage, fixed = TRUE)
  # from each such line extract the date;
  # lapply is used to iterate through the lines
  dates <- lapply(thepage[idx.dates],
                  function(x)
                    str_extract(string = x,
                                pattern = "[[:digit:]]{4}-[[:digit:]]{2}-[[:digit:]]{2}"))
  # ___ read fish names
  # find the indices of the lines that contain the pattern "id=\"historic_report_fish_species\">Name:"
  idx.names <- grep(pattern = "id=\"historic_report_fish_species\">Name:", x = thepage, fixed = TRUE)
  # now extract the names that appear between the patterns ">Name: " and "</div>"
  # (stored as fish_names so as not to mask base::names)
  fish_names <- lapply(thepage[idx.names],
                       function(row)
                         extract_between(my_txt = row, left = '>Name: ', right = "</div>"))
  # ___ read counts of fish caught
  # find the indices of the lines that contain the pattern "id=\"historic_report_fish_nocaught\"># caught:"
  idx.caughts <- grep(pattern = "id=\"historic_report_fish_nocaught\"># caught:", x = thepage, fixed = TRUE)
  caughts <- lapply(thepage[idx.caughts], function(x) str_extract(string = x, pattern = "[[:digit:]]{1,100}"))
  # ---------------------------------------------------
  # put the extracted data in a data frame
  # ---------------------------------------------------
  ALS_Sampling <- data.frame(Dates   = unlist(dates),
                             Names   = unlist(fish_names),
                             Caughts = unlist(caughts))
  extraction <- cbind(t(extr_1), ALS_Sampling)
  return(extraction)
}
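# Optional smoke test: run the function on the single example link from above
# before committing to the full 25-30 min loop (assumes the page is still reachable):
# one_lake <- get_from_html("http://www.adirondacklakessurvey.org/alscrpt.inc.php?alscpond=040837&pname=BALSAM%20LAKE")
# head(one_lake)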
#####################################################
#####################################################
# Read from links
#####################################################
#####################################################
# read the list of links as data frame
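# (links.txt is not part of this gist; it is assumed to hold one report URL per line,
#  no header, each URL in the same form as the example link above)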
links <- read.table("links.txt")
# reading through all pages takes about 25-30 minutes
system.time( links_extract <- apply(links, MARGIN = 1, FUN = get_from_html) )
# bind all data frames from the list into a big data table
links_extract_dt <- data.table::rbindlist(links_extract)
# transform all columns from factor to character
str(links_extract_dt) # check structure before conversion
links_extract_dt[, names(links_extract_dt) := lapply(.SD, as.character)]
str(links_extract_dt) # check structure after conversion
#####################################################
#####################################################
# Transform latitude and longitude from DMS format
# to decimal format and save results
#####################################################
#####################################################
# ===================================================
# Function to convert coordinates from DMS or DdM
# formats to decimal format
# ===================================================
# DMS = "degrees minutes seconds"; DdM = "degrees decimal minutes"
# more on my blog:
# https://seethedatablog.wordpress.com/2016/10/30/r-converting-coordinates-from-dms-format-to-decimal/#more-837
dg2dec <- function(varb, Dg=NA, Min=NA, Sec=NA, SW.Hemisphere="S|W") {
  # Dg=degrees, Min=minutes and Sec=seconds;
  # NOTE 1 - if the format is "degrees decimal minutes - DdM" (e.g. 40° 26.767′ N) and not
  # "degrees minutes seconds - DMS" (e.g. 40° 26′ 46″ N), then call the function only with
  # the Dg and Min arguments, like dg2dec(varb, Dg="°", Min="′N").
  # The same applies when there is no seconds symbol (e.g. 45°12'7.38).
  # Note that one should not use blank spaces in the Dg, Min or Sec arguments (that returns NA).
  # For more details on formats see:
  # https://en.wikipedia.org/wiki/Geographic_coordinate_conversion#Coordinate_format_conversion
  # Use paste0("[", Dg, Min, Sec, "]") to build a regex [] pattern;
  # strsplit() will therefore split the string "varb" by the symbols given in Dg, Min, Sec
  DMS <- sapply(strsplit(varb, paste0('[', Dg, Min, Sec, ']')), as.numeric)
  # DMS is a matrix; the first row contains degrees, the second minutes, the third seconds.
  # If the format is "degrees decimal minutes" (e.g. 40° 26.767′ N) and not
  # "degrees minutes seconds" (e.g. 40° 26′ 46″ N), then the matrix has only two valid rows:
  # the first row contains degrees and the second minutes;
  # therefore, add the seconds term only if there are more than 2 rows in DMS
  # and Sec is not NA (i.e. there are seconds in the DMS format)
  decdg <- abs(DMS[1, ]) + DMS[2, ]/60 + ifelse(dim(DMS)[1] > 2 & !is.na(Sec), DMS[3, ]/3600, 0)
  # all coordinates from the Southern or Western Hemispheres become negative in decimal format
  SW <- grepl(pattern = SW.Hemisphere, x = varb, ignore.case = TRUE)
  return(ifelse(SW, -1, 1) * decdg)
}
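# Illustrative conversion on made-up coordinates in the same DMS style as this data
# (note: pass a vector of at least two strings - with a single string, sapply() inside
# dg2dec() simplifies to a vector instead of a matrix and the row indexing fails):
# dg2dec(c("43°37'42\" N", "44°10'05\" N"), Dg = '°', Min = "'", Sec = '\\\\" N')
# # [1] 43.62833 44.16806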
# convert latitude and longitude;
# this creates two new columns
# be careful with the DMS format - it actually looks something like this (even if not visible at first glance):
# "43°37'42\" N"
# "074°47'50\" W"
# where seconds are marked by (\" N) or (\" W); because \ is a metacharacter in regular expressions,
# R has to deal with it in an ugly way, escaping it as \\\\,
# which is why Sec='\\\\" N' appears below
links_extract_dt[, Latitude_dec := dg2dec(Latitude, Dg='°', Min="'", Sec='\\\\" N')]
links_extract_dt[, Longitude_dec := dg2dec(Longitude, Dg='°', Min="'", Sec='\\\\" W')]
# set column order as desired
setcolorder(links_extract_dt,
            neworder = c("Pond Name", "Town", "County",
                         "Latitude", "Longitude", "Latitude_dec", "Longitude_dec", "Elevation",
                         "Surface Area", "Volume", "Max Depth", "Mean Depth",
                         "Dates", "Names", "Caughts"))
# save results to csv file
write.csv(links_extract_dt, "links_extract_dt.csv", row.names = FALSE)