Created
February 22, 2017 09:42
-
-
Save valentinitnelav/f60ab5e94cf80cbd1c5e1280a6551bc5 to your computer and use it in GitHub Desktop.
Script to read list of lake links and parse through them (for Leana)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | |
# Script to read list of lake links and parse through them | |
# for Leana | |
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | |
library(XML) | |
library(stringr) | |
library(data.table) | |
##################################################### | |
##################################################### | |
# Define helper functions | |
##################################################### | |
##################################################### | |
# =================================================== | |
# Function to extract between given edging characters | |
# =================================================== | |
# my_txt <- nodes[idx] | |
# left <- "Pond Name: " | |
# right <- "" | |
extract_between <- function(my_txt, left, right){ | |
# Function to extract characters between given characters | |
# __ Argumets: | |
# my_txt = Character variable / line of text from where to extract | |
# left = characters - left edge to extract | |
# right = characters - right edge to extract; | |
# note: use right = "" to flag that there is no right part | |
# __ Return value: | |
# returns character | |
# if the riht part is "" (nothing) | |
if (right=="") | |
{ | |
# split the text in two, take only the last (2nd) part | |
split_1_2nd_part <- strsplit(x = my_txt, split = left)[[1]][2] | |
return(split_1_2nd_part) | |
} else | |
{ | |
# split the text in two, take only the last (2nd) part | |
split_1_2nd_part <- strsplit(x = my_txt, split = left)[[1]][2] | |
# split the 2nd part from above in two and take only the first part | |
split_2_1st_part <- strsplit(x = split_1_2nd_part, split = right)[[1]][1] | |
return(split_2_1st_part) | |
} | |
} | |
# =================================================== | |
# Function to extract values from each given link | |
# =================================================== | |
get_from_html <- function(link) | |
# Function to read & extract values from a given html link | |
# __ Argumets: | |
# link = a html link | |
# example | |
# link <- "http://www.adirondacklakessurvey.org/alscrpt.inc.php?alscpond=040837&pname=BALSAM%20LAKE" | |
# __ Return value: | |
# returns a data frame with extracted values | |
# some rows are recycled (repeated), like: | |
# Pond Name, Town, County, Latitude, Longitude, Elevation, Surface Area, Volume, Max Depth, Mean Depth | |
# Note: warnings like "row names were found from a short variable and have been discarded" | |
# should be neglected. | |
{ | |
# --------------------------------------------------- | |
# Read data from HTML structure - node identification | |
# --------------------------------------------------- | |
# parse the html document | |
thepage_html <- htmlParse(link) | |
# extract and simplify information; XPath expression "//li" is used to identify nodes | |
# chaning the XPath expression can identify nodes in different ways | |
nodes <- sapply(getNodeSet(doc = thepage_html, path = "//li"), xmlValue) | |
# sapply(getNodeSet(doc = thepage_html, path = "//div"), xmlValue) | |
search_what <- c("Pond Name", "Town", "County", | |
"Latitude", "Longitude", "Elevation", | |
"Surface Area", "Volume", "Max Depth", "Mean Depth") | |
extr_1 <- sapply(search_what, | |
function(what) | |
{ | |
idx <- grep(pattern = what, x = nodes, fixed = TRUE) | |
extract_between(my_txt = nodes[idx], left = ': ', right = "") | |
}) | |
extr_1 <- as.data.frame(extr_1) | |
# NOTE - spliting nodes with //div doesn't help further | |
# nodes <- sapply(getNodeSet(doc = thepage_html, path = "//div"), xmlValue) | |
# search_what <- c("Name: ", "Date: ", "# caught: ") | |
# extr_2 <- sapply(search_what, | |
# function(what) | |
# { | |
# idx <- grep(pattern = what, x = nodes, fixed = TRUE) | |
# extract_between(my_txt = nodes[idx], left = ': ', right = "") | |
# }) | |
# --------------------------------------------------- | |
# Read data from HTML structure - as plain text | |
# --------------------------------------------------- | |
# reads as a string (each HTML line appears as an element within a character vector) | |
thepage <- readLines(link) | |
# write.table(thepage, "file_example.txt") | |
# ___ reads dates | |
# find the indices of the lines that contain the pattern "id=\"historic_report_fish_date\">Date:" | |
idx.dates <- grep(pattern = "id=\"historic_report_fish_date\">Date:", x = thepage, fixed = TRUE) | |
# from each line extract the date format; | |
# lapply is used to iterate through the lines | |
dates <- lapply(thepage[idx.dates], | |
function(x) | |
str_extract(string = x, | |
pattern = "[[:digit:]]{4}-[[:digit:]]{2}-[[:digit:]]{2}")) | |
# ___ read fish names | |
# find the indices of the lines that contain the pattern "id=\"historic_report_fish_species\">Name:" | |
idx.names <- grep(pattern = "id=\"historic_report_fish_species\">Name:", x = thepage, fixed = TRUE) | |
# now extract the names that appear between the pattern ">Name: " and "</div>" | |
names <- lapply(thepage[idx.names], | |
function(row) | |
extract_between(my_txt = row, left = '>Name: ', right = "</div>")) | |
# ___ read caughts info | |
# find the indices of the lines that contain the pattern "id=\"historic_report_fish_nocaught\"># caught:" | |
idx.caughts <- grep(pattern = "id=\"historic_report_fish_nocaught\"># caught:", x = thepage, fixed = TRUE) | |
caughts <- lapply(thepage[idx.caughts], function(x) str_extract(string = x, pattern = "[[:digit:]]{1,100}")) | |
# --------------------------------------------------- | |
# put extracted data in a data frame | |
# --------------------------------------------------- | |
ALS_Sampling <- data.frame(Dates = unlist(dates), | |
Names = unlist(names), | |
Caughts = unlist(caughts)) | |
extraction <- cbind(t(extr_1), ALS_Sampling) | |
return(extraction) | |
} | |
##################################################### | |
##################################################### | |
# Read from links | |
##################################################### | |
##################################################### | |
# read the list of links as data frame | |
links <- read.table("links.txt") | |
# it takes like 25-30 min to read through all pages | |
system.time( links_extract <- apply(links, MARGIN = 1, FUN = get_from_html) ) | |
# bind all data frames from the list into a big data table | |
links_extract_dt <- data.table::rbindlist(links_extract) | |
# transform all columns from factor to character | |
str(links_extract_dt) # check structure before conversion | |
links_extract_dt[, names(links_extract_dt) := lapply(.SD, as.character)] | |
str(links_extract_dt) # check structure after conversion | |
##################################################### | |
##################################################### | |
# Transform latitude and longitute from DMS format | |
# to decimal format and save results | |
##################################################### | |
##################################################### | |
# =================================================== | |
# Function to convert coordinates from DMS or DdM | |
# formats to decimal format | |
# =================================================== | |
# DMS = "degrees minutes seconds"; DdM = "degrees decimal minutes" | |
# more on my blog: | |
# https://seethedatablog.wordpress.com/2016/10/30/r-converting-coordinates-from-dms-format-to-decimal/#more-837 | |
dg2dec <- function(varb, Dg=NA, Min=NA, Sec=NA, SW.Hemisphere="S|W") { | |
# Dg=decimal, Min=minutes and Sec=seconds; | |
# NOTE 1 - if the format is "degrees decimal minutes - DdM" (e.g. 40° 26.767′ N) and not | |
# "degrees minutes seconds - DMS" (e.g. 40° 26′ 46″ N), then call the function only with | |
# Dg and Min arguments, like dg2dec(varb, Dg="°", Min="′N"). | |
# Same applies when there is no seconds symbol (e.g. 45°12'7.38). | |
# Note that one should not use blank spaces in Dg, Min or Sec arguments (will return NA). | |
# For more details on formats see: | |
# https://en.wikipedia.org/wiki/Geographic_coordinate_conversion#Coordinate_format_conversion | |
# Use paste0("[", Dg, Min, Sec, "]") to build regex [] pattern | |
# therefore, strsplit() will split string "varb" by what symbols you give to Dg, Min, Sec | |
DMS <- sapply(strsplit(varb, paste0('[', Dg, Min, Sec, ']')), as.numeric) | |
# DMS is a matrix; first row contains degrees; second - minutes; third - seconds. | |
# If the format is "degrees decimal minutes" (e.g. 40° 26.767′ N) and not | |
# "degrees minutes seconds" (e.g. 40° 26′ 46″ N), then the matrix has only two valid rows: | |
# first row contains degrees; the second - minutes; | |
# therefore, compute conversion for seconds only if there are more than 2 rows in DMS | |
# and Sec is different from NA (if there are seconds in the DMS format) | |
decdg <- abs(DMS[1, ]) + DMS[2, ]/60 + ifelse(dim(DMS)[1] > 2 & !is.na(Sec), DMS[3, ]/3600, 0) | |
# all cordinates from Southern or Western Hemispheres become negative in their decimal format | |
SW <- grepl(pattern = SW.Hemisphere, x = varb, ignore.case = TRUE) | |
return(ifelse(SW, -1, 1) * decdg) | |
} | |
# convert latitude and longitute, | |
# will create two new columns | |
# careful with the DMS format, it actually looks something like this (even if not visible at first look): | |
# "43°37'42\" N" | |
# "074°47'50\" W" | |
# where seconds are identified by (\" N) or (\" W), because \ is a metacharacter in regular expression, | |
# R will have to deal with it in an ungly way, escaping it like \\\\, | |
# that is why you see below Sec='\\\\" N' | |
links_extract_dt[, Latitude_dec := dg2dec(Latitude, Dg='°', Min="'", Sec='\\\\" N')] | |
links_extract_dt[, Longitude_dec := dg2dec(Longitude, Dg='°', Min="'", Sec='\\\\" W')] | |
# set column order as desired | |
setcolorder(links_extract_dt, | |
neworder = c("Pond Name", "Town", "County", | |
"Latitude", "Longitude", "Latitude_dec", "Longitude_dec", "Elevation", | |
"Surface Area", "Volume", "Max Depth", "Mean Depth", | |
"Dates", "Names", "Caughts")) | |
# save results to csv file | |
write.csv(links_extract_dt, "links_extract_dt.csv", row.names = FALSE) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment