# valentinitnelav/rvest_Read_div.R

## rvest_Read_div.R
# Read <div> HTML tag with {rvest} using CSS selector
# ====================================================

# Load library
library(rvest)

# -----------------------
# Read the web page
# -----------------------
link <- "http://www.adirondacklakessurvey.org/alscrpt.inc.php?alscpond=020225B&pname=ALLEGANY%20BROOK%20POND"
# NOTE: it is good practice to download the page once and keep the parsed
# document in `link.scrap`, rather than requesting it again and again and
# overloading their server.
link.scrap <- read_html(x = link)

# -----------------------
# Read division
# -----------------------
# Pick the node whose id is "historic_report_location", pull out its text and
# break that text into one element per line.
div.location <- link.scrap %>%
  html_nodes(css = "#historic_report_location") %>%
  html_text(trim = TRUE) %>% # trim = TRUE strips leading and trailing whitespace
  strsplit(split = "\n") %>% # cut at every \n (new line); the result is a list
  unlist() %>%               # flatten the list (or take only element one with .[[1]])
  trimws()                   # strip whitespace around each individual piece
# If `trim = TRUE` were left out of html_text(), unwanted empty values could be
# skipped by appending one more step after trimws():
#   %>% .[. != ""]
# where the dot stands for the data handed over by the preceding steps.

# Show the resulting character vector
div.location
##  [1] "Location/General"               "Pond Name: ALLEGANY BROOK POND" "Pond #: 020225B"
##  [4] "Town: Black Brook"              "County: Clinton"                "USGS Quad: Redford"
##  [7] "Watershed: Champlain"           "In the Adk park?: Y"            "Part of the ALTM program?: N"
## [10] "Ownership: Private"             "Primitive Area: None"           "Wilderness Area: None"
## [13] "Wild Forest Area: None"

# To understand why splitting by \n (new line) was needed,
# run only the html_nodes() and html_text() part:
# Same selection as before, but stop right after the text extraction so the
# unsplit text can be inspected.
div.location <- link.scrap %>%
  html_nodes(css = "#historic_report_location") %>%
  html_text(trim = TRUE)

# Display the text in two ways: print() shows each \n as an escape sequence,
# whereas cat() writes it out as an actual line break.
print(div.location)
cat(div.location)
# All in all, the wanted information is separated by \n (new line)
	# Read <div> HTML tag with {rvest} using CSS selector
	# ====================================================

	# Load library
	library(rvest)

	# -----------------------
	# Read the web page
	# -----------------------
	link <- "http://www.adirondacklakessurvey.org/alscrpt.inc.php?alscpond=020225B&pname=ALLEGANY%20BROOK%20POND"
	# NOTE: it is good practice to download the page once and keep the parsed
	# document in `link.scrap`, rather than requesting it again and again and
	# overloading their server.
	link.scrap <- read_html(x = link)

	# -----------------------
	# Read division
	# -----------------------
	# Pick the node whose id is "historic_report_location", pull out its text and
	# break that text into one element per line.
	div.location <- link.scrap %>%
	  html_nodes(css = "#historic_report_location") %>%
	  html_text(trim = TRUE) %>% # trim = TRUE strips leading and trailing whitespace
	  strsplit(split = "\n") %>% # cut at every \n (new line); the result is a list
	  unlist() %>%               # flatten the list (or take only element one with .[[1]])
	  trimws()                   # strip whitespace around each individual piece
	# If `trim = TRUE` were left out of html_text(), unwanted empty values could be
	# skipped by appending one more step after trimws():
	#   %>% .[. != ""]
	# where the dot stands for the data handed over by the preceding steps.

	# Show the resulting character vector
	div.location
	## [1] "Location/General" "Pond Name: ALLEGANY BROOK POND" "Pond #: 020225B"
	## [4] "Town: Black Brook" "County: Clinton" "USGS Quad: Redford"
	## [7] "Watershed: Champlain" "In the Adk park?: Y" "Part of the ALTM program?: N"
	## [10] "Ownership: Private" "Primitive Area: None" "Wilderness Area: None"
	## [13] "Wild Forest Area: None"

	# To understand why splitting by \n (new line) was needed,
	# run only the html_nodes() and html_text() part:
	# Same selection as before, but stop right after the text extraction so the
	# unsplit text can be inspected.
	div.location <- link.scrap %>%
	  html_nodes(css = "#historic_report_location") %>%
	  html_text(trim = TRUE)

	# Display the text in two ways: print() shows each \n as an escape sequence,
	# whereas cat() writes it out as an actual line break.
	print(div.location)
	cat(div.location)
	# All in all, the wanted information is separated by \n (new line)