Last active
April 9, 2017 08:52
-
-
Save valentinitnelav/3e81766928e37a4bdadcedf97450f44e to your computer and use it in GitHub Desktop.
Read list items with {XML} using the XPath selector
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Load library
library(XML)
# -----------------------
# Read the web page
# -----------------------
# URL of the pond report page that the rest of the script queries.
link <- "http://www.adirondacklakessurvey.org/alscrpt.inc.php?alscpond=020225B&pname=ALLEGANY%20BROOK%20POND"
# NOTE: it is good etiquette to read the page once and reuse the stored result,
# rather than requesting it repeatedly and overloading their server.
# Parse the page a single time; every query below runs against this object.
link.scrap <- htmlParse(link)
# -----------------------
# Read all list items from the page
# -----------------------
# Collect every <li> node in the document and extract its text content.
# vapply() is used instead of sapply() so the result is always a character
# vector, including the case where the page has no <li> nodes at all
# (sapply() would return an empty list there).
all.list.items <- vapply(getNodeSet(doc = link.scrap, path = "//li"),
                         xmlValue,
                         FUN.VALUE = character(1))
# Preview the first items; unlike [1:5], head() does not pad the result with
# NA when fewer than five items were found.
head(all.list.items, 5)
## [1] "Location/General"
## [2] "Pond Name: ALLEGANY BROOK POND"
## [3] "Pond #: 020225B"
## [4] "Town: Black Brook"
## [5] "County: Clinton"
# -----------------------
# Read a particular list item by using its XPath selector
# -----------------------
# XPath of the first <li> under the element with id "historic_report_location".
pond.name.xpath <- '//*[@id="historic_report_location"]/ul/ul/li[1]'
pond.name.node <- getNodeSet(doc = link.scrap, path = pond.name.xpath)
# Fail with a clear message if the selector matched nothing (for example when
# the page layout changes), instead of the opaque "subscript out of bounds"
# that [[1]] would raise further down.
if (length(pond.name.node) == 0L) {
  stop("No node matched XPath: ", pond.name.xpath, call. = FALSE)
}
pond.name.node
## [[1]]
## <li style="list-style-type:none">Pond Name: ALLEGANY BROOK POND</li>
## attr(,"class")
## [1] "XMLNodeSet"
# Extract the text content of the first (and only) matched node.
pond.name <- xmlValue(pond.name.node[[1]])
pond.name
## [1] "Pond Name: ALLEGANY BROOK POND"
# This is the same as:
all.list.items[2]
## [1] "Pond Name: ALLEGANY BROOK POND"
# -----------------------
# Select multiple list items via their XPath selectors
# -----------------------
# Use a vector of XPath selectors in the path argument of xpathApply().
# Note the usage of xpathApply() together with a call to the xmlValue() function.
# This is an alternative to using getNodeSet() + xmlValue() above.
# NOTE(review): presumably the selectors are combined into a single XPath union,
# in which case results come back in document order rather than in the order
# the selectors are listed here -- confirm against the {XML} documentation.
items.xpaths <- c('//*[@id="historic_report_location"]/ul/ul/li[1]',
                  '//*[@id="historic_report_location"]/ul/ul/li[3]')
# trim = TRUE is forwarded to xmlValue() for each matched node.
items.lst <- xpathApply(doc = link.scrap,
                        path = items.xpaths,
                        fun = xmlValue,
                        trim = TRUE)
items.lst
## [[1]]
## [1] "Pond Name: ALLEGANY BROOK POND"
## [[2]]
## [1] "Town: Black Brook"
# Transform to a character vector
items.chr.vect <- unlist(items.lst)
items.chr.vect
## [1] "Pond Name: ALLEGANY BROOK POND" "Town: Black Brook"
# Or transform to a data.frame object
items.col.names <- c("Pond Name", "Town")
# Column names are assigned by position, so make sure every selector produced
# exactly one value before labelling the columns.
stopifnot(length(items.lst) == length(items.col.names))
# As the printed result shows, the space in "Pond Name" becomes a dot in the
# resulting column name.
items.df <- as.data.frame(items.lst,
                          col.names = items.col.names,
                          stringsAsFactors = FALSE)
items.df
## Pond.Name Town
## 1 Pond Name: ALLEGANY BROOK POND Town: Black Brook
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment