Skip to content

Instantly share code, notes, and snippets.

@valentinitnelav
Last active April 9, 2017 08:52
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save valentinitnelav/3e81766928e37a4bdadcedf97450f44e to your computer and use it in GitHub Desktop.
Save valentinitnelav/3e81766928e37a4bdadcedf97450f44e to your computer and use it in GitHub Desktop.
Read list items with {XML} using the XPath selector
# Load library
library(XML)
# -----------------------
# Fetch and parse the web page
# -----------------------
# Address of the pond report page that the rest of the script queries.
link <- "http://www.adirondacklakessurvey.org/alscrpt.inc.php?alscpond=020225B&pname=ALLEGANY%20BROOK%20POND"
# NOTE: it is good etiquette to parse the page once and reuse the stored
# document, rather than requesting it repeatedly and overloading their server.
link.scrap <- htmlParse(file = link)
# -----------------------
# Read all list items from the page
# -----------------------
# Select every <li> node of the parsed document and extract its text value.
# vapply() is used instead of sapply() so the result is always a character
# vector: with no matching nodes sapply() would silently return an empty list,
# whereas vapply() returns character(0).
all.list.items <- vapply(getNodeSet(doc = link.scrap, path = "//li"),
                         xmlValue,
                         FUN.VALUE = character(1))
# Preview the first items; head() does not pad with NA when fewer than five
# items were found (unlike indexing with 1:5).
head(all.list.items, 5)
## [1] "Location/General"
## [2] "Pond Name: ALLEGANY BROOK POND"
## [3] "Pond #: 020225B"
## [4] "Town: Black Brook"
## [5] "County: Clinton"
# -----------------------
# Read one specific list item through its XPath selector
# -----------------------
# getNodeSet() returns the set of nodes matched by the XPath expression;
# here the expression targets the first <li> under the element whose id is
# "historic_report_location".
pond.name.node <- getNodeSet(
  link.scrap,
  '//*[@id="historic_report_location"]/ul/ul/li[1]'
)
pond.name.node
## [[1]]
## <li style="list-style-type:none">Pond Name: ALLEGANY BROOK POND</li>
## attr(,"class")
## [1] "XMLNodeSet"
# Take the first matched node and pull out its text content.
pond.name <- xmlValue(pond.name.node[[1L]])
pond.name
## [1] "Pond Name: ALLEGANY BROOK POND"
# The same text is the second entry of the vector holding all list items:
all.list.items[2L]
## [1] "Pond Name: ALLEGANY BROOK POND"
# -----------------------
# Select multiple list items via their XPath selectors
# -----------------------
# Several XPath selectors can be supplied at once as a character vector in
# the path argument of xpathApply(). The function given as fun (xmlValue,
# called with trim = TRUE) is applied to every matched node, so this is an
# alternative to the getNodeSet() + xmlValue() combination used above.
items.lst <- xpathApply(
  link.scrap,
  c('//*[@id="historic_report_location"]/ul/ul/li[1]',
    '//*[@id="historic_report_location"]/ul/ul/li[3]'),
  xmlValue,
  trim = TRUE
)
items.lst
## [[1]]
## [1] "Pond Name: ALLEGANY BROOK POND"
## [[2]]
## [1] "Town: Black Brook"
# Flatten the list into a character vector
items.chr.vect <- unlist(items.lst)
items.chr.vect
## [1] "Pond Name: ALLEGANY BROOK POND" "Town: Black Brook"
# Alternatively, build a one-row data.frame with one column per item.
# Note that the space in "Pond Name" shows up as a dot in the printed
# column name (see the output below).
items.df <- as.data.frame(
  items.lst,
  stringsAsFactors = FALSE,
  col.names = c("Pond Name", "Town")
)
items.df
## Pond.Name Town
## 1 Pond Name: ALLEGANY BROOK POND Town: Black Brook
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment