Last active
March 26, 2017 19:49
-
-
Save valentinitnelav/8c5fa64c088497104e418ea10ad6fd57 to your computer and use it in GitHub Desktop.
Read list items with {rvest} using CSS or XPath selectors
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Demonstrates reading HTML list items (<li>) from a web page with {rvest},
# first all of them at once, then a single item addressed by either a CSS
# selector or an equivalent XPath expression.
# Lines starting with "##" show the output that was printed when the script
# was originally run.

# Load library ({rvest} also makes the %>% pipe used below available)
library(rvest)

# -----------------------
# Read the web page
# -----------------------
link <- "http://www.adirondacklakessurvey.org/alscrpt.inc.php?alscpond=020225B&pname=ALLEGANY%20BROOK%20POND"
# NOTE: it is good etiquette to download the page once and reuse the parsed
# object, rather than re-reading it many times and overloading their server.
link.scrap <- read_html(link)

# -----------------------
# Read all list items from the page
# -----------------------
# Select every <li> node in the document and extract its text content.
all.list.items <- html_nodes(x = link.scrap,
                             css = "li") %>%
  html_text()
# head() is used instead of [1:5] so that a page with fewer than five list
# items does not get padded with NA values.
head(all.list.items, n = 5)
## [1] "Location/General"
## [2] "Pond Name: ALLEGANY BROOK POND"
## [3] "Pond #: 020225B"
## [4] "Town: Black Brook"
## [5] "County: Clinton"

# -----------------------
# Read a particular list item - use its CSS selector
# -----------------------
pond.name <- html_nodes(x = link.scrap,
                        css = "#historic_report_location > ul > ul > li:nth-child(1)") %>%
  html_text()
pond.name
## [1] "Pond Name: ALLEGANY BROOK POND"

# This is the same as:
all.list.items[2]
## [1] "Pond Name: ALLEGANY BROOK POND"

# -----------------------
# Or, read a particular list item by using its XPath selector
# -----------------------
# Single quotes delimit the R string because the XPath itself contains
# double quotes.
pond.name <- html_nodes(x = link.scrap,
                        xpath = '//*[@id="historic_report_location"]/ul/ul/li[1]') %>%
  html_text()
pond.name
## [1] "Pond Name: ALLEGANY BROOK POND"

# -----------------------
# The non-piping (without the %>%) goes like:
# -----------------------
# Step 1: select the node(s); the result is an xml_nodeset, not text yet.
pond.name.node <- html_nodes(x = link.scrap,
                             xpath = '//*[@id="historic_report_location"]/ul/ul/li[1]')
pond.name.node
## {xml_nodeset (1)}
## [1] <li style="list-style-type:none">Pond Name: ALLEGANY BROOK POND</li>

# Step 2: extract the text content from the selected node(s).
pond.name <- html_text(pond.name.node)
pond.name
## [1] "Pond Name: ALLEGANY BROOK POND"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment