# valentinitnelav/XML_ReadListItems.R

## XML_ReadListItems.R
# Load library
library(XML)

# -----------------------
# Fetch and parse the web page
# -----------------------
# Page for ALLEGANY BROOK POND (query parameter alscpond = 020225B).
link <- "http://www.adirondacklakessurvey.org/alscrpt.inc.php?alscpond=020225B&pname=ALLEGANY%20BROOK%20POND"
# NOTE: it is good etiquette to parse the page once and keep working with the
# stored result, rather than fetching it repeatedly and overloading their
# server.
link.scrap <- htmlParse(file = link)

# -----------------------
# Read all list items from the page
# -----------------------
# Extract the text of every <li> node of the parsed page.
# vapply() is used instead of sapply() so that the result is always a
# character vector: sapply() would return an empty list when no <li> node is
# found, and a list whenever a node yields a zero-length value. Such a
# zero-length value is recorded as NA here.
all.list.items <- vapply(
  getNodeSet(doc = link.scrap, path = "//li"),
  function(node) {
    txt <- xmlValue(node)
    if (length(txt) == 0L) NA_character_ else txt
  },
  character(1)
)
# Show the first five items; unlike `[1:5]`, head() does not pad the output
# with NA when fewer than five items were found.
head(all.list.items, n = 5L)
## [1] "Location/General"
## [2] "Pond Name: ALLEGANY BROOK POND"
## [3] "Pond #: 020225B"
## [4] "Town: Black Brook"
## [5] "County: Clinton"

# -----------------------
# Read a particular list item by using its XPath selector
# -----------------------
# getNodeSet() returns the matching nodes as a list of class "XMLNodeSet".
pond.name.node <- getNodeSet(doc  = link.scrap,
                             path = '//*[@id="historic_report_location"]/ul/ul/li[1]')
pond.name.node
## [[1]]
## <li style="list-style-type:none">Pond Name: ALLEGANY BROOK POND</li>
## attr(,"class")
## [1] "XMLNodeSet"

# Fail with a clear message when the selector matches nothing (e.g. the page
# layout changed), instead of the opaque "subscript out of bounds" error that
# `[[1]]` would raise on an empty node set.
if (length(pond.name.node) == 0L) {
  stop("XPath selector for the pond name matched no node", call. = FALSE)
}
# Extract the text of the first (and here only) matching node
pond.name <- xmlValue(pond.name.node[[1]])
pond.name
## [1] "Pond Name: ALLEGANY BROOK POND"

# For this page, this is the same as the second of all list items:
all.list.items[2]
## [1] "Pond Name: ALLEGANY BROOK POND"

# -----------------------
# Select multiple list items via their XPath selectors
# -----------------------
# xpathApply() is given a vector of XPath selectors as its `path` argument and
# calls xmlValue() on every matching node (`trim = TRUE` is handed on to
# xmlValue()). This is an alternative to the two-step getNodeSet() +
# xmlValue() approach used above.
items.lst <- xpathApply(
  link.scrap,
  c('//*[@id="historic_report_location"]/ul/ul/li[1]',
    '//*[@id="historic_report_location"]/ul/ul/li[3]'),
  xmlValue,
  trim = TRUE
)
items.lst
## [[1]]
##  [1] "Pond Name: ALLEGANY BROOK POND"
## [[2]]
##  [1] "Town: Black Brook"

# Flatten the list into a character vector
items.chr.vect <- unlist(items.lst)
items.chr.vect
## [1] "Pond Name: ALLEGANY BROOK POND" "Town: Black Brook"

# Alternatively, build a one-row data.frame with one column per item
# (the space in "Pond Name" shows up as a dot in the printed column name)
items.df <- as.data.frame(items.lst,
                          stringsAsFactors = FALSE,
                          col.names = c("Pond Name", "Town"))
items.df
##                        Pond.Name              Town
## 1 Pond Name: ALLEGANY BROOK POND Town: Black Brook
	# NOTE(review): from here on the file repeats the script above, so the page
	# is fetched a second time -- confirm whether this copy is intended.

	# Attach the XML package
	library(XML)

	# -----------------------
	# Fetch and parse the web page
	# -----------------------
	# Page for ALLEGANY BROOK POND (query parameter alscpond = 020225B).
	link <- "http://www.adirondacklakessurvey.org/alscrpt.inc.php?alscpond=020225B&pname=ALLEGANY%20BROOK%20POND"
	# NOTE: it is good etiquette to parse the page once and keep working with the
	# stored result, rather than fetching it repeatedly and overloading their
	# server.
	link.scrap <- htmlParse(file = link)

	# -----------------------
	# Read all list items from the page
	# -----------------------
	# Extract the text of every <li> node of the parsed page.
	# vapply() is used instead of sapply() so that the result is always a
	# character vector: sapply() would return an empty list when no <li> node is
	# found, and a list whenever a node yields a zero-length value. Such a
	# zero-length value is recorded as NA here.
	all.list.items <- vapply(
	  getNodeSet(doc = link.scrap, path = "//li"),
	  function(node) {
	    txt <- xmlValue(node)
	    if (length(txt) == 0L) NA_character_ else txt
	  },
	  character(1)
	)
	# Show the first five items; unlike `[1:5]`, head() does not pad the output
	# with NA when fewer than five items were found.
	head(all.list.items, n = 5L)
	## [1] "Location/General"
	## [2] "Pond Name: ALLEGANY BROOK POND"
	## [3] "Pond #: 020225B"
	## [4] "Town: Black Brook"
	## [5] "County: Clinton"

	# -----------------------
	# Read a particular list item by using its XPath selector
	# -----------------------
	# getNodeSet() returns the matching nodes as a list of class "XMLNodeSet".
	pond.name.node <- getNodeSet(doc  = link.scrap,
	                             path = '//*[@id="historic_report_location"]/ul/ul/li[1]')
	pond.name.node
	## [[1]]
	## <li style="list-style-type:none">Pond Name: ALLEGANY BROOK POND</li>
	## attr(,"class")
	## [1] "XMLNodeSet"

	# Fail with a clear message when the selector matches nothing (e.g. the page
	# layout changed), instead of the opaque "subscript out of bounds" error that
	# `[[1]]` would raise on an empty node set.
	if (length(pond.name.node) == 0L) {
	  stop("XPath selector for the pond name matched no node", call. = FALSE)
	}
	# Extract the text of the first (and here only) matching node
	pond.name <- xmlValue(pond.name.node[[1]])
	pond.name
	## [1] "Pond Name: ALLEGANY BROOK POND"

	# For this page, this is the same as the second of all list items:
	all.list.items[2]
	## [1] "Pond Name: ALLEGANY BROOK POND"

	# -----------------------
	# Select multiple list items via their XPath selectors
	# -----------------------
	# xpathApply() is given a vector of XPath selectors as its `path` argument and
	# calls xmlValue() on every matching node (`trim = TRUE` is handed on to
	# xmlValue()). This is an alternative to the two-step getNodeSet() +
	# xmlValue() approach used above.
	items.lst <- xpathApply(
	  link.scrap,
	  c('//*[@id="historic_report_location"]/ul/ul/li[1]',
	    '//*[@id="historic_report_location"]/ul/ul/li[3]'),
	  xmlValue,
	  trim = TRUE
	)
	items.lst
	## [[1]]
	##  [1] "Pond Name: ALLEGANY BROOK POND"
	## [[2]]
	##  [1] "Town: Black Brook"

	# Flatten the list into a character vector
	items.chr.vect <- unlist(items.lst)
	items.chr.vect
	## [1] "Pond Name: ALLEGANY BROOK POND" "Town: Black Brook"

	# Alternatively, build a one-row data.frame with one column per item
	# (the space in "Pond Name" shows up as a dot in the printed column name)
	items.df <- as.data.frame(items.lst,
	                          stringsAsFactors = FALSE,
	                          col.names = c("Pond Name", "Town"))
	items.df
	##                        Pond.Name              Town
	## 1 Pond Name: ALLEGANY BROOK POND Town: Black Brook