# valentinitnelav/XML_read_tbl.R

## XML_read_tbl.R
# Read HTML <table> elements with the {XML} package
# ====================================================

# Load library
library(XML)

# =======================
# Read the web page [accessed 08-Apr-2017]
# =======================
# Address of the statistics page that holds the tables read below.
link <- "http://www.theplantlist.org/1.1/statistics/"
# Parse the page once and keep the result in `link.scrap`, which the steps
# below reuse.
# NOTE: it is good etiquette to store the page rather than re-reading it
# more often than necessary and overloading the site's server.
link.scrap <- htmlParse(file = link)

# =======================
# Read all tables from the page,
# then select desired table
# =======================
# Calling readHTMLTable() on the whole parsed page returns a list of tables;
# the first element is kept as `my.tbl.1`.
tbls.lst <- readHTMLTable(doc = link.scrap)
my.tbl.1 <- tbls.lst[[1L]]

# Printing it shows that an unwanted column (V1, a symbol) was read as well
my.tbl.1
##        V1         V2      V3    V4
##1 <U+25D5>   Accepted 350,699 33.0%
##2 <U+25D5>    Synonym 470,624 44.2%
##3 <U+25D5>   Unplaced     243  0.0%
##4 <U+25D5> Unassessed 242,469 22.8%

# The structure shows that every column, including the counts and the
# percentages, was read as a factor rather than as a number
str(my.tbl.1)
##'data.frame':	4 obs. of  4 variables:
## $ V1: Factor w/ 1 level "<U+25D5>""| __truncated__: 1 1 1 1
## $ V2: Factor w/ 4 levels "Accepted","Synonym",..: 1 2 4 3
## $ V3: Factor w/ 4 levels "242,469","243",..: 3 4 2 1
## $ V4: Factor w/ 4 levels "0.0%","22.8%",..: 3 4 1 2

# =======================
# Read specific table(s) from the page
# using directly the table's XPath selector
# =======================
# -----------------------
# Simple reading using XPath selector
# -----------------------
# Apply readHTMLTable() to the node(s) matched by the table's XPath and keep
# the first result; this gives the same table as `my.tbl.1` above.
my.tbl.2 <- xpathApply(
  link.scrap,
  '//*[@id="columns"]/section/div[1]/table',
  readHTMLTable
)[[1L]]

# -----------------------
# Adjusting for header, column classes & other tweaks
# -----------------------
# Same XPath as before, but with extra arguments handed on to readHTMLTable()
# so that the result has proper column names and column types.
my.tbl.3 <- xpathApply(
  doc  = link.scrap,
  path = '//*[@id="columns"]/section/div[1]/table',
  fun  = readHTMLTable,
  # keep text columns as character instead of factor
  stringsAsFactors = FALSE,
  trim             = TRUE,
  # NOTE(review): presumably drops the table's own header row, since all four
  # status rows still appear in the output below -- confirm against the page
  skip.rows        = 1L,
  # names for the three columns that are kept
  header           = c("Status", "Total", "Total_prc"),
  # one entry per original column: NULL for the first (symbol) column, which
  # is absent from the result; "FormattedInteger" and "Percent" give the
  # int / num columns shown by str() below
  colClasses       = list(NULL, "character", "FormattedInteger", "Percent")
)[[1L]]
my.tbl.3
##      Status  Total Total_prc
##1   Accepted 350699      33.0
##2    Synonym 470624      44.2
##3   Unplaced    243       0.0
##4 Unassessed 242469      22.8

# The column types are now as desired: character, integer and numeric
str(my.tbl.3)
##'data.frame':	4 obs. of  3 variables:
## $ Status   : chr  "Accepted" "Synonym" "Unplaced" "Unassessed"
## $ Total    : int  350699 470624 243 242469
## $ Total_prc: num  33 44.2 0 22.8

# -----------------------
# Select multiple tables via their XPath selectors
# -----------------------
# Pass a character vector of XPath selectors to read several tables in one
# call; the result is a list with one table per matched node.
tbls.lst.2 <- xpathApply(
  link.scrap,
  c('//*[@id="columns"]/section/div[1]/table',
    '//*[@id="columns"]/section/table[1]'),
  readHTMLTable,
  stringsAsFactors = FALSE
)
tbls.lst.2
##[[1]]
##        V1         V2      V3    V4
##1 <U+25D5>   Accepted 350,699 33.0%
##2 <U+25D5>    Synonym 470,624 44.2%
##3 <U+25D5>   Unplaced     243  0.0%
##4 <U+25D5> Unassessed 242,469 22.8%

##[[2]]
##  V1                V2      V3      V4  V5      V6      V7    V8
##1      High confidence 149,349 229,242   0       0 378,591 35.6%
##2    Medium confidence 193,013 214,107   0       0 407,120 38.3%
##3       Low confidence   8,337  27,275 243 242,469 278,324 26.2%
	# Read HTML <table> elements with the {XML} package
	# ====================================================

	# Load library
	library(XML)

	# =======================
	# Read the web page [accessed 08-Apr-2017]
	# =======================
	# Address of the statistics page that holds the tables read below.
	link <- "http://www.theplantlist.org/1.1/statistics/"
	# Parse the page once and keep the result in `link.scrap`, which the steps
	# below reuse.
	# NOTE: it is good etiquette to store the page rather than re-reading it
	# more often than necessary and overloading the site's server.
	link.scrap <- htmlParse(file = link)

	# =======================
	# Read all tables from the page,
	# then select desired table
	# =======================
	# Calling readHTMLTable() on the whole parsed page returns a list of tables;
	# the first element is kept as `my.tbl.1`.
	tbls.lst <- readHTMLTable(doc = link.scrap)
	my.tbl.1 <- tbls.lst[[1L]]

	# Printing it shows that an unwanted column (V1, a symbol) was read as well
	my.tbl.1
	## V1 V2 V3 V4
	##1 <U+25D5> Accepted 350,699 33.0%
	##2 <U+25D5> Synonym 470,624 44.2%
	##3 <U+25D5> Unplaced 243 0.0%
	##4 <U+25D5> Unassessed 242,469 22.8%

	# The structure shows that every column, including the counts and the
	# percentages, was read as a factor rather than as a number
	str(my.tbl.1)
	##'data.frame': 4 obs. of 4 variables:
	## $ V1: Factor w/ 1 level "<U+25D5>""\| __truncated__: 1 1 1 1
	## $ V2: Factor w/ 4 levels "Accepted","Synonym",..: 1 2 4 3
	## $ V3: Factor w/ 4 levels "242,469","243",..: 3 4 2 1
	## $ V4: Factor w/ 4 levels "0.0%","22.8%",..: 3 4 1 2

	# =======================
	# Read specific table(s) from the page
	# using directly the table's XPath selector
	# =======================
	# -----------------------
	# Simple reading using XPath selector
	# -----------------------
	# Apply readHTMLTable() to the node(s) matched by the table's XPath and keep
	# the first result; this gives the same table as `my.tbl.1` above.
	my.tbl.2 <- xpathApply(
	  link.scrap,
	  '//*[@id="columns"]/section/div[1]/table',
	  readHTMLTable
	)[[1L]]

	# -----------------------
	# Adjusting for header, column classes & other tweaks
	# -----------------------
	# Same XPath as before, but with extra arguments handed on to readHTMLTable()
	# so that the result has proper column names and column types.
	my.tbl.3 <- xpathApply(
	  doc  = link.scrap,
	  path = '//*[@id="columns"]/section/div[1]/table',
	  fun  = readHTMLTable,
	  # keep text columns as character instead of factor
	  stringsAsFactors = FALSE,
	  trim             = TRUE,
	  # NOTE(review): presumably drops the table's own header row, since all four
	  # status rows still appear in the output below -- confirm against the page
	  skip.rows        = 1L,
	  # names for the three columns that are kept
	  header           = c("Status", "Total", "Total_prc"),
	  # one entry per original column: NULL for the first (symbol) column, which
	  # is absent from the result; "FormattedInteger" and "Percent" give the
	  # int / num columns shown by str() below
	  colClasses       = list(NULL, "character", "FormattedInteger", "Percent")
	)[[1L]]
	my.tbl.3
	## Status Total Total_prc
	##1 Accepted 350699 33.0
	##2 Synonym 470624 44.2
	##3 Unplaced 243 0.0
	##4 Unassessed 242469 22.8

	# The column types are now as desired: character, integer and numeric
	str(my.tbl.3)
	##'data.frame': 4 obs. of 3 variables:
	## $ Status : chr "Accepted" "Synonym" "Unplaced" "Unassessed"
	## $ Total : int 350699 470624 243 242469
	## $ Total_prc: num 33 44.2 0 22.8

	# -----------------------
	# Select multiple tables via their XPath selectors
	# -----------------------
	# Pass a character vector of XPath selectors to read several tables in one
	# call; the result is a list with one table per matched node.
	tbls.lst.2 <- xpathApply(
	  link.scrap,
	  c('//*[@id="columns"]/section/div[1]/table',
	    '//*[@id="columns"]/section/table[1]'),
	  readHTMLTable,
	  stringsAsFactors = FALSE
	)
	tbls.lst.2
	##[[1]]
	## V1 V2 V3 V4
	##1 <U+25D5> Accepted 350,699 33.0%
	##2 <U+25D5> Synonym 470,624 44.2%
	##3 <U+25D5> Unplaced 243 0.0%
	##4 <U+25D5> Unassessed 242,469 22.8%

	##[[2]]
	## V1 V2 V3 V4 V5 V6 V7 V8
	##1 High confidence 149,349 229,242 0 0 378,591 35.6%
	##2 Medium confidence 193,013 214,107 0 0 407,120 38.3%
	##3 Low confidence 8,337 27,275 243 242,469 278,324 26.2%