nacnudus/xml_to_df.R

## xml_to_df.R
``` r
library(xml2)
library(dplyr)
library(purrr)
library(stringr)

# From the root node:
# If has_children, then recurse.
# Otherwise, attributes, value and children (nested) to data frame.

xml_to_df <- function(doc, ns = xml_ns(doc)) {
  node_to_df <- function(node) {
    # Filter the attributes for ones that aren't namespaces
    # x <- list(.index = 0, .name = xml_name(node, ns))
    x <- list(.name = xml_name(node, ns))
    # Attributes as column headers, and their values in the first row
    attrs <- xml_attrs(node)
    if (length(attrs) > 0) {attrs <- attrs[!grepl("xmlns", names(attrs))]}
    if (length(attrs) > 0) {x <- c(x, attrs)}
    # Build data frame manually, to avoid as.data.frame's good intentions
    children <- xml_children(node)
    if (length(children) >= 1) {
      x <-
        children %>%
        # Recurse here
        map(node_to_df) %>%
        split_by(".name") %>%
        map(bind_rows) %>%
        map(list) %>%
        {c(x, .)}
      attr(x, "row.names") <- 1L
      class(x) <- c("tbl_df", "data.frame")
    } else {
      x$.value <- xml_text(node)
    }
    x
  }
  node_to_df(doc)
}


xml <- "
<messages>
  <note id=\"501\">
    <to>Tove</to>
    <from>Jani</from>
    <heading>Reminder</heading>
    <body>Don't forget me this weekend!</body>
  </note>
  <note id=\"502\">
    <to>Jani</to>
    <from>Tove</from>
    <heading>Re: Reminder</heading>
    <body>I will not</body>
  </note>
</messages>"

xml_df <- xml_to_df(read_xml(xml))
xml_df
#> Source: local data frame [1 x 2]
#>
#>      .name           note
#>      (chr)          (chr)
#> 1 messages <tbl_df [2,6]>

xml_df$note
#> [[1]]
#> Source: local data frame [2 x 6]
#>
#>   .name    id           body           from        heading             to
#>   (chr) (chr)          (chr)          (chr)          (chr)          (chr)
#> 1  note   501 <tbl_df [1,2]> <tbl_df [1,2]> <tbl_df [1,2]> <tbl_df [1,2]>
#> 2  note   502 <tbl_df [1,2]> <tbl_df [1,2]> <tbl_df [1,2]> <tbl_df [1,2]>

tidyr::unnest(xml_df$note[[1]], body)
#> Source: local data frame [2 x 7]
#>
#>   .name    id           from        heading             to .name
#>   (chr) (chr)          (chr)          (chr)          (chr) (chr)
#> 1  note   501 <tbl_df [1,2]> <tbl_df [1,2]> <tbl_df [1,2]>  body
#> 2  note   502 <tbl_df [1,2]> <tbl_df [1,2]> <tbl_df [1,2]>  body
#> Variables not shown: .value (chr)

x <- read_xml('
 <root>
   <doc1 xmlns = "http://foo.com"><baz /></doc1>
   <doc2 xmlns = "http://bar.com"><baz /></doc2>
 </root>
')
xml_to_df(x)
#> Source: local data frame [1 x 3]
#>
#>   .name        d1:doc1        d2:doc2
#>   (chr)          (chr)          (chr)
#> 1  root <tbl_df [1,2]> <tbl_df [1,2]>

# jennybc's linked code to import xml from Google sheets
library(googlesheets)
library(httr)
pts_ws_feed <- "https://spreadsheets.google.com/feeds/worksheets/1amnxLg9VVDoE6KSIZvutYkEGNgQyJSnLJgHthehruy8/public/full"
(ss <- gs_ws_feed(pts_ws_feed))
#>                   Spreadsheet title: test-gs-public-testing-sheet
#>                  Spreadsheet author: rpackagetest
#>   Date of googlesheets registration: 2016-06-03 00:02:56 GMT
#>     Date of last spreadsheet update: 2016-03-11 05:37:48 GMT
#>                          visibility: public
#>                         permissions: rw
#>                             version: new
#>
#> Contains 8 worksheets:
#> (Title): (Nominal worksheet extent as rows x columns)
#> embedded_empty_cells: 8 x 7
#> special_chars: 1000 x 26
#> diabolical_column_names: 4 x 8
#> shipwrecks: 1000 x 26
#> for_resizing: 1799 x 30
#> for_updating: 1000 x 26
#> empty: 1000 x 26
#> colnames_only: 1000 x 26
#>
#> Key: 1amnxLg9VVDoE6KSIZvutYkEGNgQyJSnLJgHthehruy8
#> Browser URL: https://docs.google.com/spreadsheets/d/1amnxLg9VVDoE6KSIZvutYkEGNgQyJSnLJgHthehruy8/
col_names <- TRUE
ws <- "embedded_empty_cells"
index <- match(ws, ss$ws$ws_title)
the_url <- ss$ws$listfeed[index]
req <- GET(the_url)
rc <- read_xml(content(req, as = "text", encoding = "UTF-8"))
ns <- xml_ns_rename(xml_ns(rc), d1 = "feed")

# Parsing jennybc's xml with xml_to_df
x <- xml_to_df(rc, ns)
x$`feed:entry`[[1]] %>%
  select(starts_with("gsx:")) %>%
  map(bind_rows, .id = "row") %>%
  bind_rows
#> Source: local data frame [23 x 3]
#>
#>      row         .name      .value
#>    (chr)         (chr)       (chr)
#> 1      1    gsx:_chk2m    Americas
#> 2      3    gsx:_chk2m    Americas
#> 3      4    gsx:_chk2m    Americas
#> 4      1   gsx:country   Argentina
#> 5      2   gsx:country   Argentina
#> 6      3   gsx:country
#> 7      4   gsx:country   Argentina
#> 8      1 gsx:gdppercap 5911.315053
#> 9      2 gsx:gdppercap 6856.856212
#> 10     3 gsx:gdppercap 7133.166023
#> ..   ...           ...         ...
```
	``` r
	library(xml2)
	library(dplyr)
	library(purrr)
	library(stringr)

	# From the root node:
	# If has_children, then recurse.
	# Otherwise, attributes, value and children (nested) to data frame.

	xml_to_df <- function(doc, ns = xml_ns(doc)) {
	node_to_df <- function(node) {
	# Filter the attributes for ones that aren't namespaces
	# x <- list(.index = 0, .name = xml_name(node, ns))
	x <- list(.name = xml_name(node, ns))
	# Attributes as column headers, and their values in the first row
	attrs <- xml_attrs(node)
	if (length(attrs) > 0) {attrs <- attrs[!grepl("xmlns", names(attrs))]}
	if (length(attrs) > 0) {x <- c(x, attrs)}
	# Build data frame manually, to avoid as.data.frame's good intentions
	children <- xml_children(node)
	if (length(children) >= 1) {
	x <-
	children %>%
	# Recurse here
	map(node_to_df) %>%
	split_by(".name") %>%
	map(bind_rows) %>%
	map(list) %>%
	{c(x, .)}
	attr(x, "row.names") <- 1L
	class(x) <- c("tbl_df", "data.frame")
	} else {
	x$.value <- xml_text(node)
	}
	x
	}
	node_to_df(doc)
	}


	xml <- "
	<messages>
	<note id=\"501\">
	<to>Tove</to>
	<from>Jani</from>
	<heading>Reminder</heading>
	<body>Don't forget me this weekend!</body>
	</note>
	<note id=\"502\">
	<to>Jani</to>
	<from>Tove</from>
	<heading>Re: Reminder</heading>
	<body>I will not</body>
	</note>
	</messages>"

	xml_df <- xml_to_df(read_xml(xml))
	xml_df
	#> Source: local data frame [1 x 2]
	#>
	#> .name note
	#> (chr) (chr)
	#> 1 messages <tbl_df [2,6]>

	xml_df$note
	#> [[1]]
	#> Source: local data frame [2 x 6]
	#>
	#> .name id body from heading to
	#> (chr) (chr) (chr) (chr) (chr) (chr)
	#> 1 note 501 <tbl_df [1,2]> <tbl_df [1,2]> <tbl_df [1,2]> <tbl_df [1,2]>
	#> 2 note 502 <tbl_df [1,2]> <tbl_df [1,2]> <tbl_df [1,2]> <tbl_df [1,2]>

	tidyr::unnest(xml_df$note[[1]], body)
	#> Source: local data frame [2 x 7]
	#>
	#> .name id from heading to .name
	#> (chr) (chr) (chr) (chr) (chr) (chr)
	#> 1 note 501 <tbl_df [1,2]> <tbl_df [1,2]> <tbl_df [1,2]> body
	#> 2 note 502 <tbl_df [1,2]> <tbl_df [1,2]> <tbl_df [1,2]> body
	#> Variables not shown: .value (chr)

	x <- read_xml('
	<root>
	<doc1 xmlns = "http://foo.com"><baz /></doc1>
	<doc2 xmlns = "http://bar.com"><baz /></doc2>
	</root>
	')
	xml_to_df(x)
	#> Source: local data frame [1 x 3]
	#>
	#> .name d1:doc1 d2:doc2
	#> (chr) (chr) (chr)
	#> 1 root <tbl_df [1,2]> <tbl_df [1,2]>

	# jennybc's linked code to import xml from Google sheets
	library(googlesheets)
	library(httr)
	pts_ws_feed <- "https://spreadsheets.google.com/feeds/worksheets/1amnxLg9VVDoE6KSIZvutYkEGNgQyJSnLJgHthehruy8/public/full"
	(ss <- gs_ws_feed(pts_ws_feed))
	#> Spreadsheet title: test-gs-public-testing-sheet
	#> Spreadsheet author: rpackagetest
	#> Date of googlesheets registration: 2016-06-03 00:02:56 GMT
	#> Date of last spreadsheet update: 2016-03-11 05:37:48 GMT
	#> visibility: public
	#> permissions: rw
	#> version: new
	#>
	#> Contains 8 worksheets:
	#> (Title): (Nominal worksheet extent as rows x columns)
	#> embedded_empty_cells: 8 x 7
	#> special_chars: 1000 x 26
	#> diabolical_column_names: 4 x 8
	#> shipwrecks: 1000 x 26
	#> for_resizing: 1799 x 30
	#> for_updating: 1000 x 26
	#> empty: 1000 x 26
	#> colnames_only: 1000 x 26
	#>
	#> Key: 1amnxLg9VVDoE6KSIZvutYkEGNgQyJSnLJgHthehruy8
	#> Browser URL: https://docs.google.com/spreadsheets/d/1amnxLg9VVDoE6KSIZvutYkEGNgQyJSnLJgHthehruy8/
	col_names <- TRUE
	ws <- "embedded_empty_cells"
	index <- match(ws, ss$ws$ws_title)
	the_url <- ss$ws$listfeed[index]
	req <- GET(the_url)
	rc <- read_xml(content(req, as = "text", encoding = "UTF-8"))
	ns <- xml_ns_rename(xml_ns(rc), d1 = "feed")

	# Parsing jennybc's xml with xml_to_df
	x <- xml_to_df(rc, ns)
	x$`feed:entry`[[1]] %>%
	select(starts_with("gsx:")) %>%
	map(bind_rows, .id = "row") %>%
	bind_rows
	#> Source: local data frame [23 x 3]
	#>
	#> row .name .value
	#> (chr) (chr) (chr)
	#> 1 1 gsx:_chk2m Americas
	#> 2 3 gsx:_chk2m Americas
	#> 3 4 gsx:_chk2m Americas
	#> 4 1 gsx:country Argentina
	#> 5 2 gsx:country Argentina
	#> 6 3 gsx:country
	#> 7 4 gsx:country Argentina
	#> 8 1 gsx:gdppercap 5911.315053
	#> 9 2 gsx:gdppercap 6856.856212
	#> 10 3 gsx:gdppercap 7133.166023
	#> .. ... ... ...
	```