Skip to content

Instantly share code, notes, and snippets.

@matt-dray
Last active July 10, 2023 09:07
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save matt-dray/d4837f106bcee80ea39235b6465a7cac to your computer and use it in GitHub Desktop.
Save matt-dray/d4837f106bcee80ea39235b6465a7cac to your computer and use it in GitHub Desktop.
{officer} is an R package that lets you extract elements of a Word document, including tables, into a tidy dataframe. I've written a function to 're-rectangularise' an extracted Word table into an R dataframe.
# Functions building on the {officer} package:
# https://davidgohel.github.io/officer/
# You can read more about these functions in a blog post:
# https://www.rostrum.blog/2023/06/07/rectangular-officer/
# There are other solutions. You can also try {docxtractr} by Bob Rudis
# (on CRAN), which doesn't depend on {officer}, or {officerExtras} by Eli
# Pousson (on GitHub).
rectangularise_tables <- function(
docx_summary, # output dataframe from docx_summary
assume_headers = TRUE, # assume headers in first row?
type_convert = TRUE # try to coerce columns to most likely data type?
) {
# Check inputs
is_data.frame <- inherits(docx_summary, "data.frame")
docx_summary_names <- c(
"doc_index", "content_type", "style_name", "text", "level", "num_id",
"row_id", "is_header", "cell_id", "col_span", "row_span"
) # column names we can expect in the output from docx_summary
is_docx_summary <- all(names(docx_summary) %in% docx_summary_names)
if (!is_data.frame | !is_docx_summary) {
stop(
paste(
"Argument 'docx_summary' must be a data.frame created with",
"'officer::docx_summary'."
),
call. = FALSE
)
}
# Get only the rows that relate to Word tables
docx_summary_tables <-
docx_summary[docx_summary[["content_type"]] %in% "table cell", ]
# Get the ID value for each Word table
doc_indices <- unique(docx_summary_tables[["doc_index"]])
# Initiate an empty list to hold dataframe representations of the Word tables
tables_out <- vector(mode = "list", length = length(doc_indices))
names(tables_out) <- paste0("doc_index_", doc_indices)
# For each Word table, 'rectangularise' into a dataframe and add to the list
for (doc_index in doc_indices) {
docx_summary_table <-
docx_summary_tables[docx_summary_tables[["doc_index"]] == doc_index, ]
extracted_table <- .rectangularise_table(docx_summary_table, assume_headers)
list_element_name <- paste0("doc_index_", doc_index)
tables_out[[list_element_name]] <- extracted_table
}
# Optionally convert columns to appropriate type (integer, etc)
if (type_convert) {
tables_out <- lapply(tables_out, type.convert, as.is = TRUE)
}
return(tables_out)
}
.rectangularise_table <- function(
table_cells, # docx_summary output filtered for 'table cells' only
assume_headers = TRUE # assume headers in first row?
) {
# Check inputs
is_table_cells <- all(table_cells[["content_type"]] == "table cell")
is_one_table <- length(unique(table_cells[["doc_index"]])) == 1
if (!is_table_cells | !is_one_table) {
stop(
paste(
"Argument 'table_cells' must be a dataframe created with",
"'officer::docx_summary' where 'content_type' is filtered for",
"'table cell' only."
),
call. = FALSE
)
}
# Split each Word table into a list element, isolate headers and cell contents
cell_id_split <- split(table_cells, table_cells[["cell_id"]])
headers <- lapply(cell_id_split, function(x) x[x[["is_header"]], "text"])
content <- lapply(cell_id_split, function(x) x[!x[["is_header"]], "text"])
table_out <- as.data.frame(content)
# Column headers are identified by TRUE in the is_header column, but may not
# be marked up as such. Use them as dataframe headers if they exist.
has_headers <- length(unlist(headers)) > 0
if (has_headers) {
names(table_out) <- headers
}
# If headers are not identified by is_header, assume that the first row of the
# Word table contains the headers. The user can control this behaviour with
# the argument assume_headers.
if (!has_headers & assume_headers) {
headers <- table_out[1, ] # assume first row is headers
table_out <- table_out[2:nrow(table_out), ] # rest of table is content
names(table_out) <- headers
}
return(table_out)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment