Last active
July 10, 2023 09:07
-
-
Save matt-dray/d4837f106bcee80ea39235b6465a7cac to your computer and use it in GitHub Desktop.
{officer} is an R package that lets you extract elements of a Word document, including tables, into a tidy dataframe. I've written a function to 're-rectangularise' an extracted Word table into an R dataframe.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Functions building on the {officer} package: | |
# https://davidgohel.github.io/officer/ | |
# You can read more about these functions in a blog post: | |
# https://www.rostrum.blog/2023/06/07/rectangular-officer/ | |
# There are other solutions. You can also try {docxtractr} by Bob Rudis | |
# (on CRAN), which doesn't depend on {officer}, or {officerExtras} by Eli | |
# Pousson (on GitHub). | |
rectangularise_tables <- function( | |
docx_summary, # output dataframe from docx_summary | |
assume_headers = TRUE, # assume headers in first row? | |
type_convert = TRUE # try to coerce columns to most likely data type? | |
) { | |
# Check inputs | |
is_data.frame <- inherits(docx_summary, "data.frame") | |
docx_summary_names <- c( | |
"doc_index", "content_type", "style_name", "text", "level", "num_id", | |
"row_id", "is_header", "cell_id", "col_span", "row_span" | |
) # column names we can expect in the output from docx_summary | |
is_docx_summary <- all(names(docx_summary) %in% docx_summary_names) | |
if (!is_data.frame | !is_docx_summary) { | |
stop( | |
paste( | |
"Argument 'docx_summary' must be a data.frame created with", | |
"'officer::docx_summary'." | |
), | |
call. = FALSE | |
) | |
} | |
# Get only the rows that relate to Word tables | |
docx_summary_tables <- | |
docx_summary[docx_summary[["content_type"]] %in% "table cell", ] | |
# Get the ID value for each Word table | |
doc_indices <- unique(docx_summary_tables[["doc_index"]]) | |
# Initiate an empty list to hold dataframe representations of the Word tables | |
tables_out <- vector(mode = "list", length = length(doc_indices)) | |
names(tables_out) <- paste0("doc_index_", doc_indices) | |
# For each Word table, 'rectangularise' into a dataframe and add to the list | |
for (doc_index in doc_indices) { | |
docx_summary_table <- | |
docx_summary_tables[docx_summary_tables[["doc_index"]] == doc_index, ] | |
extracted_table <- .rectangularise_table(docx_summary_table, assume_headers) | |
list_element_name <- paste0("doc_index_", doc_index) | |
tables_out[[list_element_name]] <- extracted_table | |
} | |
# Optionally convert columns to appropriate type (integer, etc) | |
if (type_convert) { | |
tables_out <- lapply(tables_out, type.convert, as.is = TRUE) | |
} | |
return(tables_out) | |
} | |
.rectangularise_table <- function( | |
table_cells, # docx_summary output filtered for 'table cells' only | |
assume_headers = TRUE # assume headers in first row? | |
) { | |
# Check inputs | |
is_table_cells <- all(table_cells[["content_type"]] == "table cell") | |
is_one_table <- length(unique(table_cells[["doc_index"]])) == 1 | |
if (!is_table_cells | !is_one_table) { | |
stop( | |
paste( | |
"Argument 'table_cells' must be a dataframe created with", | |
"'officer::docx_summary' where 'content_type' is filtered for", | |
"'table cell' only." | |
), | |
call. = FALSE | |
) | |
} | |
# Split each Word table into a list element, isolate headers and cell contents | |
cell_id_split <- split(table_cells, table_cells[["cell_id"]]) | |
headers <- lapply(cell_id_split, function(x) x[x[["is_header"]], "text"]) | |
content <- lapply(cell_id_split, function(x) x[!x[["is_header"]], "text"]) | |
table_out <- as.data.frame(content) | |
# Column headers are identified by TRUE in the is_header column, but may not | |
# be marked up as such. Use them as dataframe headers if they exist. | |
has_headers <- length(unlist(headers)) > 0 | |
if (has_headers) { | |
names(table_out) <- headers | |
} | |
# If headers are not identified by is_header, assume that the first row of the | |
# Word table contains the headers. The user can control this behaviour with | |
# the argument assume_headers. | |
if (!has_headers & assume_headers) { | |
headers <- table_out[1, ] # assume first row is headers | |
table_out <- table_out[2:nrow(table_out), ] # rest of table is content | |
names(table_out) <- headers | |
} | |
return(table_out) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment