matt-dray/rectangularise-tables.R

## rectangularise-tables.R
# Functions building on the {officer} package:
#   https://davidgohel.github.io/officer/

# You can read more about these functions in a blog post:
#   https://www.rostrum.blog/2023/06/07/rectangular-officer/

# There are other solutions. You can also try {docxtractr} by Bob Rudis
# (on CRAN), which doesn't depend on {officer}, or {officerExtras} by Eli
# Pousson (on GitHub).

rectangularise_tables <- function(
    docx_summary,  # output dataframe from docx_summary
    assume_headers = TRUE,  # assume headers in first row?
    type_convert = TRUE  # try to coerce columns to most likely data type?
) {

  # Check inputs

  is_data.frame <- inherits(docx_summary, "data.frame")

  docx_summary_names <- c(
    "doc_index", "content_type", "style_name", "text", "level", "num_id",
    "row_id", "is_header", "cell_id", "col_span", "row_span"
  )  # column names we can expect in the output from docx_summary

  is_docx_summary <- all(names(docx_summary) %in% docx_summary_names)

  if (!is_data.frame | !is_docx_summary) {
    stop(
      paste(
        "Argument 'docx_summary' must be a data.frame created with",
        "'officer::docx_summary'."
      ),
      call. = FALSE
    )
  }

  # Get only the rows that relate to Word tables
  docx_summary_tables <-
    docx_summary[docx_summary[["content_type"]] %in% "table cell", ]

  # Get the ID value for each Word table
  doc_indices <- unique(docx_summary_tables[["doc_index"]])

  # Initiate an empty list to hold dataframe representations of the Word tables
  tables_out <- vector(mode = "list", length = length(doc_indices))
  names(tables_out) <- paste0("doc_index_", doc_indices)

  # For each Word table, 'rectangularise' into a dataframe and add to the list
  for (doc_index in doc_indices) {

    docx_summary_table <-
      docx_summary_tables[docx_summary_tables[["doc_index"]] == doc_index, ]

    extracted_table <- .rectangularise_table(docx_summary_table, assume_headers)

    list_element_name <- paste0("doc_index_", doc_index)
    tables_out[[list_element_name]] <- extracted_table

  }

  # Optionally convert columns to appropriate type (integer, etc)
  if (type_convert) {
    tables_out <- lapply(tables_out, type.convert, as.is = TRUE)
  }

  return(tables_out)

}

.rectangularise_table <- function(
    table_cells,  # docx_summary output filtered for 'table cells' only
    assume_headers = TRUE  # assume headers in first row?
) {

  # Check inputs

  is_table_cells <- all(table_cells[["content_type"]] == "table cell")
  is_one_table <- length(unique(table_cells[["doc_index"]])) == 1

  if (!is_table_cells | !is_one_table) {
    stop(
      paste(
        "Argument 'table_cells' must be a dataframe created with",
        "'officer::docx_summary' where 'content_type' is filtered for",
        "'table cell' only."
      ),
      call. = FALSE
    )
  }

  # Split each Word table into a list element, isolate headers and cell contents
  cell_id_split <- split(table_cells, table_cells[["cell_id"]])
  headers <- lapply(cell_id_split, function(x) x[x[["is_header"]], "text"])
  content <- lapply(cell_id_split, function(x) x[!x[["is_header"]], "text"])
  table_out <- as.data.frame(content)

  # Column headers are identified by TRUE in the is_header column, but may not
  # be marked up as such. Use them as dataframe headers if they exist.
  has_headers <- length(unlist(headers)) > 0
  if (has_headers) {
    names(table_out) <- headers
  }

  # If headers are not identified by is_header, assume that the first row of the
  # Word table contains the headers. The user can control this behaviour with
  # the argument assume_headers.
  if (!has_headers & assume_headers) {
    headers <- table_out[1, ]  # assume first row is headers
    table_out <- table_out[2:nrow(table_out), ]  # rest of table is content
    names(table_out) <- headers
  }

  return(table_out)

}
	# Functions building on the {officer} package:
	# https://davidgohel.github.io/officer/

	# You can read more about these functions in a blog post:
	# https://www.rostrum.blog/2023/06/07/rectangular-officer/

	# There are other solutions. You can also try {docxtractr} by Bob Rudis
	# (on CRAN), which doesn't depend on {officer}, or {officerExtras} by Eli
	# Pousson (on GitHub).

	rectangularise_tables <- function(
	docx_summary, # output dataframe from docx_summary
	assume_headers = TRUE, # assume headers in first row?
	type_convert = TRUE # try to coerce columns to most likely data type?
	) {

	# Check inputs

	is_data.frame <- inherits(docx_summary, "data.frame")

	docx_summary_names <- c(
	"doc_index", "content_type", "style_name", "text", "level", "num_id",
	"row_id", "is_header", "cell_id", "col_span", "row_span"
	) # column names we can expect in the output from docx_summary

	is_docx_summary <- all(names(docx_summary) %in% docx_summary_names)

	if (!is_data.frame \| !is_docx_summary) {
	stop(
	paste(
	"Argument 'docx_summary' must be a data.frame created with",
	"'officer::docx_summary'."
	),
	call. = FALSE
	)
	}

	# Get only the rows that relate to Word tables
	docx_summary_tables <-
	docx_summary[docx_summary[["content_type"]] %in% "table cell", ]

	# Get the ID value for each Word table
	doc_indices <- unique(docx_summary_tables[["doc_index"]])

	# Initiate an empty list to hold dataframe representations of the Word tables
	tables_out <- vector(mode = "list", length = length(doc_indices))
	names(tables_out) <- paste0("doc_index_", doc_indices)

	# For each Word table, 'rectangularise' into a dataframe and add to the list
	for (doc_index in doc_indices) {

	docx_summary_table <-
	docx_summary_tables[docx_summary_tables[["doc_index"]] == doc_index, ]

	extracted_table <- .rectangularise_table(docx_summary_table, assume_headers)

	list_element_name <- paste0("doc_index_", doc_index)
	tables_out[[list_element_name]] <- extracted_table

	}

	# Optionally convert columns to appropriate type (integer, etc)
	if (type_convert) {
	tables_out <- lapply(tables_out, type.convert, as.is = TRUE)
	}

	return(tables_out)

	}

	.rectangularise_table <- function(
	table_cells, # docx_summary output filtered for 'table cells' only
	assume_headers = TRUE # assume headers in first row?
	) {

	# Check inputs

	is_table_cells <- all(table_cells[["content_type"]] == "table cell")
	is_one_table <- length(unique(table_cells[["doc_index"]])) == 1

	if (!is_table_cells \| !is_one_table) {
	stop(
	paste(
	"Argument 'table_cells' must be a dataframe created with",
	"'officer::docx_summary' where 'content_type' is filtered for",
	"'table cell' only."
	),
	call. = FALSE
	)
	}

	# Split each Word table into a list element, isolate headers and cell contents
	cell_id_split <- split(table_cells, table_cells[["cell_id"]])
	headers <- lapply(cell_id_split, function(x) x[x[["is_header"]], "text"])
	content <- lapply(cell_id_split, function(x) x[!x[["is_header"]], "text"])
	table_out <- as.data.frame(content)

	# Column headers are identified by TRUE in the is_header column, but may not
	# be marked up as such. Use them as dataframe headers if they exist.
	has_headers <- length(unlist(headers)) > 0
	if (has_headers) {
	names(table_out) <- headers
	}

	# If headers are not identified by is_header, assume that the first row of the
	# Word table contains the headers. The user can control this behaviour with
	# the argument assume_headers.
	if (!has_headers & assume_headers) {
	headers <- table_out[1, ] # assume first row is headers
	table_out <- table_out[2:nrow(table_out), ] # rest of table is content
	names(table_out) <- headers
	}

	return(table_out)

	}