Last active
November 5, 2024 09:44
-
-
Save matt-dray/d873ef08a674e542946b9dcdcd03f76e to your computer and use it in GitHub Desktop.
Basic use of the {officer} R package to extract a table from a Word document
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Basic use of the {officer} package to scrape a table from a Word doc
# https://davidgohel.github.io/officer/
# I've now written a couple of functions to do this task:
# https://gist.github.com/matt-dray/d4837f106bcee80ea39235b6465a7cac
# You can read more about those in a blog post:
# https://www.rostrum.blog/2023/06/07/rectangular-officer/
# There are other solutions. You can also try {docxtractr} by Bob Rudis
# (on CRAN), which doesn't depend on {officer}, or {officerExtras} by Eli
# Pousson (on GitHub).
# Attach package
library(officer)

# Create a test docx file: a heading, a paragraph and a table, followed by a
# second paragraph and a second table
doc_test <- read_docx() |>
  body_add_par("Hello world!", style = "heading 1") |>
  body_add_par("Below is a table", style = "Normal") |>
  body_add_table(mtcars[1:3, 1:5]) |>
  body_add_par("Below is another table", style = "Normal") |>
  body_add_table(airquality[1:3, 1:5])

# Save docx to temp location
temp_docx <- tempfile(fileext = ".docx")
print(doc_test, target = temp_docx)

# Read the file back from the exact path we just wrote. Searching tempdir()
# for any '.docx' file can return several paths (e.g. if this script is run
# twice in one session), but read_docx() needs a single file.
doc_path <- temp_docx
doc_in <- read_docx(doc_path)

# Get the content of the document as a dataframe (one row per paragraph or
# table cell)
content <- docx_summary(doc_in)

# Filter for a specific table. In this test document the first table is the
# third body element, after the heading and the first paragraph.
table_cells <- subset(content, content_type %in% "table cell" & doc_index == 3)
stopifnot("No table cells found at this doc_index" = nrow(table_cells) > 0)

# The content of the table is in 'long' format, but we can 're-rectangularise'
is_header <- table_cells$is_header
table_names <- table_cells[is_header, "text"]
table_content <- table_cells[!is_header, "text"]
stopifnot("Table has no header cells" = length(table_names) > 0)

# One header cell per column, so body cells / header cells = body rows
row_count <- length(table_content) / length(table_names)

# matrix() fills column by column, which matches the order of the cells in
# this example (see the expected output below)
table_mat <- as.data.frame(matrix(table_content, nrow = row_count))
names(table_mat) <- table_names

table_mat
##    mpg cyl disp  hp drat
## 1 21.0   6  160 110 3.90
## 2 21.0   6  160 110 3.90
## 3 22.8   4  108  93 3.85
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment