Last active
June 9, 2023 16:31
-
-
Save matt-dray/e25101dbab70009fad111857f8172ba6 to your computer and use it in GitHub Desktop.
Extract tables from Word files that are in different subfolders, then combine them.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Extract and combine tables from multiple Word files | |
# This script creates some dummy docx files in temporary subfolders to mimic a | |
# user's filesystem. It then uses docxtractr::read_docx() to extract all the | |
# tables, and combines them with rbind(). | |
# A follow-up to my blogpost: | |
# https://www.rostrum.blog/2023/06/07/rectangular-officer/ | |
# Attach packages (all are available from CRAN) | |
library(docxtractr) # to extract tables from docx files | |
library(officer) # to create dummy docx files | |
library(charlatan) # to generate fake data | |
# Create multiple dummy docx files in separate temporary folders

my_folder <- tempdir()  # temporary location to store the files
n_files <- 5  # the number of dummy files to generate
n_fake <- 10  # number of fake data items to generate for each file

# seq_len() rather than seq(): seq(0) would yield c(1, 0), whereas seq_len(0)
# is empty, so the loop is skipped cleanly if n_files is ever set to 0
for (i in seq_len(n_files)) {

  # Create the subfolder. showWarnings = FALSE keeps re-runs in the same R
  # session quiet, since the subfolder will already exist inside tempdir().
  subfolder_name <- paste0("subfolder_", i)
  dir.create(file.path(my_folder, subfolder_name), showWarnings = FALSE)

  # Create dummy dataframe with fake names, jobs and phone numbers
  temp_df <- data.frame(
    name = ch_name(n_fake),
    job = ch_job(n_fake),
    phone = ch_phone_number(n_fake)
  )

  # Add dummy dataframe to a new, empty docx file and save it to the subfolder
  path <- file.path(my_folder, subfolder_name, paste0("df_", i, ".docx"))
  officer::read_docx() |>
    body_add_table(temp_df) |>
    print(target = path)

}
# Get the file paths to all the docx files found anywhere below my_folder
docx_paths <- list.files(
  my_folder,
  # 'pattern' is a regular expression: the dot must be escaped to match a
  # literal "." (an unescaped "." matches any character, e.g. "notes_docx")
  pattern = "\\.docx$",
  full.names = TRUE,  # return full filepaths
  recursive = TRUE  # look in all subfolders
)
# Extract the tables from every docx file. docx_extract_all_tbls() returns a
# list of tables for each file, so this gives one list of tables per file.
# (Not tested: I think that read_docx() will also read .doc files, but only if
# you have LibreOffice installed.)
tables_by_file <- lapply(
  docx_paths,
  function(path) docxtractr::read_docx(path) |> docx_extract_all_tbls()
)
names(tables_by_file) <- basename(docx_paths)

# Flatten by one level into a single list of dataframes, named after the file
# each table came from. This keeps every table when a file holds more than one
# and simply contributes nothing for a file with no tables. as.list() turns
# the NULL that unlist() returns for empty input back into an empty list.
extracted_tables <- as.list(unlist(tables_by_file, recursive = FALSE))
# Row-bind all of the extracted dataframes into a single dataframe. A plain
# rbind() is enough for this simple demo because every table has the same
# column names and types.
do.call(what = rbind, args = extracted_tables)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment