Last active
January 9, 2022 09:22
-
-
Save michaelbach/673b705cc06223a8ae6de4c5a7dfa56f to your computer and use it in GitHub Desktop.
Generate a "subject items" directory ("Stichwortverzeichnis" in German) from a PDF
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Generate a "subject items" directory ("Stichwortverzeichnis" in German) from a PDF | |
# | |
# Given a PDF (e.g. of a book) and a manually generated list of subject items,
# this gist produces a table listing each item together with the page numbers on which it occurs.
# The table is then written to a DOCX file.
# | |
# Written in R <https://www.r-project.org/> | |
# | |
# MIT License | |
# Copyright © 2022 Michael Bach | |
# | |
# | |
# History | |
# ======= | |
# | |
# 2022-01-09 gisted it | |
# 2022-01-08 begun | |
###################################################################################### | |
# Global settings ----
# Number of physical PDF pages that precede printed page 1 (page numbering
# often begins a little later than the physical 1st page). It is subtracted
# from the physical page index to obtain the page number shown in the index.
kPreamblePages <- 6
# Physical (PDF) page indices with irrelevant content; their text is blanked
# before searching.
kPages2ignore <- c(1, 2, 5)
# Input PDF and output DOCX file names.
kPath2PDF <- "Bach_Sehphaenomene_03.pdf"
kNameOfOutputFile <- "Bach-Stichwortverzeichnis.docx"
# List of subject items, here just a short sample. Matching is
# case-insensitive because both the items and the page text are lower-cased.
items <- c(
  "lateral", "Farb", "Hemmung", "Bewegung", "Helligkeit", "Leuchtdichte",
  "Gestalt", "Winkel", "Größe", "Raum", "kognitiv", "Phänomen"
)
library("tidyverse")
library("pdftools")
library("stringr")
# Convert a vector of hit pages to a comma-delimited string,
# e.g. c(1, 5, 12) -> "1, 5, 12".
#
# Args:
#   hitPages:      vector of page numbers; may be empty.
#   specialLabels: named character vector mapping a page number (given as the
#                  element name, e.g. "-3") to the label printed in its place.
#                  The default reproduces the author's special case, where
#                  page -3 is printed as "III". Pass NULL or character(0) to
#                  disable any relabelling.
#
# Returns:
#   A single string with the pages separated by ", "; "" if hitPages is empty.
pagesArray2CommaString <- function(hitPages,
                                   specialLabels = c("-3" = "III")) {
  labels <- as.character(hitPages)
  # Look up each page in the relabelling table; NA means "print as is".
  idx <- match(labels, names(specialLabels))
  isSpecial <- !is.na(idx)
  labels[isSpecial] <- unname(specialLabels[idx[isSpecial]])
  # paste(collapse = ) joins without a trailing separator, so nothing has to
  # be trimmed afterwards; zero-length input yields "".
  paste(labels, collapse = ", ")
}
# Read in all PDF pages. pdf_text() yields one string per page, so the vector
# index is the physical page number; lower-casing makes the search
# case-insensitive.
allPages <- tolower(pdf_text(kPath2PDF))
# Blank the ignored pages instead of dropping them, so that the remaining
# pages keep their physical index.
allPages[kPages2ignore] <- ""

# Search all items and build the pages list.
# NOTE: each item is handed to stringr as a regular expression, so regex
# metacharacters in an item are interpreted rather than matched literally.
pagesPerItem <- vapply(items, function(item) {
  # str_detect() gives one TRUE/FALSE per page; which() turns that into
  # physical page indices, which are then shifted to printed page numbers.
  hitPages <- which(str_detect(allPages, tolower(item))) - kPreamblePages
  pagesArray2CommaString(hitPages)
}, character(1), USE.NAMES = FALSE)

# One row per item, sorted alphabetically by item.
itemsAndPages <- data.frame(items = items, pages = pagesPerItem)
itemsAndPages <- itemsAndPages[order(itemsAndPages$items), ]
# The data viewer is only meaningful in an interactive session.
if (interactive()) View(itemsAndPages)
# Convert the table to DOCX.
# See https://www.r-bloggers.com/2020/02/creating-ms-word-reports-using-the-officer-package/
library("officer")
# Start from an empty document: a title paragraph followed by a spacer.
stichwortverz_doc <- read_docx() |>
  body_add_par("Stichwortverzeichnis") |>
  body_add_par(" ")
# One paragraph per subject item: "<item>:<TAB><pages>".
# seq_len() rather than 1:nrow(), so an empty table adds no paragraphs
# (1:0 would iterate over c(1, 0)).
for (i in seq_len(nrow(itemsAndPages))) {
  stichwortverz_doc <- stichwortverz_doc |>
    body_add_par(paste0(itemsAndPages$items[i], ":\t", itemsAndPages$pages[i]))
}
# Create the output file, readable by Libre Office, Word, Pages etc.
print(stichwortverz_doc, target = kNameOfOutputFile)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment