# Last active January 9, 2022 09:22
# Generate a "subject items" directory ("Stichwortverzeichnis" in German) from a PDF
# Generate a "subject items" directory ("Stichwortverzeichnis" in German) from a PDF
# Given a PDF (e.g. of a book) and a manually generated list of subject items,
# this gist will produce a table containing each item and the page number where it occurs
# This table is then written as a DOCX file.
# Written in R <https://www.r-project.org/>
# MIT License
# Copyright © 2022 Michael Bach
# History
# =======
# 2022-01-09 gisted it
# 2022-01-08 begun
# Global settings ----
# Page numbering often begins a little later than the physical first page.
kPreamblePages <- 6
# Some pages contain irrelevant content and are ignored.
kPages2ignore <- c(1, 2, 5)
kPath2PDF <- "Bach_Sehphaenomene_03.pdf"
kNameOfOutputFile <- "Bach-Stichwortverzeichnis.docx"

# List of subject items; here just a short sample.
items <- c(
  "lateral", "Farb", "Hemmung", "Bewegung", "Helligkeit", "Leuchtdichte",
  "Gestalt", "Winkel", "Größe", "Raum", "kognitiv", "Phänomen"
)
library("tidyverse"); library("pdftools"); library("stringr")
# Convert a vector of hit pages into a comma-delimited string.
#
# hitPages: numeric vector of page numbers (may be empty).
# Returns a single string such as "3, 17, 42"; empty input yields "".
pagesArray2CommaString <- function(hitPages) {
  labels <- as.character(hitPages)
  # Author's special case: page -3 is printed as the Roman numeral "III".
  # Should be omitted or generalised for other documents.
  labels[hitPages %in% -3] <- "III"
  # collapse joins with ", " and leaves no trailing separator to trim.
  paste(labels, collapse = ", ")
}
# Read in all PDF pages, lower-cased so that the search is case-insensitive.
allPages <- tolower(pdf_text(kPath2PDF))
# Blank out the pages to ignore so they can never produce a hit.
allPages[kPages2ignore] <- ""

# Search all items and build the pages list.
# A page is a hit when the lower-cased item matches anywhere on it; the item
# is used as a stringr pattern (i.e. a regular expression).
pagesPerItem <- vapply(
  items,
  function(item) {
    # Shift physical page indices by the preamble so that they correspond to
    # the page numbering used in the document.
    hitPages <- which(str_detect(allPages, tolower(item))) - kPreamblePages
    pagesArray2CommaString(hitPages)
  },
  character(1),
  USE.NAMES = FALSE
)
itemsAndPages <- data.frame(items, pages = pagesPerItem)
# Sort the table alphabetically by item.
itemsAndPages <- itemsAndPages[order(itemsAndPages$items), ]
# Convert to DOCX
library("officer") # read_docx(), body_add_par() and print() for DOCX objects
stichwortverz_doc <- read_docx() |>
  body_add_par("Stichwortverzeichnis") |>
  body_add_par(" ")
# One paragraph per item: "<item>:<TAB><pages>".
# seq_len() makes the loop a no-op when the table has no rows
# (1:nrow() would iterate over c(1, 0) instead).
for (i in seq_len(nrow(itemsAndPages))) {
  stichwortverz_doc <- stichwortverz_doc |>
    body_add_par(
      paste0(itemsAndPages$items[i], ":\t", itemsAndPages$pages[i])
    )
}
# Create the output file, readable by LibreOffice, Word, Pages etc.
print(stichwortverz_doc, target = kNameOfOutputFile)
