# michaelbach/GenerateSubjectItemsList.R

## GenerateSubjectItemsList.R
# Generate a "subject items" directory ("Stichwortverzeichnis" in German) from a PDF
#
# Given a PDF (e.g. of a book) and a manually generated list of subject items,
# this gist produces a table containing each item and the page numbers on which it occurs.
# This table is then written to a DOCX file.
#
# Written in R <https://www.r-project.org/>
#
# MIT License
# Copyright © 2022 Michael Bach
#
#
# History
# =======
#
# 2022-01-09 gisted it
# 2022-01-08 begun

######################################################################################

# Script-wide configuration
# Offset between physical PDF pages and printed page numbers
# (printed numbering often begins a little later than the physical 1st page)
kPreamblePages <- 6
# Physical pages whose content is irrelevant; they are excluded from the search
kPages2ignore <- c(1, 2, 5)
# Input PDF and name of the resulting DOCX file
kPath2PDF <- "Bach_Sehphaenomene_03.pdf"
kNameOfOutputFile <- "Bach-Stichwortverzeichnis.docx"
# Subject items to look up; only a short sample is listed here
items <- c(
  "lateral", "Farb", "Hemmung", "Bewegung",
  "Helligkeit", "Leuchtdichte", "Gestalt", "Winkel",
  "Größe", "Raum", "kognitiv", "Phänomen"
)


library("tidyverse");  library("pdftools");  library("stringr")


# Convert a vector of hit pages into a comma-delimited string, e.g. "3, 17, 42".
#
# hitPages: vector of page numbers (may be empty).
# Returns a single string; "" when hitPages is empty.
pagesArray2CommaString <- function(hitPages) {
  pageLabels <- as.character(hitPages)
  # this is for my special case, should be omitted or generalised:
  # page -3 is written as the roman numeral "III"
  pageLabels[hitPages %in% -3] <- "III"
  # paste(collapse = ) joins the labels without a trailing separator, so there
  # is nothing to trim afterwards, and an empty input yields ""
  paste(pageLabels, collapse = ", ")
}


# Extract the text of the PDF and lower-case it; the search further down
# lower-cases each item as well, which makes the matching case-insensitive
allPages <- kPath2PDF |>
  pdf_text() |>
  tolower()
# Blank out the pages that must not contribute any hits
allPages[kPages2ignore] <- ""


# Search all pages for every item and build the table of items and their pages
itemsAndPages <- data.frame(items)
itemsAndPages$pages <- vapply(
  items,
  function(item) {
    # allPages is lower-cased, so lower-case the item too (case-insensitive search).
    # str_detect() gives one TRUE/FALSE per page, so which() yields the physical
    # page indices directly; no comparison against NA is needed.
    hitPages <- which(str_detect(allPages, tolower(item))) - kPreamblePages
    pagesArray2CommaString(hitPages)
  },
  character(1),
  USE.NAMES = FALSE
)
# Sort the table alphabetically by item
itemsAndPages <- itemsAndPages[order(itemsAndPages$items), ]
# The data viewer is only useful (and only reliably available) in an interactive session
if (interactive()) {
  View(itemsAndPages)
}


# convert to DOCX
library("officer") #https://www.r-bloggers.com/2020/02/creating-ms-word-reports-using-the-officer-package/
# New document starting with a title paragraph followed by a spacer paragraph
stichwortverz_doc <- read_docx() |>
  body_add_par("Stichwortverzeichnis") |>
  body_add_par(" ")

# One paragraph per table row: "<item>:<TAB><pages>".
# seq_len() instead of 1:nrow(): for an empty table 1:0 would iterate over
# c(1, 0) and append bogus paragraphs.
for (i in seq_len(nrow(itemsAndPages))) {
  stichwortverz_doc <- body_add_par(
    stichwortverz_doc,
    paste0(itemsAndPages$items[i], ":\t", itemsAndPages$pages[i])
  )
}
# create output file, readable by Libre Office, Word, Pages etc.
print(stichwortverz_doc, target = kNameOfOutputFile)
	# Generate a "subject items" directory ("Stichwortverzeichnis" in German) from a PDF
	#
	# Given a PDF (e.g. of a book) and a manually generated list of subject items,
	# this gist produces a table containing each item and the page numbers on which it occurs.
	# This table is then written to a DOCX file.
	#
	# Written in R <https://www.r-project.org/>
	#
	# MIT License
	# Copyright © 2022 Michael Bach
	#
	#
	# History
	# =======
	#
	# 2022-01-09 gisted it
	# 2022-01-08 begun

	######################################################################################

	# Script-wide configuration
	# Offset between physical PDF pages and printed page numbers
	# (printed numbering often begins a little later than the physical 1st page)
	kPreamblePages <- 6
	# Physical pages whose content is irrelevant; they are excluded from the search
	kPages2ignore <- c(1, 2, 5)
	# Input PDF and name of the resulting DOCX file
	kPath2PDF <- "Bach_Sehphaenomene_03.pdf"
	kNameOfOutputFile <- "Bach-Stichwortverzeichnis.docx"
	# Subject items to look up; only a short sample is listed here
	items <- c(
	  "lateral", "Farb", "Hemmung", "Bewegung",
	  "Helligkeit", "Leuchtdichte", "Gestalt", "Winkel",
	  "Größe", "Raum", "kognitiv", "Phänomen"
	)


	library("tidyverse"); library("pdftools"); library("stringr")


	# Convert a vector of hit pages into a comma-delimited string, e.g. "3, 17, 42".
	#
	# hitPages: vector of page numbers (may be empty).
	# Returns a single string; "" when hitPages is empty.
	pagesArray2CommaString <- function(hitPages) {
	  pageLabels <- as.character(hitPages)
	  # this is for my special case, should be omitted or generalised:
	  # page -3 is written as the roman numeral "III"
	  pageLabels[hitPages %in% -3] <- "III"
	  # paste(collapse = ) joins the labels without a trailing separator, so there
	  # is nothing to trim afterwards, and an empty input yields ""
	  paste(pageLabels, collapse = ", ")
	}


	# Extract the text of the PDF and lower-case it; the search further down
	# lower-cases each item as well, which makes the matching case-insensitive
	allPages <- kPath2PDF |>
	  pdf_text() |>
	  tolower()
	# Blank out the pages that must not contribute any hits
	allPages[kPages2ignore] <- ""


	# Search all pages for every item and build the table of items and their pages
	itemsAndPages <- data.frame(items)
	itemsAndPages$pages <- vapply(
	  items,
	  function(item) {
	    # allPages is lower-cased, so lower-case the item too (case-insensitive search).
	    # str_detect() gives one TRUE/FALSE per page, so which() yields the physical
	    # page indices directly; no comparison against NA is needed.
	    hitPages <- which(str_detect(allPages, tolower(item))) - kPreamblePages
	    pagesArray2CommaString(hitPages)
	  },
	  character(1),
	  USE.NAMES = FALSE
	)
	# Sort the table alphabetically by item
	itemsAndPages <- itemsAndPages[order(itemsAndPages$items), ]
	# The data viewer is only useful (and only reliably available) in an interactive session
	if (interactive()) {
	  View(itemsAndPages)
	}


	# convert to DOCX
	library("officer") #https://www.r-bloggers.com/2020/02/creating-ms-word-reports-using-the-officer-package/
	# New document starting with a title paragraph followed by a spacer paragraph
	stichwortverz_doc <- read_docx() |>
	  body_add_par("Stichwortverzeichnis") |>
	  body_add_par(" ")

	# One paragraph per table row: "<item>:<TAB><pages>".
	# seq_len() instead of 1:nrow(): for an empty table 1:0 would iterate over
	# c(1, 0) and append bogus paragraphs.
	for (i in seq_len(nrow(itemsAndPages))) {
	  stichwortverz_doc <- body_add_par(
	    stichwortverz_doc,
	    paste0(itemsAndPages$items[i], ":\t", itemsAndPages$pages[i])
	  )
	}
	# create output file, readable by Libre Office, Word, Pages etc.
	print(stichwortverz_doc, target = kNameOfOutputFile)