# v--/scanned_books.makefile

## scanned_books.makefile
# This is a GNU Make file specifically designed for building PDF books from a directory of images.
# It can process multiple images concurrently with `make --jobs=8`.
#
# Every book is different and has different processing requirements,
# and it often makes sense to copy this file and adapt it for an individual book.
# Adapting usually involves non-trivial modifications to the targets for the individual pages.
#
# The script builds a PDF file with a table of contents and an OCR layer.
# I used to build DjVu files instead because of their better image compression, but I learned that
# it's better to provide a PDF myself than to let people use bad converters.
# Furthermore, PDF files with Group4-compressed bitonal images are as small as bitonal DjVu files.
# See the history of this gist for a script for building DjVu files.
#
# For building, we suppose that we have a directory of images named in ascending order, e.g. 0001.png, 0002.png, ....
# We also suppose that the name of the directory will be the name of the resulting book (this name can include spaces).
# A table of contents is added via bookmarks.txt in the same directory. The bookmark file format consists of blocks like
#
# BookmarkBegin
# BookmarkTitle: Chapter 1. Introduction
# BookmarkLevel: 1
# BookmarkPageNumber: 7
#
# BookmarkBegin
# BookmarkTitle: §1. Preliminary Notions
# BookmarkLevel: 2
# BookmarkPageNumber: 8
#
# (It is actually not limited to bookmarks and may contain other PDF metainformation).
#
# The script dependencies vary with the book, but they usually include
# * ImageMagick (https://imagemagick.org/) - for general-purpose image processing
# * unpaper (https://github.com/unpaper/unpaper) - for post-processing scanned pages
# * Tesseract OCR (https://github.com/tesseract-ocr/tesseract) - for OCR
# * ocrmypdf (https://github.com/ocrmypdf/OCRmyPDF) - for performing OCR on an existing PDF file via the above tool
# * Ghostscript (https://www.ghostscript.com/) - for processing PDF files
# * djvulibre (https://github.com/traycold/djvulibre) - for working with DjVu files
# * dpsprep (https://github.com/kcroker/dpsprep) - for converting DjVu to PDF
#
# I do not like to put licenses on my code, so consider this makefile Unlicensed (https://unlicense.org/).

# Configuration variables
# Every comment sits on its own line: make keeps the whitespace between a value and a
# trailing `#` comment as part of the value, which silently appends spaces to it.
#
# Even in a book in Cyrillic, math features Latin and Greek letters
OCR_LANGUAGES := rus+eng+grc
# The list of pages to process, zero-padded to four digits (0003 ... 0104)
PAGE_RANGE := $(shell seq --format '%04g' 3 104)
# The list of binaries whose non-existence will fail the build
EXECUTABLES := magick unpaper ocrmypdf pdftk

# Some technical variables
#
# E expands to nothing, so `$E $E` denotes one literal space inside a function argument.
# This whitespace trick is from https://stackoverflow.com/a/56411000
E :=
# The book name is the last component of the current directory, spaces included.
# basename runs in the shell on a quoted path because $(notdir ...) splits its argument
# on whitespace and would also return pieces of parent directories containing spaces.
OUTPUT_NAME_RAW := $(shell basename "$$(pwd)")
# The same name with every space backslash-escaped, usable as a make target name
OUTPUT_NAME := $(subst $E $E,\ ,$(OUTPUT_NAME_RAW))
# A short checksum of the name selects the scratch directory of this book
CHECKSUM := $(shell echo $(OUTPUT_NAME) | sum | cut --delimiter ' ' --fields 1)
# Without a checksum TMP_DIR would be the scratch root itself, and clean_tmp would
# then remove the intermediate files of every book rather than only this one
$(if $(strip $(CHECKSUM)),,$(error Could not compute a checksum for '$(OUTPUT_NAME_RAW)'))
TMP_DIR := /var/tmp/build-scanned-book/$(CHECKSUM)
# One post-processed bitonal image per page of PAGE_RANGE, all inside TMP_DIR
PROCESSED_IMAGES := $(addprefix $(TMP_DIR)/,$(addsuffix _unpaper.pbm,$(PAGE_RANGE)))

# Collect every entry of EXECUTABLES that `which` cannot locate, and refuse to go on
# when at least one is absent
MISSING_DEPS := $(strip $(foreach tool,$(EXECUTABLES),$(if $(shell which $(tool) 2>/dev/null),,$(tool))))

ifneq ($(MISSING_DEPS),)
$(error Missing executables: $(MISSING_DEPS))
endif

# Collect the pages of PAGE_RANGE for which the current directory holds no file named
# <page>.<any extension>, and refuse to go on when at least one is absent
MISSING_PAGES := $(strip $(foreach number,$(PAGE_RANGE),$(if $(wildcard $(number).*),,$(number))))

ifneq ($(MISSING_PAGES),)
$(error Missing pages: $(MISSING_PAGES))
endif

# Delete a target whose recipe failed, so that a truncated image or PDF left behind by
# a failed command is never mistaken for an up-to-date file on the next run
.DELETE_ON_ERROR:
# Given without prerequisites, this makes make treat no target as intermediate
.NOTINTERMEDIATE:
# The cleaning targets are commands rather than files
.PHONY: clean_tmp clean_pdf

# Default goal: the finished book. pdftk writes the metadata from bookmarks.txt into the
# OCRed PDF, and the scratch directory is removed by a recursive make afterwards.
$(OUTPUT_NAME).pdf: $(TMP_DIR)/ocr.pdf bookmarks.txt
	pdftk $< \
		update_info_utf8 $(word 2,$^) \
		output '$@'
	$(MAKE) clean_tmp

# Throw away the scratch directory of this book together with everything inside it
clean_tmp:
	rm -rf $(TMP_DIR)

# Delete the finished book; the quotes keep a name containing spaces in one piece
clean_pdf:
	rm -f '$(OUTPUT_NAME_RAW).pdf'

# Create the scratch directory along with any missing parent directories
$(TMP_DIR):
	mkdir -p $@

# The intermediate targets
#
# Convert a source page <page>.png into <page>_magick.pbm using a fixed 80% threshold.
# The scratch directory is an order-only prerequisite (after the `|`): it has to exist
# before the recipe runs, but its timestamp never makes a page out of date.
$(TMP_DIR)/%_magick.pbm: %.png | $(TMP_DIR)
	magick $< -threshold 80% $@

# We can easily specialize a rule for a certain page
# For example, we can use an adaptive threshold when converting to a bitonal image
# This explicit rule for page 0030 takes precedence over the pattern rule for
# %_magick.pbm; -lat is ImageMagick's local adaptive threshold option
$(TMP_DIR)/0030_magick.pbm: 0030.png | $(TMP_DIR)
	magick $< -lat 20x20-5% $@

# Another way to specialize is to use the if function
# It is useful for when a list of pages needs to behave differently
# Here we add --no-deskew for pages 10, 20 and 30
# The stem $* is the page name exactly as it appears in PAGE_RANGE, i.e. zero-padded to
# four digits, so the page list has to be zero-padded too or the filter never matches
# unpaper has bulk processing built-in, but we use it on a per-file basis
$(TMP_DIR)/%_unpaper.pbm: $(TMP_DIR)/%_magick.pbm
	unpaper $< $(if $(filter $*,0010 0020 0030),--no-deskew,) $@

# Bind all post-processed pages, in PAGE_RANGE order, into a single Group4-compressed PDF
# whose title is the book name
$(TMP_DIR)/combined.pdf: $(PROCESSED_IMAGES)
	magick $^ \
		-define pdf:Title='$(OUTPUT_NAME_RAW)' \
		-verbose \
		-compress group4 \
		$@

# Add the OCR text layer to the combined PDF, recognising the languages in OCR_LANGUAGES
$(TMP_DIR)/ocr.pdf: $(TMP_DIR)/combined.pdf
	ocrmypdf --language=$(OCR_LANGUAGES) $< $@