build-scanned-books.fish
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env fish | |
# This is a fish shell script for building DjVu and PDF books from a directory of images. | |
# Every book is different and has different processing requirements, | |
# and it often makes sense copying this script and adapting it for an individual book. | |
# Adapting usually involves modifying in non-trivial ways the process-page subroutine or the DjVu and PDF build steps. | |
# | |
# The script builds a DjVu file with a table of contents and an OCR layer. | |
# The DjVu file is then also converted to PDF because people tend to like PDFs more. | |
# You can also create PDFs directly, but this usually results in much much larger files. | |
# I learned the hard way that it's better to provide a PDF myself than to let people use bad converters. | |
# | |
# We suppose that we have a directory of images named in ascending order, for example 0001.png, 0002.png, .... | |
# This script must be run inside the image directory, either by copying the script inside
# or by running the script while the CWD is set to it. | |
# A DjVu table of contents is added via bookmarks.txt in the same directory. For the bookmark file format, see | |
# https://ebooks.stackexchange.com/questions/7866/how-insert-the-outline-the-bookmarks-into-djvu | |
# | |
# This script works by processing the individual pages concurrently and then combining the results. | |
# Concurrency is achieved via GNU Parallel, which requires a separate script to run for each page. | |
# We have two scripts which we write to a temporary directory and then run:
# * One for processing individual pages, | |
# * One for shared functions between the main and child script. | |
# | |
# The script's dependencies vary with the book, but they usually include
# * ImageMagick (https://imagemagick.org/) | |
# * unpaper (https://github.com/unpaper/unpaper) | |
# * DjVuLibre (http://djvu.sourceforge.net/) | |
# * minidjvu-mod (https://github.com/trufanov-nok/minidjvu-mod) or the original but slower and less user friendly minidjvu (https://github.com/barak/minidjvu)
# * Tesseract OCR (https://github.com/tesseract-ocr/tesseract) | |
# * ocrodjvu (https://github.com/jwilk/ocrodjvu) | |
# * Ghostscript (https://www.ghostscript.com/) | |
# * dpsprep (https://github.com/kcroker/dpsprep) | |
# | |
# I do not like to put licenses on my code, so consider this script Unlicensed (https://unlicense.org/). | |
# --- Configuration -----------------------------------------------------------
# External commands that must be available before any work starts.
# The DjVu encoder is listed as minidjvu-mod because that is the binary the
# build step actually invokes; adjust both places if you use plain minidjvu.
set dependencies parallel convert unpaper minidjvu-mod djvused ocrodjvu dpsprep
# Number of pages processed concurrently by GNU Parallel.
set workers 8
# Language passed to ocrodjvu for the OCR layer.
set ocr_language eng
set page_range (seq 3 104) # We often only require a subset of all pages
# Scratch directory holding the generated helper scripts and per-page output.
# `set` passes through the status of the command substitution, so a failed
# mktemp aborts here instead of letting later steps write to "/shared" etc.
set tmpdir (mktemp --directory --suffix=-build-scanned-book)
or exit 1
# Output files are named after the current (image) directory.
set target_djvu (basename (pwd)).djvu
set target_pdf (basename (pwd)).pdf
# Source images, in glob (ascending name) order; indexed later by $page_range.
set src *.png
# Cleanup hook, registered for the fish_exit event and for SIGTERM.
# Asks whether the temporary build directory should be removed;
# answering "Y", "y" or just pressing Enter deletes it, anything else keeps it.
function finish --on-event fish_exit --on-signal TERM
    read --local --prompt-str "Delete the temporary directory $tmpdir? (Y/n) " confirm
    if contains -- "$confirm" Y y ''
        rm --recursive $tmpdir
    end
end
# Generate the helper library used by both this script and the per-page worker,
# then load it into the current shell. It is written to a file (rather than
# only defined here) because each worker is a separate fish process that has
# to `source` it. Every line below is emitted verbatim into $tmpdir/shared:
#   notify - print its arguments as a bold green ">>> ..." message
#   run    - print a command in cyan, execute it, and exit 1 if it fails
printf '%s\n' \
    '' \
    "function notify --description 'Accented echo'" \
    'echo (set_color --bold green)">>> $argv"(set_color normal)' \
    'end' \
    "function run --description 'Print a command, run it and exit if it fails'" \
    'set escaped_argv (string escape -- $argv)' \
    'echo (set_color cyan)"Running $escaped_argv"(set_color normal)' \
    'eval $escaped_argv' \
    'or exit 1' \
    'end' \
    '' >$tmpdir/shared
source $tmpdir/shared
# Verify that every required external command is installed before doing any work.
# Each dependency is probed individually with the fish builtin `command`, so the
# check does not itself depend on an external `which` binary, and every missing
# command is reported by name on stderr before the script exits with status 1.
notify 'Verifying dependencies'
set missing_dependencies
for dependency in $dependencies
    command -sq $dependency
    or set missing_dependencies $missing_dependencies $dependency
end
if test (count $missing_dependencies) -gt 0
    echo "Missing dependencies: $missing_dependencies" >&2
    exit 1
end
# This is a general-purpose subroutine for processing individual pages. It is the most likely component that would need to be modified. | |
# For example, if compression is too destructive, you might wish to skip unpaper and instead create, at each step, a grayscale DjVu file. | |
# This can be achieved by replacing .pbm with .pgm as the output file extension for ImageMagick, and then encoding it via | |
# c44 -dpi 72 \$output | |
# This would generate a file called \$output.djvu. | |
# Rather than using minidjvu to combine .pbm files, combining the individual pages would be achieved via
# djvm -create $outfile $tmpdir/*.djvu | |
# Another option would be to skip DjVu entirely, set the output format to .pdf, and then combining the individual PDFs via | |
# gs -sDEVICE=pdfwrite -dPDFSETTINGS=/ebook -dBATCH -dNOPAUSE -sOutputFile=$target_pdf $tmpdir/*.pdf | |
# Write the per-page worker script that GNU Parallel runs once per image.
# $tmpdir is expanded now, while the script is being generated; the escaped or
# single-quoted variables ($argv, $input, $output) are left for the worker to
# expand when it runs. The worker:
#   1. loads the shared helpers,
#   2. takes the image file name from its first argument,
#   3. converts the image to a monochrome .pbm inside the temporary directory,
#   4. cleans that bitmap in place with unpaper.
printf '%s\n' \
    '' \
    "source $tmpdir/shared" \
    'set input $argv[1]' \
    "set output $tmpdir/\$input.pbm" \
    'run convert -monochrome $input $output' \
    'run unpaper --overwrite $output $output' \
    '' >$tmpdir/process-page
# Process the selected pages concurrently, one worker invocation per image.
notify 'Beginning batch processing'
# Pick the images whose positions in $src are listed in $page_range.
set selected_pages $src[$page_range]
# Stop early with a clear message instead of starting parallel with no input.
if test (count $selected_pages) -eq 0
    echo 'No input images fall within the configured page range' >&2
    exit 1
end
# Emit exactly one file name per line. Splitting an echo'ed list on spaces
# would break any file name that itself contains a space.
# --halt now,fail=1 aborts all jobs as soon as one page fails;
# --lb (line-buffer) interleaves worker output line by line.
printf '%s\n' $selected_pages |
    parallel --halt now,fail=1 --jobs $workers --lb fish $tmpdir/process-page
or exit 1
# Combine the per-page bitmaps into a single DjVu file. Every step goes through
# `run`, which echoes the command and exits if it fails.
notify 'Building the DjVu book'
run minidjvu-mod --verbose $tmpdir/*.pbm $target_djvu
# Apply the table of contents from bookmarks.txt (in the current directory).
set outline_command 'set-outline bookmarks.txt'
run djvused $target_djvu -e $outline_command -s
# Add the OCR text layer directly to the DjVu file.
run ocrodjvu -l $ocr_language --in-place --on-error=resume $target_djvu
# dpsprep does a lot of work, so if we only care about the image content of the DjVu file, we may instead use
# convert $target_djvu $target_pdf
notify 'Converting the DjVu book to PDF'
run dpsprep $target_djvu $target_pdf
notify 'Success!'
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment