#!/usr/bin/env bash
# Created Feb 9, 2014
# pdfocr - script to transform a PDF containing a scanned book into a searchable PDF
# This is a script to transform a PDF containing a scanned book into a searchable PDF.
# Based on a previous script and many good tips by Konrad Voelkel.
# Depends on convert (ImageMagick), pdftk and hocr2pdf (ExactImage).
# $ sudo apt-get install imagemagick pdftk exactimage
# You also need at least one OCR software which can be either tesseract or cuneiform.
# $ sudo apt-get install tesseract-ocr
# $ sudo apt-get install cuneiform
# To install languages into tesseract do (e.g. for Portuguese):
# $ sudo apt-get install tesseract-ocr-por
# Usage:
#   <script> document.pdf ocr-sfw split lang [author] [title]
# where ocr-sfw is either tesseract or cuneiform
# split is either 0 (already single-paged) or 1 (2 book-pages per pdf-page)
# lang is a language as in "tesseract --list-langs" or "cuneiform -l".
# and author, title are used for the PDF metadata (empty when omitted).
# usage example (substitute the name this script is saved under):
# ./pdfocr.sh SomeFile.pdf tesseract 1 por "Some Author" "Some Title"
#
# The result is written to "<document.pdf>-ocr.pdf".
#
# All intermediate files (pg_*, merged*, doc_data.txt, in.info) are created in
# the current directory and removed again at the end, so run this from a
# directory that holds nothing else matching those names.
#
# Requires bash. Errors are propagated through return statuses instead of
# `set -e`, so that the intermediate files are removed on failure as well.

set -uo pipefail

# Scratch file that receives the Author/Title/Creator records for pdftk.
readonly PDFOCR_INFO_FILE=in.info

#######################################
# Print the command-line synopsis.
# Outputs: writes the synopsis to stderr
#######################################
usage() {
  printf 'usage: %s document.pdf ocr-sfw split lang author title\n' "$0" >&2
}

#######################################
# Emit the metadata records later passed to `pdftk update_info_utf8`.
# Arguments: $1 - author, $2 - title
# Outputs:   three InfoBegin/InfoKey/InfoValue records on stdout
#######################################
pdf_info_records() {
  # printf re-applies the format for every key/value pair that follows.
  printf 'InfoBegin\nInfoKey: %s\nInfoValue: %s\n' \
    'Author' "$1" \
    'Title' "$2" \
    'Creator' 'PDF OCR scan script'
}

#######################################
# Convert every burst page (pg_*.pdf) to "<page>.png" and then delete the
# per-page PDFs, so that pg_*.pdf later matches only the OCR results.
# Arguments: $1 - split flag; "1" crops each page into 50%-wide halves
# Returns:   non-zero as soon as a conversion fails
#######################################
rasterise_pages() {
  local split=$1
  local page
  for page in pg_*.pdf; do
    [[ -e "$page" ]] || continue # the glob matched nothing
    if [[ "$split" == "1" ]]; then
      convert -normalize -density 300 -depth 8 -crop 50%x100% +repage \
        "$page" "$page.png" || return
    else
      convert -normalize -density 300 -depth 8 "$page" "$page.png" || return
    fi
  done
  rm -f -- pg_*.pdf
}

#######################################
# Run the chosen OCR program on every pg_*.png and combine each image with its
# hOCR output into "<image>.pdf" using hocr2pdf.
# Arguments: $1 - OCR program (tesseract or cuneiform), $2 - language
# Outputs:   a warning on stderr for each page that yields no hOCR file
# Returns:   2 for an unknown OCR program, non-zero if hocr2pdf fails
#######################################
ocr_pages() {
  local engine=$1 lang=$2
  local img
  for img in pg_*.png; do
    [[ -e "$img" ]] || continue # the glob matched nothing
    # The OCR exit status is deliberately not treated as fatal: a page that
    # cannot be recognised is reported below and skipped, the rest continues.
    case "$engine" in
      tesseract)
        tesseract -l "$lang" -psm 1 "$img" "$img" hocr
        # NOTE(review): newer tesseract releases reportedly name the hOCR
        # output "<base>.hocr" (and spell the option --psm) - confirm against
        # the installed version. The rename keeps both naming schemes working.
        if [[ ! -e "$img.html" && -e "$img.hocr" ]]; then
          mv -- "$img.hocr" "$img.html" || return
        fi
        ;;
      cuneiform)
        cuneiform -l "$lang" -f hocr -o "$img.html" "$img"
        ;;
      *)
        printf '%s is not a valid OCR software.\n' "$engine" >&2
        return 2
        ;;
    esac
    if [[ ! -e "$img.html" ]]; then
      printf 'warning: no hOCR output for %s; page left out\n' "$img" >&2
      continue
    fi
    hocr2pdf -i "$img" -r 300 -s -o "$img.pdf" <"$img.html" || return
  done
}

#######################################
# Run the whole pipeline: burst, rasterise, OCR, merge, restore the document
# data dumped by the burst step, then add author/title/creator metadata.
# Arguments: $1 - input PDF, $2 - OCR program, $3 - split flag,
#            $4 - language, $5 - author, $6 - title
# Returns:   status of the first failing step, 0 on success
#######################################
build_searchable_pdf() {
  local in_filename=$1 engine=$2 split=$3 lang=$4 author=$5 title=$6

  pdftk "$in_filename" burst dont_ask || return
  rasterise_pages "$split" || return
  ocr_pages "$engine" "$lang" || return

  pdftk pg_*.pdf cat output merged.pdf || return
  # doc_data.txt is expected to have been left behind by the burst step.
  pdftk merged.pdf update_info_utf8 doc_data.txt output merged+data.pdf || return

  pdf_info_records "$author" "$title" >"$PDFOCR_INFO_FILE" || return
  pdftk merged+data.pdf update_info_utf8 "$PDFOCR_INFO_FILE" \
    output "${in_filename}-ocr.pdf"
}

#######################################
# Entry point: validate the arguments, run the pipeline, clean up.
# Arguments: document.pdf ocr-sfw split lang [author] [title]
# Returns:   2 on usage errors, 1 on missing input/tools, otherwise the
#            pipeline status
#######################################
main() {
  if (($# < 4)); then
    usage
    return 2
  fi

  local in_filename=$1 engine=$2 split=$3 lang=$4
  local author=${5:-} title=${6:-}
  local tool
  local status=0

  # Reject a bad OCR program before any page has been burst or converted.
  case "$engine" in
    tesseract | cuneiform) ;;
    *)
      printf '%s is not a valid OCR software.\n' "$engine" >&2
      return 2
      ;;
  esac

  if [[ ! -f "$in_filename" ]]; then
    printf 'error: input file not found: %s\n' "$in_filename" >&2
    return 1
  fi

  for tool in pdftk convert hocr2pdf "$engine"; do
    if ! command -v "$tool" >/dev/null; then
      printf 'error: required tool not found: %s\n' "$tool" >&2
      return 1
    fi
  done

  build_searchable_pdf "$in_filename" "$engine" "$split" "$lang" \
    "$author" "$title" || status=$?

  # Remove the intermediates whether or not the pipeline succeeded.
  rm -rf -- doc_data.txt "$PDFOCR_INFO_FILE" merged* pg_*
  return "$status"
}

main "$@"
