jburon/ocrpdf.sh

## ocrpdf.sh
#!/bin/sh

# Take a PDF, OCR it, and add OCR Text as background layer to original PDF to make it searchable.
# Hacked together using tips from these websites:
#      http://www.jlaundry.com/2012/ocr-a-scanned-pdf-with-tesseract/
#      http://askubuntu.com/questions/27097/how-to-print-a-regular-file-to-pdf-from-command-line
# Dependencies: pdftk, tesseract, imagemagick, hocr2pdf

# Usage: ocrpdf.sh FILE.pdf
#   Leaves the untouched original in FILE.pdf.bak and replaces FILE.pdf with a
#   searchable version (page image with the OCR text layer produced by hocr2pdf).
#   Exits non-zero, without touching FILE.pdf, if any processing step fails.

# Abort on the first failing command or unset variable so that a broken
# intermediate step can never lead to the input being overwritten.
set -eu

if [ "$#" -lt 1 ] || [ ! -f "$1" ]; then
    printf 'Usage: %s FILE.pdf\n' "${0##*/}" >&2
    exit 2
fi

# All scratch work happens in a private directory, so the input must be
# addressed by an absolute path once we change directory.
case $1 in
    /*) src=$1 ;;
    *)  src=$PWD/$1 ;;
esac

cp -- "$src" "$src.bak"

# Working in a throw-away directory keeps the cleanup from deleting unrelated
# *.tif / output* / new* files that happen to live next to the input, and also
# contains the doc_data.txt report that "pdftk burst" writes.
workdir=$(mktemp -d "${TMPDIR:-/tmp}/ocrpdf.XXXXXX")
trap 'rm -rf -- "$workdir"' EXIT
trap 'exit 1' HUP INT TERM
cd "$workdir"

# Five-digit page numbers keep the glob below in page order even for
# documents with 100 or more pages.
pdftk "$src" burst output tesspage_%05d.pdf

for file in tesspage_*.pdf
do
    # An unmatched glob stays literal; skip it instead of processing a bogus name.
    [ -e "$file" ] || continue
    page=${file%.pdf}

    # Convert the PDF page into a monochrome 600 dpi TIFF file
    convert -monochrome -density 600 "$file" "$page".tif

    # OCR the TIFF file; the hOCR result is written to output.hocr or,
    # with older tesseract releases, output.html
    tesseract "$page".tif output hocr
    if [ -f output.hocr ]; then
        hocr=output.hocr
    else
        hocr=output.html
    fi

    # Combine the page image and the recognised text into a searchable PDF page
    hocr2pdf -i "$page".tif -o new-"$page".pdf < "$hocr"

    # Clean up per-page intermediates so the next page starts fresh
    rm -f -- output.hocr output.html output.txt "$file" "$page".tif
done

# Stitch the searchable pages back together over the original file.
pdftk new-tesspage_*.pdf cat output "$src"
	#!/bin/sh

	# Take a PDF, OCR it, and add OCR Text as background layer to original PDF to make it searchable.
	# Hacked together using tips from these websites:
	# http://www.jlaundry.com/2012/ocr-a-scanned-pdf-with-tesseract/
	# http://askubuntu.com/questions/27097/how-to-print-a-regular-file-to-pdf-from-command-line
	# Dependencies: pdftk, tesseract, imagemagick, hocr2pdf

	# Usage: ocrpdf.sh FILE.pdf
	#   Leaves the untouched original in FILE.pdf.bak and replaces FILE.pdf with a
	#   searchable version (page image with the OCR text layer produced by hocr2pdf).
	#   Exits non-zero, without touching FILE.pdf, if any processing step fails.

	# Abort on the first failing command or unset variable so that a broken
	# intermediate step can never lead to the input being overwritten.
	set -eu

	if [ "$#" -lt 1 ] || [ ! -f "$1" ]; then
		printf 'Usage: %s FILE.pdf\n' "${0##*/}" >&2
		exit 2
	fi

	# All scratch work happens in a private directory, so the input must be
	# addressed by an absolute path once we change directory.
	case $1 in
		/*) src=$1 ;;
		*)  src=$PWD/$1 ;;
	esac

	cp -- "$src" "$src.bak"

	# Working in a throw-away directory keeps the cleanup from deleting unrelated
	# *.tif / output* / new* files that happen to live next to the input, and also
	# contains the doc_data.txt report that "pdftk burst" writes.
	workdir=$(mktemp -d "${TMPDIR:-/tmp}/ocrpdf.XXXXXX")
	trap 'rm -rf -- "$workdir"' EXIT
	trap 'exit 1' HUP INT TERM
	cd "$workdir"

	# Five-digit page numbers keep the glob below in page order even for
	# documents with 100 or more pages.
	pdftk "$src" burst output tesspage_%05d.pdf

	for file in tesspage_*.pdf
	do
		# An unmatched glob stays literal; skip it instead of processing a bogus name.
		[ -e "$file" ] || continue
		page=${file%.pdf}

		# Convert the PDF page into a monochrome 600 dpi TIFF file
		convert -monochrome -density 600 "$file" "$page".tif

		# OCR the TIFF file; the hOCR result is written to output.hocr or,
		# with older tesseract releases, output.html
		tesseract "$page".tif output hocr
		if [ -f output.hocr ]; then
			hocr=output.hocr
		else
			hocr=output.html
		fi

		# Combine the page image and the recognised text into a searchable PDF page
		hocr2pdf -i "$page".tif -o new-"$page".pdf < "$hocr"

		# Clean up per-page intermediates so the next page starts fresh
		rm -f -- output.hocr output.html output.txt "$file" "$page".tif
	done

	# Stitch the searchable pages back together over the original file.
	pdftk new-tesspage_*.pdf cat output "$src"