Skip to content

Instantly share code, notes, and snippets.

@oiva
Last active April 20, 2016 13:53
Show Gist options
  • Save oiva/6050045 to your computer and use it in GitHub Desktop.
Save oiva/6050045 to your computer and use it in GitHub Desktop.
PDF.py is a small script that converts scanned images to PDFs, reads the text from the PDF with Tesseract OCR, and adds the text to the PDF metadata keywords. Currently the script looks for images in the same directory and keeps the output PDF there too. Probably only works in OS X. If the text file exists already, the OCR is not run again. Ther…
"""
Find scanned images or PDFs in directory, run Tesseract OCR on them.
Generate PDF, import the text content into the PDF's metadata.
"""
#!/usr/bin/env python
import codecs
import glob
import os
import re
import subprocess
from distutils.spawn import find_executable
# Language code handed to tesseract's -l option.
LANG = "fin"
# Only files in the working directory with exactly this extension are processed.
INPUT_EXTENSION = ".pdf" # tested with .png and .pdf
# Suffix appended to the input's base name to form the output PDF name;
# handle_file() also uses it to recognise and skip earlier outputs.
OUTPUT_APPENDIX = "_ocr"
# Value passed to convert's -density option when an image is turned into a PDF.
OUTPUT_DPI = 300
# When False, handle_file() does not call run_ocr(); metadata is then only
# imported for inputs that already have a matching .txt file.
RUN_OCR = False
def _remove_if_present(path):
    """Delete *path*, ignoring the error raised when it cannot be removed."""
    try:
        os.remove(path)
    except OSError:
        pass


def handle_file(filepath):
    """Produce a metadata-filled PDF for the given input file.

    Files that are themselves the output of an earlier run, and files whose
    output PDF already exists, are skipped. Otherwise OCR is run (only when
    ``RUN_OCR`` is true), the recognised text is imported into the PDF
    metadata, intermediate files are removed, and the original file is
    deleted once the output PDF exists.

    Args:
        filepath: path of the scanned image or PDF to process.
    """
    # Derive the name parts from the argument rather than depending on
    # module-level variables that only exist while the main loop is running.
    filename, extension = os.path.splitext(filepath)
    output = "%s%s.pdf" % (filename, OUTPUT_APPENDIX)
    pdf = filename + ".pdf"
    png = filename + ".png"
    txt = filename + ".txt"

    # An old output file is one whose base name ends with the appendix; a
    # suffix test avoids skipping files that merely contain it elsewhere.
    if filename.endswith(OUTPUT_APPENDIX):
        return

    # Nothing to do when the output file exists already.
    if os.path.isfile(output):
        return

    if RUN_OCR:
        run_ocr(filepath)
    import_metadata(filepath)

    # Remove temporary files. For PDF input the intermediate is the PNG used
    # for OCR; for image input it is the PDF generated from the image.
    _remove_if_present(txt)
    _remove_if_present(png if extension == ".pdf" else pdf)

    # Remove the original file, but only if the OCR'd version exists.
    if os.path.isfile(output):
        os.remove(filepath)
def run_ocr(filepath):
    """Run a file (either PDF or image) through OCR and produce a .txt file.

    A PDF is first rasterised with Ghostscript into one PNG per page; the
    pages are appended into a single PNG with ImageMagick and that image is
    given to Tesseract. Any other input is given to Tesseract directly.
    Nothing happens when the text file exists already.

    Args:
        filepath: path of the PDF or image to recognise.
    """
    # Derive the name parts from the argument rather than from module globals.
    filename, extension = os.path.splitext(filepath)
    txt = filename + ".txt"

    # The OCR is not run again if the text file exists already.
    if os.path.isfile(txt):
        return

    if extension == ".pdf":
        # Convert PDF to PNGs for OCR.
        png = filename + ".png"
        print("convert %s to %s-*" % (filepath, filename))
        # Commands are passed as argument lists (no shell) so file names
        # containing spaces or shell metacharacters are handled correctly.
        # Ghostscript produces one PNG per page; %03d is its page counter.
        subprocess.call([
            "gs", "-sDEVICE=pngalpha",
            "-sOutputFile=%s-%%03d.png" % filename,
            "-dLastPage=100", "-r300", "-dNOPAUSE", "-dBATCH", "-q",
            filepath, "-c", "quit",
        ])
        # Zero-padded page numbers make lexical order equal page order.
        pages = sorted(glob.glob("%s-*.png" % filename))
        if pages:
            subprocess.call(["convert", "-append"] + pages + [png])
        for page in pages:
            try:
                os.remove(page)
            except OSError:
                # best-effort cleanup of the per-page images
                pass
        ocr_input = png
    else:
        ocr_input = filepath

    # Run OCR on the input, result goes to the text file.
    print("OCR %s to %s" % (ocr_input, txt))
    # ".txt" is added to the output base name by tesseract itself.
    subprocess.call(["tesseract", "-l", LANG, ocr_input, filename])
def import_metadata(filepath):
    """Import OCR text into the PDF metadata of the output file.

    Reads the text file with the same base name as the input, generates a
    PDF from the input if it is an image, rewrites the text file as a
    pdfmark ``/Keywords`` entry and lets Ghostscript write the output PDF
    from the PDF plus that pdfmark file. Returns without doing anything when
    the text file cannot be opened.

    Args:
        filepath: path of the input PDF or image.
    """
    # Derive the name parts from the argument rather than from module globals.
    filename, extension = os.path.splitext(filepath)
    txt = filename + ".txt"
    pdf = filename + ".pdf"
    output = "%s%s.pdf" % (filename, OUTPUT_APPENDIX)

    try:
        with open(txt, "r") as text_file:
            ocr_text = text_file.read()
    except IOError:
        # text file not found
        return

    # If the input is an image, generate the PDF from it. An argument list
    # (no shell) keeps file names with spaces or metacharacters intact.
    if extension != ".pdf":
        print("convert %s to %s" % (filepath, pdf))
        subprocess.call(["convert", "-density", str(OUTPUT_DPI), filepath, pdf])

    # Collapse the OCR result into a single metadata value: escape commas,
    # split on line breaks, drop blank lines and join the rest with ", ".
    ocr_text = ocr_text.replace(',', '\\,')
    lines = [line.strip() for line in re.split(r"[\n\r]+", ocr_text)]
    joined = ", ".join(line for line in lines if line)

    # PDF metadata wants either ascii or UTF-16-BE.
    meta = "[ /Keywords %s\n /DOCINFO pdfmark" % to_unicode(joined)

    # The text file is reused as the pdfmark input for Ghostscript.
    with open(txt, "w") as text_file:
        text_file.write(meta)

    print("update PDF metadata with %s" % txt)
    subprocess.call([
        "gs", "-q", "-dBATCH", "-dNOPAUSE", "-sDEVICE=pdfwrite",
        "-sOutputFile=%s" % output, pdf, txt,
    ])
def to_unicode(string):
    """Convert text into a PDF string object usable in PDF metadata.

    Pure-ASCII text becomes a literal string ``(…)`` with backslash,
    parentheses and tab escaped. Any other text becomes a hex string
    ``<…>`` holding the UTF-16-BE encoding preceded by its byte order mark.

    Args:
        string: UTF-8 encoded bytes or already-decoded text.

    Returns:
        The PDF string object as text.

    Raises:
        UnicodeDecodeError: if byte input is not valid UTF-8.
    """
    # Only byte strings need decoding; decoding text again would fail.
    if isinstance(string, bytes):
        string = string.decode('utf_8')
    try:
        string.encode('ascii')
    except UnicodeEncodeError:
        encoded = codecs.BOM_UTF16_BE + string.encode('utf_16_be')
        # bytearray yields integers regardless of how bytes iterate.
        hex_digits = ''.join('{:02X}'.format(octet) for octet in bytearray(encoded))
        return '<{}>'.format(hex_digits)
    # Escape characters that are used as PDF metadata control characters.
    # The backslash comes first so later escapes are not escaped again.
    for raw, escaped in [('\\', '\\\\'), ('(', '\\('), (')', '\\)'), ('\t', '\\t')]:
        string = string.replace(raw, escaped)
    return '({})'.format(string)
# Check that Tesseract, Imagemagick and Ghostscript exist before doing any
# work. Messages use adjacent string literals rather than backslash
# continuations inside a literal, which would glue words together or embed
# the source indentation in the output.
TESSERACT = find_executable("tesseract")
CONVERT = find_executable("convert")
GS = find_executable("gs")
if TESSERACT is None:
    print("This script requires Tesseract, aborting. Install Tesseract with:\n"
          "brew install tesseract --all-languages")
    raise SystemExit(1)
if CONVERT is None:
    print("This script requires Imagemagick, aborting. Install Imagemagick with:\n"
          "brew install imagemagick")
    raise SystemExit(1)
if GS is None:
    print("This script requires Ghostscript, aborting. Install Ghostscript with:\n"
          "brew install gs")
    raise SystemExit(1)

print("Looking for %ss..." % INPUT_EXTENSION)
for f in os.listdir("."):
    if not os.path.isfile(f):
        continue
    # Deliberately assigned at module level: functions in this file have
    # read `filename` and `extension` as globals.
    filename, extension = os.path.splitext(f)
    if extension != INPUT_EXTENSION:
        continue
    handle_file(f)
# SystemExit is raised directly; the exit() helper is only provided by the
# site module and is not guaranteed to be available.
raise SystemExit(0)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment