Skip to content

Instantly share code, notes, and snippets.

@patrickmclaren
Last active December 21, 2015 15:59
Show Gist options
  • Save patrickmclaren/6330346 to your computer and use it in GitHub Desktop.
Save patrickmclaren/6330346 to your computer and use it in GitHub Desktop.
Embed OCR text in PDF.
#!/usr/bin/python
import subprocess
import os
import sys
import argparse
from argparse import RawTextHelpFormatter
# Write handle on the null device, opened once at import time. It is passed as
# `stdout` to the external commands this script runs so their output is
# discarded instead of cluttering the terminal.
FNULL = open(os.devnull, 'w')
def init():
    """Verify external dependencies and load the required modules.

    Checks that every executable this script shells out to is on PATH and
    that every required Python library can be imported. Each successfully
    imported library is bound as a module-level global under its own name,
    so the rest of the file can refer to it (e.g. ``PyPDF2``) without a
    top-level import.

    Every missing dependency is reported on stdout before exiting, so the
    user sees the complete list in one run.

    Raises:
        SystemExit: with status -1 if any executable or library is missing.
    """
    # `convert` (ImageMagick) is listed because process_pdf() invokes it to
    # rasterize pages; without the check a missing install only surfaces
    # halfway through processing.
    reqs = ["convert", "tesseract", "hocr2pdf"]
    libs = ["PyPDF2"]
    dependencies_satisfied = True

    for ex in reqs:
        try:
            # `which` exits non-zero when the executable is not on PATH.
            subprocess.check_call(["which", ex], stdout=FNULL,
                                  stderr=subprocess.STDOUT)
        except subprocess.CalledProcessError:
            dependencies_satisfied = False
            print("Cannot find executable \t'{}'".format(ex))

    for lib in libs:
        try:
            # Late import: publish the module under its own name at module
            # scope so other functions in this file can use it.
            globals()[lib] = __import__(lib)
        except ImportError:
            dependencies_satisfied = False
            print("Cannot find library \t'{}'".format(lib))

    if not dependencies_satisfied:
        sys.exit(-1)
def parse_args(argv=None):
    """Parse the command line to find the PDF file to process.

    Args:
        argv: Optional list of argument strings to parse instead of the
            process command line. When None (the default), argparse reads
            ``sys.argv[1:]``, which is the behaviour existing callers get.

    Returns:
        The argparse namespace. ``file`` is a one-element list holding the
        PDF path, because the positional is declared with ``nargs=1``.
    """
    overview = "Add OCR to scanned documents in PDF format."
    parser = argparse.ArgumentParser(description=overview,
                                     formatter_class=RawTextHelpFormatter)

    file_help = "PDF file to scan"
    parser.add_argument('file', metavar='FILE', nargs=1, help=file_help)

    return parser.parse_args(argv)
def _run_quiet(cmd, **kwargs):
    """Run an external command with its output discarded; raise on failure."""
    subprocess.check_call(cmd, stdout=FNULL, stderr=subprocess.STDOUT, **kwargs)


def _ocr_page(page_file):
    """Replace the single-page PDF `page_file` with an OCR-annotated version.

    The intermediate image and hOCR files are written next to `page_file`
    and are left for the caller to clean up.
    """
    stem = os.path.splitext(page_file)[0]
    image_name = "{}.jpeg".format(stem)

    _run_quiet(["convert", "-density", "300", "-quality", "100",
                page_file, image_name])
    _run_quiet(["tesseract", image_name, stem, "hocr"])

    # Depending on its version, Tesseract names the hOCR output either
    # "<stem>.hocr" or "<stem>.html"; accept whichever one was produced.
    candidates = ("{}.hocr".format(stem), "{}.html".format(stem))
    hocr_name = next((c for c in candidates if os.path.exists(c)), None)
    if hocr_name is None:
        raise IOError(
            "tesseract produced no hOCR output for {!r}".format(image_name))

    # hocr2pdf reads the hOCR markup on stdin and overwrites the page PDF.
    with open(hocr_name, 'rb') as h:
        _run_quiet(["hocr2pdf", "-i", image_name, "-o", page_file], stdin=h)


def _merge_pages(page_files, out_name):
    """Concatenate the first page of each file in `page_files` into `out_name`."""
    handles = []
    try:
        writer = PyPDF2.PdfFileWriter()
        for page_file in page_files:
            # Keep every source open until the merged document is written,
            # in case the library reads page content lazily at write time.
            handle = open(page_file, 'rb')
            handles.append(handle)
            writer.addPage(PyPDF2.PdfFileReader(handle).getPage(0))
        with open(out_name, 'wb') as out:
            writer.write(out)
    finally:
        for handle in handles:
            handle.close()


def process_pdf(filename):
    """Add an OCR text layer to every page of the PDF at `filename`, in place.

    The document is split into single-page PDFs, each page is rasterized
    (`convert`), recognized (`tesseract`) and rebuilt with embedded text
    (`hocr2pdf`), and the pages are merged back together.

    All intermediate files live in a private temporary directory that is
    removed whether or not processing succeeds. The original file is only
    replaced once the merged result has been written completely, so a
    failure at any step leaves the input untouched.

    Args:
        filename: Path of the PDF to process; overwritten on success.

    Raises:
        subprocess.CalledProcessError: if an external tool exits non-zero.
        IOError: if Tesseract produced no hOCR output for a page.
    """
    # Function-scope imports keep this block self-contained; both modules
    # are part of the standard library.
    import shutil
    import tempfile

    basename = os.path.splitext(os.path.basename(filename))[0]
    work_dir = tempfile.mkdtemp(prefix="pdf-ocr-")
    # Build the result beside the original so the final rename stays on one
    # filesystem.
    merged_name = "{}.ocr-tmp".format(filename)

    try:
        page_files = []
        with open(filename, 'rb') as f:
            reader = PyPDF2.PdfFileReader(f)
            for n in range(reader.getNumPages()):
                page_file = os.path.join(
                    work_dir, "{}-{}.pdf".format(basename, n))
                with open(page_file, 'wb') as g:
                    writer = PyPDF2.PdfFileWriter()
                    writer.addPage(reader.getPage(n))
                    writer.write(g)
                page_files.append(page_file)

        for page_file in page_files:
            _ocr_page(page_file)

        _merge_pages(page_files, merged_name)
        # Swap in the result only now that it is complete.
        os.rename(merged_name, filename)
    finally:
        shutil.rmtree(work_dir, ignore_errors=True)
        # Only present if something failed between merge and rename.
        if os.path.exists(merged_name):
            os.remove(merged_name)

    print("Successfully added text to PDF")
def main():
    """Script entry point: read the CLI, check dependencies, process the PDF."""
    # Arguments are parsed first so that usage errors and --help are handled
    # before the dependency check runs.
    pdf_path = parse_args().file[0]
    init()
    process_pdf(pdf_path)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment