Last active
December 21, 2015 15:59
-
-
Save patrickmclaren/6330346 to your computer and use it in GitHub Desktop.
Embed OCR text in PDF.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python
import argparse
import contextlib
import importlib
import os
import shutil
import subprocess
import sys
import tempfile
from argparse import RawTextHelpFormatter
# Sink for child-process output. subprocess.DEVNULL avoids holding an open
# file handle for the lifetime of the module (the previous open(os.devnull)
# was never closed).
FNULL = subprocess.DEVNULL
def init():
    """Verify external tools and Python libraries, then load the libraries.

    Every missing dependency is reported on stdout before exiting, so the
    user sees the complete list in a single run. Each library that imports
    successfully is bound into this module's globals under its own name,
    which is how ``PyPDF2`` becomes available to ``process_pdf``.

    Raises:
        SystemExit: with status -1 if any executable or library is missing.
    """
    # "convert" is invoked by process_pdf, so it is checked alongside the
    # OCR tools instead of failing later in the middle of a run.
    reqs = ["tesseract", "hocr2pdf", "convert"]
    libs = ["PyPDF2"]
    dependencies_satisfied = True

    for ex in reqs:
        # shutil.which searches PATH in-process, so the check does not
        # depend on an external `which` binary being installed.
        if shutil.which(ex) is None:
            dependencies_satisfied = False
            print("Cannot find executable \t'{}'".format(ex))

    for lib in libs:
        try:
            globals()[lib] = importlib.import_module(lib)
        except ImportError:
            dependencies_satisfied = False
            print("Cannot find library \t'{}'".format(lib))

    if not dependencies_satisfied:
        sys.exit(-1)
def parse_args():
    """Parse the command line and return the resulting namespace.

    Returns:
        argparse.Namespace whose ``file`` attribute is a one-element list
        (a consequence of ``nargs=1``) holding the path of the PDF to scan.
    """
    parser = argparse.ArgumentParser(
        description="Add OCR to scanned documents in PDF format.",
        formatter_class=RawTextHelpFormatter,
    )
    parser.add_argument('file', metavar='FILE', nargs=1,
                        help="PDF file to scan")
    return parser.parse_args()
def process_pdf(filename):
    """Add an OCR text layer to every page of a PDF, replacing it in place.

    Steps: split the PDF into single-page PDFs, rasterize each page with
    ``convert``, run ``tesseract`` in hOCR mode on the image, rebuild the
    page with ``hocr2pdf``, then merge the pages and swap the result over
    ``filename``.

    All intermediate files live in a temporary directory created next to
    ``filename`` and removed on exit, even on failure. The original file is
    only replaced after the merged PDF has been written completely, so a
    failing step leaves the input untouched.

    Args:
        filename: path of the PDF to process; it is overwritten on success.

    Raises:
        subprocess.CalledProcessError: if an external tool exits non-zero.
        OSError: if a file cannot be read or written (including a missing
            hOCR output file).
    """
    def run(cmd, **kwargs):
        # Child output is discarded, matching the rest of the script.
        subprocess.check_call(cmd, stdout=FNULL, stderr=subprocess.STDOUT,
                              **kwargs)

    basename = os.path.splitext(os.path.basename(filename))[0]
    target_dir = os.path.dirname(os.path.abspath(filename))

    # Working next to the target keeps the final os.replace on a single
    # filesystem; absolute paths also stop file names that begin with "-"
    # from being read as options by the external tools.
    with tempfile.TemporaryDirectory(prefix=".ocr-", dir=target_dir) as workdir:
        # 1. Split the document into one PDF per page.
        page_files = []
        with open(filename, 'rb') as src:
            reader = PyPDF2.PdfFileReader(src)
            for n in range(reader.getNumPages()):
                page_path = os.path.join(
                    workdir, "{}-{}.pdf".format(basename, n))
                writer = PyPDF2.PdfFileWriter()
                writer.addPage(reader.getPage(n))
                with open(page_path, 'wb') as out:
                    writer.write(out)
                page_files.append(page_path)

        # 2. OCR each page and rebuild it with the text embedded.
        for page_path in page_files:
            stem = os.path.splitext(page_path)[0]
            image_path = stem + ".jpeg"
            run(["convert", "-density", "300", "-quality", "100",
                 page_path, image_path])
            run(["tesseract", image_path, stem, "hocr"])

            # Tesseract versions differ on the hOCR extension: accept
            # ".hocr", otherwise fall back to ".html" (the name the script
            # originally expected) and let open() report a missing file.
            hocr_path = stem + ".hocr"
            if not os.path.exists(hocr_path):
                hocr_path = stem + ".html"

            with open(hocr_path, 'rb') as hocr:
                run(["hocr2pdf", "-i", image_path, "-o", page_path],
                    stdin=hocr)
            os.remove(image_path)
            os.remove(hocr_path)

        # 3. Merge the pages into a new file, then swap it over the original.
        merged_path = os.path.join(workdir, basename + ".merged.pdf")
        with contextlib.ExitStack() as stack:
            writer = PyPDF2.PdfFileWriter()
            for page_path in page_files:
                # Page files stay open until the merged PDF is written.
                # NOTE(review): PyPDF2 presumably reads page data from the
                # stream at write time - confirm against the PyPDF2 docs.
                handle = stack.enter_context(open(page_path, 'rb'))
                writer.addPage(PyPDF2.PdfFileReader(handle).getPage(0))
            with open(merged_path, 'wb') as out:
                writer.write(out)
        os.replace(merged_path, filename)

    print("Successfully added text to PDF")
def main():
    """Script entry point.

    Parses the command line first (so ``--help`` works even when
    dependencies are missing), then checks dependencies, then processes the
    single PDF path given on the command line.
    """
    args = parse_args()
    init()
    # 'file' is declared with nargs=1, so it is a one-element list.
    process_pdf(args.file[0])
# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment