patrickmclaren/pocr.py

## pocr.py
#!/usr/bin/python

import subprocess
import os
import sys

import argparse
from argparse import RawTextHelpFormatter

# Write-only handle on the null device, meant as a stdout sink for
# subprocess calls whose output should be discarded. It is opened at import
# time and is never closed explicitly.
FNULL = open(os.devnull, 'w')

def init():
    """Verify external dependencies and load the required Python modules.

    Every required executable is looked up on ``PATH`` and every required
    library is imported and bound as a module-level global, so the rest of
    the script can refer to it by name (e.g. ``PyPDF2``).  All missing
    dependencies are reported on stdout before giving up, so the user sees
    the complete list in a single run.

    Raises:
        SystemExit: with exit status -1 if any executable or library is
            missing.
    """
    # Function-scope import keeps this block self-contained.
    import shutil

    # "convert" (ImageMagick) is invoked by process_pdf() as well, so it
    # must be verified here together with tesseract and hocr2pdf.
    reqs = ["tesseract", "hocr2pdf", "convert"]
    libs = ["PyPDF2"]

    dependencies_satisfied = True

    for ex in reqs:
        # shutil.which() searches PATH in-process, so the check itself does
        # not depend on an external `which` binary being installed.
        if shutil.which(ex) is None:
            dependencies_satisfied = False
            print("Cannot find executable \t'{}'".format(ex))

    for lib in libs:
        try:
            mod = __import__(lib)
        except ImportError:
            dependencies_satisfied = False
            print("Cannot find library \t'{}'".format(lib))
        else:
            # Publish the module under its own name in this module's
            # namespace.
            globals()[lib] = mod

    if not dependencies_satisfied:
        sys.exit(-1)

def parse_args():
    """Read the command line and return the parsed argument namespace.

    The only argument is the positional FILE.  Because it is declared with
    ``nargs=1``, the value is exposed as a one-element list in ``.file``.
    """
    parser = argparse.ArgumentParser(
        description="Add OCR to scanned documents in PDF format.",
        formatter_class=RawTextHelpFormatter,
    )
    parser.add_argument(
        'file',
        metavar='FILE',
        nargs=1,
        help="PDF file to scan",
    )
    return parser.parse_args()

def process_pdf(filename):
    """Add an OCR text layer to every page of the PDF at *filename*, in place.

    The document is split into single-page PDFs; each page is rasterised
    with ``convert``, recognised with ``tesseract`` and rebuilt as a
    searchable page with ``hocr2pdf``.  The pages are then merged and the
    result replaces *filename*.

    All intermediate files live in a temporary directory created next to
    *filename*, and the original is replaced only after the merged document
    has been written completely.  A failure part-way through therefore
    leaves the original untouched and no stray files behind.

    Expects ``PyPDF2`` to be available as a module-level global.

    Args:
        filename: Path of the PDF to process; overwritten on success.

    Raises:
        subprocess.CalledProcessError: if an external tool exits with a
            non-zero status.
        OSError: if a file cannot be read, written or replaced, or an
            external tool cannot be started.
    """
    import tempfile

    # Work next to the target so the final os.replace() never has to cross
    # a filesystem boundary.
    target_dir = os.path.dirname(os.path.abspath(filename))

    with tempfile.TemporaryDirectory(prefix="pocr-", dir=target_dir) as workdir:
        page_files = _split_pages(filename, workdir)

        for page_file in page_files:
            _ocr_page(page_file)

        merged = os.path.join(workdir, "merged.pdf")
        _merge_pages(page_files, merged)

        # Swap the finished document in only now that every step succeeded.
        os.replace(merged, filename)

    print("Successfully added text to PDF")


def _run_quiet(command, stdin=None):
    """Run *command* with its output discarded; raise if it exits non-zero."""
    subprocess.check_call(command, stdin=stdin,
                          stdout=subprocess.DEVNULL,
                          stderr=subprocess.STDOUT)


def _split_pages(filename, workdir):
    """Write each page of *filename* to its own PDF in *workdir*.

    Returns the list of page file paths in page order.
    """
    basename = os.path.splitext(os.path.basename(filename))[0]
    page_files = []

    with open(filename, 'rb') as f:
        reader = PyPDF2.PdfFileReader(f)

        for n in range(reader.getNumPages()):
            page_file = os.path.join(workdir, "{}-{}.pdf".format(basename, n))

            writer = PyPDF2.PdfFileWriter()
            writer.addPage(reader.getPage(n))
            with open(page_file, 'wb') as g:
                writer.write(g)

            page_files.append(page_file)

    return page_files


def _ocr_page(page_file):
    """Replace the single-page PDF *page_file* with a searchable version."""
    stem = os.path.splitext(page_file)[0]
    image_name = "{}.jpeg".format(stem)

    _run_quiet(["convert", "-density", "300", "-quality", "100",
                page_file, image_name])
    _run_quiet(["tesseract", image_name, stem, "hocr"])

    # NOTE(review): newer tesseract releases appear to write "<stem>.hocr"
    # rather than "<stem>.html"; accept whichever one was produced.
    hocr_name = "{}.hocr".format(stem)
    if not os.path.exists(hocr_name):
        hocr_name = "{}.html".format(stem)

    # hocr2pdf reads the hOCR markup from stdin and overwrites the page PDF.
    with open(hocr_name, 'rb') as h:
        _run_quiet(["hocr2pdf", "-i", image_name, "-o", page_file], stdin=h)


def _merge_pages(page_files, output):
    """Concatenate the first page of every file in *page_files* into *output*."""
    import io

    writer = PyPDF2.PdfFileWriter()

    for page_file in page_files:
        with open(page_file, 'rb') as tmp:
            # The writer needs the reader's stream to still be readable when
            # write() runs, so buffer the page in memory; the file can then
            # be closed and the document written exactly once below.
            buffered = io.BytesIO(tmp.read())

        writer.addPage(PyPDF2.PdfFileReader(buffered).getPage(0))

    with open(output, 'wb') as f:
        writer.write(f)

def main():
    """Script entry point: parse the CLI, check dependencies, OCR the PDF."""
    cli = parse_args()

    # Dependency check happens after argument parsing, so `--help` and usage
    # errors are handled before anything else is probed.
    init()

    # FILE is declared with nargs=1, hence the one-element list.
    pdf_path = cli.file[0]
    process_pdf(pdf_path)

# Run the tool only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
	#!/usr/bin/python

	import subprocess
	import os
	import sys

	import argparse
	from argparse import RawTextHelpFormatter

	# Write-only handle on the null device, meant as a stdout sink for
	# subprocess calls whose output should be discarded. It is opened at
	# import time and is never closed explicitly.
	FNULL = open(os.devnull, 'w')

	def init():
	    """Verify external dependencies and load the required Python modules.

	    Every required executable is looked up on ``PATH`` and every required
	    library is imported and bound as a module-level global, so the rest
	    of the script can refer to it by name (e.g. ``PyPDF2``).  All missing
	    dependencies are reported on stdout before giving up, so the user
	    sees the complete list in a single run.

	    Raises:
	        SystemExit: with exit status -1 if any executable or library is
	            missing.
	    """
	    # Function-scope import keeps this block self-contained.
	    import shutil

	    # "convert" (ImageMagick) is invoked by process_pdf() as well, so it
	    # must be verified here together with tesseract and hocr2pdf.
	    reqs = ["tesseract", "hocr2pdf", "convert"]
	    libs = ["PyPDF2"]

	    dependencies_satisfied = True

	    for ex in reqs:
	        # shutil.which() searches PATH in-process, so the check itself
	        # does not depend on an external `which` binary being installed.
	        if shutil.which(ex) is None:
	            dependencies_satisfied = False
	            print("Cannot find executable \t'{}'".format(ex))

	    for lib in libs:
	        try:
	            mod = __import__(lib)
	        except ImportError:
	            dependencies_satisfied = False
	            print("Cannot find library \t'{}'".format(lib))
	        else:
	            # Publish the module under its own name in this module's
	            # namespace.
	            globals()[lib] = mod

	    if not dependencies_satisfied:
	        sys.exit(-1)

	def parse_args():
	    """Read the command line and return the parsed argument namespace.

	    The only argument is the positional FILE.  Because it is declared
	    with ``nargs=1``, the value is exposed as a one-element list in
	    ``.file``.
	    """
	    overview = "Add OCR to scanned documents in PDF format."
	    parser = argparse.ArgumentParser(description=overview,
	                                     formatter_class=RawTextHelpFormatter)

	    file_help = "PDF file to scan"
	    parser.add_argument('file', metavar='FILE', nargs=1, help=file_help)

	    return parser.parse_args()

	def process_pdf(filename):
	    """Add an OCR text layer to every page of the PDF at *filename*, in place.

	    The document is split into single-page PDFs; each page is rasterised
	    with ``convert``, recognised with ``tesseract`` and rebuilt as a
	    searchable page with ``hocr2pdf``.  The pages are then merged and the
	    result replaces *filename*.

	    All intermediate files live in a temporary directory created next to
	    *filename*, and the original is replaced only after the merged
	    document has been written completely.  A failure part-way through
	    therefore leaves the original untouched and no stray files behind.

	    Expects ``PyPDF2`` to be available as a module-level global.

	    Args:
	        filename: Path of the PDF to process; overwritten on success.

	    Raises:
	        subprocess.CalledProcessError: if an external tool exits with a
	            non-zero status.
	        OSError: if a file cannot be read, written or replaced, or an
	            external tool cannot be started.
	    """
	    import tempfile

	    # Work next to the target so the final os.replace() never has to
	    # cross a filesystem boundary.
	    target_dir = os.path.dirname(os.path.abspath(filename))

	    with tempfile.TemporaryDirectory(prefix="pocr-", dir=target_dir) as workdir:
	        page_files = _split_pages(filename, workdir)

	        for page_file in page_files:
	            _ocr_page(page_file)

	        merged = os.path.join(workdir, "merged.pdf")
	        _merge_pages(page_files, merged)

	        # Swap the finished document in only now that every step
	        # succeeded.
	        os.replace(merged, filename)

	    print("Successfully added text to PDF")


	def _run_quiet(command, stdin=None):
	    """Run *command* with its output discarded; raise if it exits non-zero."""
	    subprocess.check_call(command, stdin=stdin,
	                          stdout=subprocess.DEVNULL,
	                          stderr=subprocess.STDOUT)


	def _split_pages(filename, workdir):
	    """Write each page of *filename* to its own PDF in *workdir*.

	    Returns the list of page file paths in page order.
	    """
	    basename = os.path.splitext(os.path.basename(filename))[0]
	    page_files = []

	    with open(filename, 'rb') as f:
	        reader = PyPDF2.PdfFileReader(f)

	        for n in range(reader.getNumPages()):
	            page_file = os.path.join(workdir, "{}-{}.pdf".format(basename, n))

	            writer = PyPDF2.PdfFileWriter()
	            writer.addPage(reader.getPage(n))
	            with open(page_file, 'wb') as g:
	                writer.write(g)

	            page_files.append(page_file)

	    return page_files


	def _ocr_page(page_file):
	    """Replace the single-page PDF *page_file* with a searchable version."""
	    stem = os.path.splitext(page_file)[0]
	    image_name = "{}.jpeg".format(stem)

	    _run_quiet(["convert", "-density", "300", "-quality", "100",
	                page_file, image_name])
	    _run_quiet(["tesseract", image_name, stem, "hocr"])

	    # NOTE(review): newer tesseract releases appear to write "<stem>.hocr"
	    # rather than "<stem>.html"; accept whichever one was produced.
	    hocr_name = "{}.hocr".format(stem)
	    if not os.path.exists(hocr_name):
	        hocr_name = "{}.html".format(stem)

	    # hocr2pdf reads the hOCR markup from stdin and overwrites the page
	    # PDF.
	    with open(hocr_name, 'rb') as h:
	        _run_quiet(["hocr2pdf", "-i", image_name, "-o", page_file], stdin=h)


	def _merge_pages(page_files, output):
	    """Concatenate the first page of every file in *page_files* into *output*."""
	    import io

	    writer = PyPDF2.PdfFileWriter()

	    for page_file in page_files:
	        with open(page_file, 'rb') as tmp:
	            # The writer needs the reader's stream to still be readable
	            # when write() runs, so buffer the page in memory; the file
	            # can then be closed and the document written exactly once.
	            buffered = io.BytesIO(tmp.read())

	        writer.addPage(PyPDF2.PdfFileReader(buffered).getPage(0))

	    with open(output, 'wb') as f:
	        writer.write(f)

	def main():
	    """Script entry point: parse the CLI, check dependencies, OCR the PDF."""
	    args = parse_args()

	    # Dependency check happens after argument parsing, so `--help` and
	    # usage errors are handled before anything else is probed.
	    init()

	    # FILE is declared with nargs=1, hence the one-element list.
	    process_pdf(args.file[0])

	# Run the tool only when this file is executed as a script, not on import.
	if __name__ == "__main__":
	    main()