oiva/pdf.py

## pdf.py
"""
Find scanned images or PDFs in directory, run Tesseract OCR on them.
Generate PDF, import the text content into the PDF's metadata.
"""

#!/usr/bin/env python
import codecs
import os
import re
from distutils.spawn import find_executable

# Tesseract language code, passed to ``tesseract -l``.
LANG = "fin"
# Only files in the working directory with exactly this extension are handled.
INPUT_EXTENSION = ".pdf" # tested with .png and .pdf
# Inserted between the input's base name and ".pdf" to form the output name.
OUTPUT_APPENDIX = "_ocr"
# Density passed to ImageMagick ``convert -density`` when an image input is
# turned into a PDF.
OUTPUT_DPI = 300
# When False, no OCR is run: only inputs that already have a matching .txt
# file next to them get that text imported as metadata.
RUN_OCR = False

def handle_file(filepath):
  """Produce a metadata-filled PDF for the given input file.

  The result is written next to the input as ``<name><OUTPUT_APPENDIX>.pdf``.
  Inputs that are themselves the output of an earlier run, and inputs whose
  output exists already, are skipped. After processing, the temporary text
  and intermediate files are removed, and the original input is deleted if
  the output file exists.

  Args:
    filepath: path of the scanned image or PDF to process.

  Returns:
    None.

  Raises:
    OSError: if the output exists but the original cannot be removed.
  """
  # Derive the name parts from the argument itself rather than from
  # module-level state, so the function works for any path it is given.
  filename, extension = os.path.splitext(filepath)

  output = "%s%s.pdf" % (filename, OUTPUT_APPENDIX)
  pdf = filename + ".pdf"
  png = filename + ".png"
  txt = filename + ".txt"

  # skip files that are the output of an earlier run
  if (OUTPUT_APPENDIX + extension) in filepath:
    return

  # skip files whose output exists already
  if os.path.exists(output):
    return

  if RUN_OCR:
    run_ocr(filepath)
  import_metadata(filepath)

  # Remove temporary files. The intermediate file is a PNG for PDF input and
  # a PDF for image input; either may be absent, which is fine.
  intermediate = png if extension == ".pdf" else pdf
  for temporary in (txt, intermediate):
    try:
      os.remove(temporary)
    except OSError:
      pass

  # remove the original file, but only once the OCR'd version exists
  if os.path.exists(output):
    os.remove(filepath)


def run_ocr(filepath):
  """Run Tesseract OCR on a PDF or image, producing ``<name>.txt``.

  A PDF is first rendered page by page to PNG with Ghostscript and the pages
  are stacked into a single ``<name>.png`` with ImageMagick; any other input
  is handed to Tesseract directly. Nothing is done if the text file exists
  already.

  Args:
    filepath: path of the PDF or image to recognise.

  Returns:
    None.
  """
  # Local import: external tools are invoked with argument lists so that
  # paths containing spaces or shell metacharacters are passed through as-is.
  import subprocess

  filename, extension = os.path.splitext(filepath)
  txt = filename + ".txt"

  # check if the text file exists already
  if os.path.exists(txt):
    return

  # convert PDF to PNGs for OCR
  if extension == ".pdf":
    png = filename + ".png"
    print("convert %s to %s-*" % (filepath, filename))

    # ghostscript produces one PNG per page, which are then combined into a
    # single image with convert
    subprocess.call([
      "gs", "-sDEVICE=pngalpha",
      "-sOutputFile=%s-%%03d.png" % filename,
      "-dLastPage=100", "-r300", "-dNOPAUSE", "-dBATCH", "-q",
      filepath, "-c", "quit",
    ])

    # Collect the page images ourselves (sorted, so pages stay in order)
    # instead of relying on shell globbing.
    folder = os.path.dirname(filename)
    prefix = os.path.basename(filename) + "-"
    pages = sorted(
      os.path.join(folder, name)
      for name in os.listdir(folder or ".")
      if name.startswith(prefix) and name.endswith(".png")
    )
    if pages:
      subprocess.call(["convert", "-append"] + pages + [png])
      for page in pages:
        os.remove(page)
    ocr_input = png
  else:
    ocr_input = filepath

  # run OCR on the (possibly converted) input, result goes to text file
  print("OCR %s to %s" % (ocr_input, txt))
  # .txt is added to output filename by tesseract
  subprocess.call(["tesseract", "-l", LANG, ocr_input, filename])


def import_metadata(filepath):
  """Import OCR text into the metadata of a PDF.

  Reads ``<name>.txt`` next to the input, generates ``<name>.pdf`` from the
  input if it is not a PDF already, rewrites the text file as a pdfmark
  ``/Keywords`` entry and lets Ghostscript write
  ``<name><OUTPUT_APPENDIX>.pdf`` with that metadata applied. Nothing is done
  if the text file cannot be opened.

  Args:
    filepath: path of the PDF or image whose OCR text should be imported.

  Returns:
    None.
  """
  # Local import: external tools are invoked with argument lists so that
  # paths containing spaces or shell metacharacters are passed through as-is.
  import subprocess

  filename, extension = os.path.splitext(filepath)
  txt = filename + ".txt"
  pdf = filename + ".pdf"

  # "r+" makes sure the file is writable too, since it is rewritten below
  try:
    with open(txt, "r+") as text_file:
      ocr_text = text_file.read()
  except IOError:
    # text file not found
    return

  output = filename + "%s.pdf" % OUTPUT_APPENDIX

  # if input is image, generate PDF
  if extension != ".pdf":
    print("convert %s to %s" % (filepath, pdf))
    subprocess.call(["convert", "-density", str(OUTPUT_DPI), filepath, pdf])

  # Turn the OCR result into a single PDF metadata value: escape commas,
  # drop newlines and empty lines, and join the remaining lines with ", ".
  ocr_text = ocr_text.replace(',', '\\,')
  lines = [line.strip() for line in re.split(r"[\n\r]+", ocr_text)]
  joined = ", ".join(line for line in lines if line)

  # PDF metadata wants either ascii or UTF-16-BE
  meta = "[ /Keywords %s\n  /DOCINFO pdfmark" % to_unicode(joined)

  # the text file is reused as the pdfmark input for ghostscript
  with open(txt, "w") as text_file:
    text_file.write(meta)

  # use ghostscript to import metadata from text file to PDF
  print("update PDF metadata with %s" % txt)
  subprocess.call([
    "gs", "-q", "-dBATCH", "-dNOPAUSE", "-sDEVICE=pdfwrite",
    "-sOutputFile=%s" % output, pdf, txt,
  ])


def to_unicode(string):
  """Convert text into a PDF string usable in pdfmark metadata.

  Args:
    string: UTF-8 encoded byte string, or already decoded text.

  Returns:
    For pure ASCII text, a literal string ``(...)`` with backslashes,
    parentheses and tabs escaped. Otherwise a hex string ``<FEFF...>``
    holding the text as UTF-16-BE with a byte order mark.

  Raises:
    UnicodeDecodeError: if a byte string is not valid UTF-8.
  """
  # Only byte strings need decoding; text is used as it is.
  if isinstance(string, bytes):
    string = string.decode('utf_8')

  try:
    string.encode('ascii')
  except UnicodeEncodeError:
    encoded = codecs.BOM_UTF16_BE + string.encode('utf_16_be')
    # bytearray yields integers on both Python 2 and Python 3
    hex_digits = ''.join('{:02X}'.format(octet) for octet in bytearray(encoded))
    return '<{}>'.format(hex_digits)

  # escape characters that are used as PDF string control characters;
  # the backslash has to come first so later escapes are not doubled
  for plain, escaped in [('\\', '\\\\'), ('(', '\\('), (')', '\\)'), ('\t', '\\t')]:
    string = string.replace(plain, escaped)
  return '({})'.format(string)


# check if Tesseract, Imagemagick and Ghostscript exist
TESSERACT = find_executable("tesseract")
CONVERT = find_executable("convert")
GS = find_executable("gs")

# The messages are built from adjacent string literals so that no source
# indentation leaks into the printed text. SystemExit is raised directly
# because the ``exit`` helper is only available when ``site`` is loaded.
if TESSERACT is None:
  print("This script requires Tesseract, aborting. Install Tesseract with:\n"
        "brew install tesseract --all-languages")
  raise SystemExit(1)

if CONVERT is None:
  print("This script requires Imagemagick, aborting. Install Imagemagick "
        "with:\nbrew install imagemagick")
  raise SystemExit(1)

if GS is None:
  print("This script requires Ghostscript, aborting. Install Ghostscript "
        "with:\nbrew install gs")
  raise SystemExit(1)

print("Looking for %ss..." % INPUT_EXTENSION)
for f in os.listdir("."):
  if not os.path.isfile(f):
    continue

  # Bound at module scope on purpose: code elsewhere in this file may read
  # ``filename`` and ``extension`` as globals.
  filename, extension = os.path.splitext(f)
  if extension != INPUT_EXTENSION:
    continue

  handle_file(f)

raise SystemExit(0)
	"""
	Find scanned images or PDFs in directory, run Tesseract OCR on them.
	Generate PDF, import the text content into the PDF's metadata.
	"""

	#!/usr/bin/env python
	import codecs
	import os
	import re
	from distutils.spawn import find_executable

	# Tesseract language code, passed to ``tesseract -l``.
	LANG = "fin"
	# Only files in the working directory with exactly this extension are handled.
	INPUT_EXTENSION = ".pdf" # tested with .png and .pdf
	# Inserted between the input's base name and ".pdf" to form the output name.
	OUTPUT_APPENDIX = "_ocr"
	# Density passed to ImageMagick ``convert -density`` when an image input is
	# turned into a PDF.
	OUTPUT_DPI = 300
	# When False, no OCR is run: only inputs that already have a matching .txt
	# file next to them get that text imported as metadata.
	RUN_OCR = False

	def handle_file(filepath):
	  """Produce a metadata-filled PDF for the given input file.

	  The result is written next to the input as ``<name><OUTPUT_APPENDIX>.pdf``.
	  Inputs that are themselves the output of an earlier run, and inputs whose
	  output exists already, are skipped. After processing, the temporary text
	  and intermediate files are removed, and the original input is deleted if
	  the output file exists.

	  Args:
	    filepath: path of the scanned image or PDF to process.

	  Returns:
	    None.

	  Raises:
	    OSError: if the output exists but the original cannot be removed.
	  """
	  # Derive the name parts from the argument itself rather than from
	  # module-level state, so the function works for any path it is given.
	  filename, extension = os.path.splitext(filepath)

	  output = "%s%s.pdf" % (filename, OUTPUT_APPENDIX)
	  pdf = filename + ".pdf"
	  png = filename + ".png"
	  txt = filename + ".txt"

	  # skip files that are the output of an earlier run
	  if (OUTPUT_APPENDIX + extension) in filepath:
	    return

	  # skip files whose output exists already
	  if os.path.exists(output):
	    return

	  if RUN_OCR:
	    run_ocr(filepath)
	  import_metadata(filepath)

	  # Remove temporary files. The intermediate file is a PNG for PDF input and
	  # a PDF for image input; either may be absent, which is fine.
	  intermediate = png if extension == ".pdf" else pdf
	  for temporary in (txt, intermediate):
	    try:
	      os.remove(temporary)
	    except OSError:
	      pass

	  # remove the original file, but only once the OCR'd version exists
	  if os.path.exists(output):
	    os.remove(filepath)


	def run_ocr(filepath):
	  """Run Tesseract OCR on a PDF or image, producing ``<name>.txt``.

	  A PDF is first rendered page by page to PNG with Ghostscript and the pages
	  are stacked into a single ``<name>.png`` with ImageMagick; any other input
	  is handed to Tesseract directly. Nothing is done if the text file exists
	  already.

	  Args:
	    filepath: path of the PDF or image to recognise.

	  Returns:
	    None.
	  """
	  # Local import: external tools are invoked with argument lists so that
	  # paths containing spaces or shell metacharacters are passed through as-is.
	  import subprocess

	  filename, extension = os.path.splitext(filepath)
	  txt = filename + ".txt"

	  # check if the text file exists already
	  if os.path.exists(txt):
	    return

	  # convert PDF to PNGs for OCR
	  if extension == ".pdf":
	    png = filename + ".png"
	    print("convert %s to %s-*" % (filepath, filename))

	    # ghostscript produces one PNG per page, which are then combined into a
	    # single image with convert
	    subprocess.call([
	      "gs", "-sDEVICE=pngalpha",
	      "-sOutputFile=%s-%%03d.png" % filename,
	      "-dLastPage=100", "-r300", "-dNOPAUSE", "-dBATCH", "-q",
	      filepath, "-c", "quit",
	    ])

	    # Collect the page images ourselves (sorted, so pages stay in order)
	    # instead of relying on shell globbing.
	    folder = os.path.dirname(filename)
	    prefix = os.path.basename(filename) + "-"
	    pages = sorted(
	      os.path.join(folder, name)
	      for name in os.listdir(folder or ".")
	      if name.startswith(prefix) and name.endswith(".png")
	    )
	    if pages:
	      subprocess.call(["convert", "-append"] + pages + [png])
	      for page in pages:
	        os.remove(page)
	    ocr_input = png
	  else:
	    ocr_input = filepath

	  # run OCR on the (possibly converted) input, result goes to text file
	  print("OCR %s to %s" % (ocr_input, txt))
	  # .txt is added to output filename by tesseract
	  subprocess.call(["tesseract", "-l", LANG, ocr_input, filename])


	def import_metadata(filepath):
	  """Import OCR text into the metadata of a PDF.

	  Reads ``<name>.txt`` next to the input, generates ``<name>.pdf`` from the
	  input if it is not a PDF already, rewrites the text file as a pdfmark
	  ``/Keywords`` entry and lets Ghostscript write
	  ``<name><OUTPUT_APPENDIX>.pdf`` with that metadata applied. Nothing is done
	  if the text file cannot be opened.

	  Args:
	    filepath: path of the PDF or image whose OCR text should be imported.

	  Returns:
	    None.
	  """
	  # Local import: external tools are invoked with argument lists so that
	  # paths containing spaces or shell metacharacters are passed through as-is.
	  import subprocess

	  filename, extension = os.path.splitext(filepath)
	  txt = filename + ".txt"
	  pdf = filename + ".pdf"

	  # "r+" makes sure the file is writable too, since it is rewritten below
	  try:
	    with open(txt, "r+") as text_file:
	      ocr_text = text_file.read()
	  except IOError:
	    # text file not found
	    return

	  output = filename + "%s.pdf" % OUTPUT_APPENDIX

	  # if input is image, generate PDF
	  if extension != ".pdf":
	    print("convert %s to %s" % (filepath, pdf))
	    subprocess.call(["convert", "-density", str(OUTPUT_DPI), filepath, pdf])

	  # Turn the OCR result into a single PDF metadata value: escape commas,
	  # drop newlines and empty lines, and join the remaining lines with ", ".
	  ocr_text = ocr_text.replace(',', '\\,')
	  lines = [line.strip() for line in re.split(r"[\n\r]+", ocr_text)]
	  joined = ", ".join(line for line in lines if line)

	  # PDF metadata wants either ascii or UTF-16-BE
	  meta = "[ /Keywords %s\n  /DOCINFO pdfmark" % to_unicode(joined)

	  # the text file is reused as the pdfmark input for ghostscript
	  with open(txt, "w") as text_file:
	    text_file.write(meta)

	  # use ghostscript to import metadata from text file to PDF
	  print("update PDF metadata with %s" % txt)
	  subprocess.call([
	    "gs", "-q", "-dBATCH", "-dNOPAUSE", "-sDEVICE=pdfwrite",
	    "-sOutputFile=%s" % output, pdf, txt,
	  ])


	def to_unicode(string):
	  """Convert text into a PDF string usable in pdfmark metadata.

	  Args:
	    string: UTF-8 encoded byte string, or already decoded text.

	  Returns:
	    For pure ASCII text, a literal string ``(...)`` with backslashes,
	    parentheses and tabs escaped. Otherwise a hex string ``<FEFF...>``
	    holding the text as UTF-16-BE with a byte order mark.

	  Raises:
	    UnicodeDecodeError: if a byte string is not valid UTF-8.
	  """
	  # Only byte strings need decoding; text is used as it is.
	  if isinstance(string, bytes):
	    string = string.decode('utf_8')

	  try:
	    string.encode('ascii')
	  except UnicodeEncodeError:
	    encoded = codecs.BOM_UTF16_BE + string.encode('utf_16_be')
	    # bytearray yields integers on both Python 2 and Python 3
	    hex_digits = ''.join('{:02X}'.format(octet) for octet in bytearray(encoded))
	    return '<{}>'.format(hex_digits)

	  # escape characters that are used as PDF string control characters;
	  # the backslash has to come first so later escapes are not doubled
	  for plain, escaped in [('\\', '\\\\'), ('(', '\\('), (')', '\\)'), ('\t', '\\t')]:
	    string = string.replace(plain, escaped)
	  return '({})'.format(string)


	# check if Tesseract, Imagemagick and Ghostscript exist
	TESSERACT = find_executable("tesseract")
	CONVERT = find_executable("convert")
	GS = find_executable("gs")

	# The messages are built from adjacent string literals so that no source
	# indentation leaks into the printed text. SystemExit is raised directly
	# because the ``exit`` helper is only available when ``site`` is loaded.
	if TESSERACT is None:
	  print("This script requires Tesseract, aborting. Install Tesseract with:\n"
	        "brew install tesseract --all-languages")
	  raise SystemExit(1)

	if CONVERT is None:
	  print("This script requires Imagemagick, aborting. Install Imagemagick "
	        "with:\nbrew install imagemagick")
	  raise SystemExit(1)

	if GS is None:
	  print("This script requires Ghostscript, aborting. Install Ghostscript "
	        "with:\nbrew install gs")
	  raise SystemExit(1)

	print("Looking for %ss..." % INPUT_EXTENSION)
	for f in os.listdir("."):
	  if not os.path.isfile(f):
	    continue

	  # Bound at module scope on purpose: code elsewhere in this file may read
	  # ``filename`` and ``extension`` as globals.
	  filename, extension = os.path.splitext(f)
	  if extension != INPUT_EXTENSION:
	    continue

	  handle_file(f)

	raise SystemExit(0)