zopieux/ocr-pdf.py

## ocr-pdf.py
#!/usr/bin/env python3

"""
Example usage:
$ pip install reportlab pillow
$ ./ocr-pdf.py these-lemaire.tiff 'ocr/{p}.json' these-lemaire.pdf
"""

import argparse
import json
from collections import namedtuple
from pathlib import Path

from PIL import Image, ImageSequence
from reportlab.lib import pagesizes
from reportlab.pdfbase.pdfmetrics import stringWidth
from reportlab.pdfgen import canvas

# One OCR text fragment: its string plus the bounding box, expressed in the
# coordinate system of the image that was sent to the OCR service.
TextAnnotation = namedtuple('TextAnnotation', 'content x y width height')


def parse_google_ocr(data, max_height=200):
    """Yield a TextAnnotation for each usable entry of a Google OCR response.

    Only the first element of ``data['responses']`` is read.  The box origin
    is taken from vertex 0 of ``boundingPoly`` and its size from the
    difference between vertex 2 and vertex 0.

    Args:
        data: decoded JSON response (a dict).
        max_height: annotations whose box is taller than this are skipped.
            NOTE(review): presumably this drops the annotation that spans the
            whole page text -- confirm against real responses.

    Yields:
        TextAnnotation(content, x, y, width, height).

    Raises:
        KeyError: (lazily, on iteration) when the response carries no
            ``textAnnotations`` / ``boundingPoly`` / ``description`` key.
    """
    for annotation in data['responses'][0]['textAnnotations']:
        vertices = annotation['boundingPoly']['vertices']
        # The JSON form of a vertex may leave out a coordinate that is 0, so
        # a missing 'x' or 'y' is read as 0 rather than raising KeyError
        # (which would make the caller discard every text of the page).
        x = vertices[0].get('x', 0)
        y = vertices[0].get('y', 0)
        width = vertices[2].get('x', 0) - x
        height = vertices[2].get('y', 0) - y
        if height > max_height:
            continue
        yield TextAnnotation(annotation['description'], x, y, width, height)


if __name__ == '__main__':
    p = argparse.ArgumentParser()
    p.add_argument('-p', '--pages', action='append',
                   help="pages to use (1-indexed)")
    p.add_argument('tiff', type=argparse.FileType('rb'))
    p.add_argument('jsonpattern')
    p.add_argument('output', type=argparse.FileType('wb'))

    args = p.parse_args()

    tiff = Image.open(args.tiff)
    tiff_pages = ImageSequence.Iterator(tiff)

    print(f"Page count: {tiff.n_frames}")

    pages = set(range(tiff.n_frames))
    if args.pages:
        wanted = set()
        for p in args.pages:
            for p in p.split(','):
                if p.isnumeric():
                    wanted.add(int(p) - 1)
                elif p.count('-') == 1:
                    f, t = p.split('-')
                    wanted.update(range(int(f) - 1, int(t)))
        pages &= wanted

    width, height = pagesize = pagesizes.A4
    c = canvas.Canvas(args.output, pagesize=pagesize)

    for page in sorted(pages):
        p = tiff_pages[page]
        print(f"Page {page + 1}: size {p.size}")

        print("\tdrawing background image")
        c.drawInlineImage(p, 0, 0, width=width, height=height,
                          preserveAspectRatio=True)

        try:
            jsonfile = Path(eval("f'{}'".format(args.jsonpattern), {'p': page + 1}))
            with jsonfile.open() as jsondata:
                texts = list(parse_google_ocr(json.load(jsondata)))
        except (FileNotFoundError, KeyError):
            texts = []

        scale = 2 * width / p.size[0]  # ???
        print(f"\tdrawing {len(texts)} texts, scale: {scale}")
        text_writer = c.beginText()
        text_writer.setTextRenderMode(3)
        for text in texts:
            font_size = 9
            while stringWidth(text.content, text_writer._fontname,
                              font_size) < text.width * scale * 0.95:
                font_size += 0.5
            text_writer.setFont(text_writer._fontname, font_size)
            text_writer.setTextOrigin(text.x * scale,
                                      height - text.y * scale - 18)
            text_writer.textOut(text.content)
        c.drawText(text_writer)

        # finish page
        c.showPage()

    c.save()
	#!/usr/bin/env python3

	"""
	Example usage:
	$ pip install reportlab pillow
	$ ./ocr-pdf.py these-lemaire.tiff 'ocr/{p}.json' these-lemaire.pdf
	"""

	import argparse
	import json
	from collections import namedtuple
	from pathlib import Path

	from PIL import Image, ImageSequence
	from reportlab.lib import pagesizes
	from reportlab.pdfbase.pdfmetrics import stringWidth
	from reportlab.pdfgen import canvas

	# One OCR text fragment: its string plus the bounding box, expressed in the
	# coordinate system of the image that was sent to the OCR service.
	TextAnnotation = namedtuple('TextAnnotation', 'content x y width height')


	def parse_google_ocr(data, max_height=200):
	    """Yield a TextAnnotation for each usable entry of a Google OCR response.

	    Only the first element of ``data['responses']`` is read.  The box origin
	    is taken from vertex 0 of ``boundingPoly`` and its size from the
	    difference between vertex 2 and vertex 0.

	    Args:
	        data: decoded JSON response (a dict).
	        max_height: annotations whose box is taller than this are skipped.
	            NOTE(review): presumably this drops the annotation that spans
	            the whole page text -- confirm against real responses.

	    Yields:
	        TextAnnotation(content, x, y, width, height).

	    Raises:
	        KeyError: (lazily, on iteration) when the response carries no
	            ``textAnnotations`` / ``boundingPoly`` / ``description`` key.
	    """
	    for annotation in data['responses'][0]['textAnnotations']:
	        vertices = annotation['boundingPoly']['vertices']
	        # The JSON form of a vertex may leave out a coordinate that is 0,
	        # so a missing 'x' or 'y' is read as 0 rather than raising KeyError
	        # (which would make the caller discard every text of the page).
	        x = vertices[0].get('x', 0)
	        y = vertices[0].get('y', 0)
	        width = vertices[2].get('x', 0) - x
	        height = vertices[2].get('y', 0) - y
	        if height > max_height:
	            continue
	        yield TextAnnotation(annotation['description'], x, y, width, height)


	if __name__ == '__main__':
	p = argparse.ArgumentParser()
	p.add_argument('-p', '--pages', action='append',
	help="pages to use (1-indexed)")
	p.add_argument('tiff', type=argparse.FileType('rb'))
	p.add_argument('jsonpattern')
	p.add_argument('output', type=argparse.FileType('wb'))

	args = p.parse_args()

	tiff = Image.open(args.tiff)
	tiff_pages = ImageSequence.Iterator(tiff)

	print(f"Page count: {tiff.n_frames}")

	pages = set(range(tiff.n_frames))
	if args.pages:
	wanted = set()
	for p in args.pages:
	for p in p.split(','):
	if p.isnumeric():
	wanted.add(int(p) - 1)
	elif p.count('-') == 1:
	f, t = p.split('-')
	wanted.update(range(int(f) - 1, int(t)))
	pages &= wanted

	width, height = pagesize = pagesizes.A4
	c = canvas.Canvas(args.output, pagesize=pagesize)

	for page in sorted(pages):
	p = tiff_pages[page]
	print(f"Page {page + 1}: size {p.size}")

	print("\tdrawing background image")
	c.drawInlineImage(p, 0, 0, width=width, height=height,
	preserveAspectRatio=True)

	try:
	jsonfile = Path(eval("f'{}'".format(args.jsonpattern), {'p': page + 1}))
	with jsonfile.open() as jsondata:
	texts = list(parse_google_ocr(json.load(jsondata)))
	except (FileNotFoundError, KeyError):
	texts = []

	scale = 2 * width / p.size[0] # ???
	print(f"\tdrawing {len(texts)} texts, scale: {scale}")
	text_writer = c.beginText()
	text_writer.setTextRenderMode(3)
	for text in texts:
	font_size = 9
	while stringWidth(text.content, text_writer._fontname,
	font_size) < text.width * scale * 0.95:
	font_size += 0.5
	text_writer.setFont(text_writer._fontname, font_size)
	text_writer.setTextOrigin(text.x * scale,
	height - text.y * scale - 18)
	text_writer.textOut(text.content)
	c.drawText(text_writer)

	# finish page
	c.showPage()

	c.save()