Skip to content

Instantly share code, notes, and snippets.

Last active April 7, 2018 17:41
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save zopieux/71707bc63b5e9341e8db3629570287d7 to your computer and use it in GitHub Desktop.
Save zopieux/71707bc63b5e9341e8db3629570287d7 to your computer and use it in GitHub Desktop.
Google OCR to PDF invisible overlay
#!/usr/bin/env python3
"""
Example usage:

    $ pip install reportlab pillow
    $ ./<this-script>.py these-lemaire.tiff 'ocr/{p}.json' these-lemaire.pdf
"""
import argparse
import json
from collections import namedtuple
from pathlib import Path
from PIL import Image, ImageSequence
from reportlab.lib import pagesizes
from reportlab.pdfbase.pdfmetrics import stringWidth
from reportlab.pdfgen import canvas
# One recognised piece of text and its bounding box, expressed in the
# coordinate system of the OCR response.
TextAnnotation = namedtuple('TextAnnotation', 'content x y width height')


def parse_google_ocr(data):
    """Yield a TextAnnotation for each selected entry of a Google OCR response.

    Args:
        data: The decoded JSON response. The annotations are read from
            ``data['responses'][0]['textAnnotations']``.

    Yields:
        TextAnnotation: ``content`` is the entry's ``description``; ``x`` and
        ``y`` come from the first vertex of its ``boundingPoly``; ``width``
        and ``height`` are the offsets from that vertex to the third vertex.

    Raises:
        KeyError: If ``responses``, ``textAnnotations``, ``boundingPoly``,
            ``vertices`` or ``description`` is missing.
        IndexError: If ``responses`` is empty or a polygon has fewer than
            three vertices.
    """
    for annotation in data['responses'][0]['textAnnotations']:
        vertices = annotation['boundingPoly']['vertices']
        origin, opposite = vertices[0], vertices[2]

        # A vertex may lack 'x' or 'y' when the value is 0 (zero-valued
        # fields can be omitted from the JSON encoding), so default to 0
        # instead of raising KeyError and losing the whole page.
        x, y = origin.get('x', 0), origin.get('y', 0)
        width = opposite.get('x', 0) - x
        height = opposite.get('y', 0) - y

        # NOTE(review): as written, only boxes taller than 200 are emitted.
        # If the intent was to *skip* oversized boxes (e.g. an annotation
        # spanning the whole page), this test is inverted -- confirm.
        if height > 200:
            yield TextAnnotation(annotation['description'], x, y, width, height)
# Script entry point: reads a multi-frame TIFF, draws selected frames as
# full-page A4 backgrounds on a reportlab canvas written to `output`, and
# positions the OCR texts loaded from a per-page JSON file on top of them.
# NOTE(review): this copy has lost its indentation and several tokens/lines;
# the individual gaps are flagged below and must be restored before it runs.
if __name__ == '__main__':
p = argparse.ArgumentParser()
p.add_argument('-p', '--pages', action='append',
help="pages to use (1-indexed)")
p.add_argument('tiff', type=argparse.FileType('rb'))
p.add_argument('output', type=argparse.FileType('wb'))
# NOTE(review): `args.jsonpattern` is read further down but no 'jsonpattern'
# argument is declared here -- presumably a positional was lost; confirm.
args = p.parse_args()
# NOTE(review): the right-hand side of this assignment is missing in this
# copy; presumably the image opened from args.tiff -- confirm.
tiff =
tiff_pages = ImageSequence.Iterator(tiff)
print(f"Page count: {tiff.n_frames}")
# Start from every frame index (0-indexed), then narrow to the requested ones.
pages = set(range(tiff.n_frames))
if args.pages:
wanted = set()
# Each -p value is a comma-separated list of 1-indexed page numbers and/or
# inclusive "first-last" ranges; both are converted to 0-indexed frame numbers.
# Tokens matching neither form are silently ignored.
# NOTE: the loop variable `p` shadows the ArgumentParser bound to `p` above.
for p in args.pages:
for p in p.split(','):
if p.isnumeric():
wanted.add(int(p) - 1)
elif p.count('-') == 1:
f, t = p.split('-')
wanted.update(range(int(f) - 1, int(t)))
# Keep only requested pages that actually exist in the TIFF.
pages &= wanted
width, height = pagesize = pagesizes.A4
c = canvas.Canvas(args.output, pagesize=pagesize)
for page in sorted(pages):
p = tiff_pages[page]
print(f"Page {page + 1}: size {p.size}")
print("\tdrawing background image")
# The frame is stretched to the full page, anchored at the canvas origin.
# NOTE(review): the call below is not closed in this copy -- trailing
# arguments and/or the closing parenthesis are missing.
c.drawInlineImage(p, 0, 0, width=width, height=height,
# SECURITY: the user-supplied pattern is evaluated as an f-string via eval(),
# so it can execute arbitrary expressions; only `p` (the 1-indexed page
# number) is provided as a name. Do not run with untrusted patterns.
jsonfile = Path(eval("f'{}'".format(args.jsonpattern), {'p': page + 1}))
# NOTE(review): the `with` statement has lost its context expression
# (presumably opening `jsonfile`) and the `try:` matching the `except`
# below is not present in this copy -- confirm against the original.
with as jsondata:
texts = list(parse_google_ocr(json.load(jsondata)))
# A missing JSON file or an unexpected response layout yields no texts for
# this page instead of aborting the run.
except (FileNotFoundError, KeyError):
texts = []
# Ratio used to map OCR coordinates onto page coordinates; the factor of 2 is
# unexplained (the original author marked it "???").
scale = 2 * width / p.size[0] # ???
print(f"\tdrawing {len(texts)} texts, scale: {scale}")
text_writer = c.beginText()
for text in texts:
# Grow the font in 0.5 steps from 9 until the rendered string reaches 95% of
# the scaled box width.
# NOTE(review): presumably never terminates when the string has zero width
# (e.g. empty content) and the box width is positive -- confirm.
font_size = 9
while stringWidth(text.content, text_writer._fontname,
font_size) < text.width * scale * 0.95:
font_size += 0.5
text_writer.setFont(text_writer._fontname, font_size)
# The OCR y coordinate is subtracted from the page height (vertical flip);
# the extra 18 offset is an unexplained constant -- TODO confirm its meaning.
text_writer.setTextOrigin(text.x * scale,
height - text.y * scale - 18)
# NOTE(review): no statement that emits the text, draws `text_writer` on the
# canvas, ends the page, or saves the PDF is visible in this copy.
# finish page
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment