Skip to content

Instantly share code, notes, and snippets.

@zuphilip
Last active September 3, 2016 16:02
Show Gist options
  • Save zuphilip/f74bb63606b3cbb0ea4412fe1cf3bfe3 to your computer and use it in GitHub Desktop.
Save zuphilip/f74bb63606b3cbb0ea4412fe1cf3bfe3 to your computer and use it in GitHub Desktop.
hocr-tools extension finding the print space of a page w/o margin notes
#!/usr/bin/env python
# Find the print space of a page, without margin notes
import sys,os,string,re
from lxml import html
import argparse
from PIL import Image, ImageDraw
def get_prop(node, name):
    """Return the value of property *name* from *node*'s ``title`` attribute.

    The title is read as a semicolon-separated list of entries, each a key
    followed by whitespace and its arguments (e.g. ``bbox 0 0 10 10; x 1``).

    :param node: object exposing ``get('title')``.
    :param name: property key to look up.
    :returns: the argument string of the first entry whose key equals
        *name*, or None when the title is missing/empty or no entry matches.
    """
    title = node.get('title')
    if not title:
        return None
    for prop in title.split(';'):
        # Entries without arguments (including the empty entry produced by a
        # trailing ';') split into fewer than two parts; skip them instead of
        # failing on tuple unpacking.
        parts = prop.split(None, 1)
        if len(parts) == 2 and parts[0] == name:
            return parts[1]
    return None
def get_text(node):
    """Return all text below *node* as one whitespace-normalized string.

    Every text node found by the XPath query ``.//text()`` is joined with a
    single space, then each run of whitespace is collapsed to one space.
    The result is not stripped, so it may start or end with a space.

    :param node: object exposing ``xpath(expression)``.
    :returns: the normalized text ('' when there are no text nodes).
    """
    textnodes = node.xpath(".//text()")
    # str.join replaces the Python-2-only string.join(); the separator is the
    # same single space that string.join() used by default.
    s = ' '.join(textnodes)
    return re.sub(r'\s+', ' ', s)
def get_bbox(node):
    """Return the ``bbox`` property of *node* as a tuple of integers.

    The property value is split on whitespace and each piece is converted
    with ``int``. Returns None when the property is absent or empty.
    """
    raw = get_prop(node, 'bbox')
    if raw:
        return tuple(int(token) for token in raw.split())
    return None
def guess_lower_boundary(freq, min):
    """Return the most frequent key of *freq* that is at least *min*.

    :param freq: mapping of coordinate value -> number of occurrences.
    :param min: smallest acceptable coordinate value. (The name shadows the
        builtin inside this function; it is kept so existing callers that
        pass it by keyword keep working.)
    :returns: the qualifying key with the highest count; among keys with
        equal counts the one that comes first in the mapping's iteration
        order wins.
    :raises IndexError: if no key of *freq* is >= *min*.
    """
    # Walk the keys from most to least frequent and stop at the first match
    # instead of materializing the whole filtered list.
    for value in sorted(freq, key=freq.get, reverse=True):
        if value >= min:
            return value
    raise IndexError(
        "no coordinate >= %s among %d candidates" % (min, len(freq)))
# Command-line interface: three optional switches followed by the two
# required input files (hOCR markup and the matching page image).
parser = argparse.ArgumentParser(description="Find print space of a page")
parser.add_argument(
    "-n", "--normalize_amount",
    default=10,
    type=int,
    help="Amount to normalize coordinates with, default: %(default)s",
)
parser.add_argument(
    "-D", "--doublepage",
    action="store_true",
    default=False,
    help="Whether pages are single or double, default: %(default)s",
)
parser.add_argument(
    "-i", "--inverse_vertically",
    action="store_true",
    default=False,
    help="Change to measure the vertical coordinate from the bottom instead of the top",
)
parser.add_argument(
    "hocrfile",
    help="hOCR file to detect print space in",
)
parser.add_argument(
    "imagefile",
    help="corresponding image file",
)
args = parser.parse_args()
# Line widths are rounded down to a multiple of this amount before being
# compared with each other.
NORMALIZE_AMOUNT = args.normalize_amount

# Read the hOCR markup as bytes and close the file right away. Passing bytes
# lets lxml honour an encoding declaration inside the document; on Python 3 a
# decoded str that still carries such a declaration is rejected by lxml.
with open(args.hocrfile, "rb") as stream:
    doc = html.fromstring(stream.read())

# The page image is both the source of the page dimensions and the canvas
# the detected boxes are drawn on.
im = Image.open(args.imagefile)
dr = ImageDraw.Draw(im)
width, height = im.size

# Every element marked as an hOCR page anywhere in the document.
pages = doc.xpath("//*[@class='ocr_page']")
# A line whose right edge is closer than this to the left image border, or
# whose left edge is closer than this to the right image border, is treated
# as a marginal note and ignored.
MARGIN_WIDTH = 100


def _normalized_width(coords, step):
    """Return the width of box *coords*, rounded down to a multiple of *step*."""
    # Floor division keeps the result an integer on Python 2 and Python 3.
    return (int(coords[2]) - int(coords[0])) // step * step


for page in pages:
    print('Page "%s"' % page.get('title'))
    # The leading "." makes the query relative to this page; a bare "//"
    # would select the lines of every page in the document.
    lines = page.xpath(".//*[@class='ocr_line']")
    coord_freq = [{}, {}, {}, {}]  # per bbox component: value -> count
    coord_list = []                # bboxes of all lines that are kept
    length_freq = {}               # normalized line width -> count
    empty_lines = 0
    marginal_lines = 0
    unboxed_lines = 0
    for line in lines:
        if not get_text(line).strip():
            empty_lines += 1
            continue
        coords = get_bbox(line)
        if not coords or len(coords) != 4:
            # Without a complete bounding box the line cannot be measured.
            unboxed_lines += 1
            continue
        if args.inverse_vertically:
            # Convert y values measured from the bottom into y values
            # measured from the top, and re-order them so that y0 <= y1 as
            # the drawing calls below expect.
            x0, y0, x1, y1 = coords
            y0, y1 = sorted((height - y0, height - y1))
            coords = (x0, y0, x1, y1)
        # discard small textblock on the margins
        if coords[2] < MARGIN_WIDTH or coords[0] > width - MARGIN_WIDTH:
            marginal_lines += 1
            continue
        coord_list.append(coords)
        length = _normalized_width(coords, NORMALIZE_AMOUNT)
        length_freq[length] = length_freq.get(length, 0) + 1
        for pos in range(4):
            value = coords[pos]
            coord_freq[pos][value] = coord_freq[pos].get(value, 0) + 1

    common_length = None
    if coord_list:
        # The most frequent normalized width is taken as the width of a
        # regular full line of the print space.
        common_length = max(length_freq, key=length_freq.get)
        # (right edge x, top y) of the first and of the most recent line
        # having the common width. False (not None) is kept as the "unset"
        # marker so the diagnostic output below is unchanged.
        first = False
        last = False
        for coords in coord_list:
            if _normalized_width(coords, NORMALIZE_AMOUNT) == common_length:
                dr.rectangle(coords, fill=None, outline="blue")
                if not first:
                    print("defifnining first")
                    first = (coords[2], coords[1])
                    print(first)
                else:
                    last = (coords[2], coords[1])
            else:
                dr.rectangle(coords, fill=None, outline="red")
        print(first)
        print(last)
        # At least one kept line has the common width, so `first` is set
        # here; `last` stays False when there is exactly one such line.
        if not last or first[0] == last[0]:
            # One reference point only, or both share the same x: draw a
            # vertical boundary through it.
            dr.line([first[0], 0, first[0], height], fill=0, width=3)
        elif first[1] == last[1]:
            # Both reference points lie at the same height, so the line
            # through them never reaches the top or bottom of the image.
            print("cannot extrapolate boundary from two points of equal height")
        else:
            print("else")
            # Extend the line through `first` and `last` to y=0 and
            # y=height. x is expressed as a function of y with true
            # division, which avoids both the truncated integer slope and
            # the division by a slope that truncated to zero.
            run = float(last[0] - first[0]) / (last[1] - first[1])
            x_top = first[0] - first[1] * run
            x_bottom = first[0] + (height - first[1]) * run
            dr.line([int(round(x_top)), 0, int(round(x_bottom)), height],
                    fill=0, width=3)
        if False:
            # Disabled alternative: frame the print space with a rectangle
            # derived from the coordinate frequency tables.
            x0 = guess_lower_boundary(coord_freq[0], 0)
            y0 = sorted(coord_freq[1], key=coord_freq[1].get)[0]
            x1 = guess_lower_boundary(coord_freq[2], 0)
            y1 = sorted(coord_freq[3], key=coord_freq[3].get, reverse=True)[0]
            dr.rectangle([x0, y0, x1, y1], fill=None, outline="red")
    print("%s empty lines skipped" % empty_lines)
    print("%s marginal lines skipped" % marginal_lines)
    if unboxed_lines:
        print("%s lines without bounding box skipped" % unboxed_lines)
    if common_length is not None:
        print(common_length)
im.save("rectangle.png")
@zuphilip
Copy link
Author

zuphilip commented Jul 18, 2016

Here are three examples with bounding boxes:

rectangle
rectangle
rectangle

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment