Last active
September 3, 2016 16:02
-
-
Save zuphilip/f74bb63606b3cbb0ea4412fe1cf3bfe3 to your computer and use it in GitHub Desktop.
hocr-tools extension that finds the print space of a page, excluding margin notes
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# Find the print space of a page, without margin notes | |
import sys,os,string,re | |
from lxml import html | |
import argparse | |
from PIL import Image, ImageDraw | |
def get_prop(node, name):
    """Return the value of property `name` from a node's ``title`` attribute.

    The title is read as a ``;``-separated list of ``key value...`` entries,
    for example ``bbox 10 20 300 40; x_wconf 93``.

    Args:
        node: Any object whose ``get('title')`` returns the title string
            (or None when the attribute is missing).
        name: Property key to look up.

    Returns:
        The text following the key of the first matching entry, or None when
        the title is missing/empty or no entry with a value has that key.
    """
    title = node.get('title')
    if not title:
        return None
    for prop in title.split(';'):
        parts = prop.split(None, 1)
        # Entries that are empty (e.g. after a trailing ';') or that consist
        # of a bare key without a value cannot be unpacked into (key, args);
        # skip them instead of raising.
        if len(parts) == 2 and parts[0] == name:
            return parts[1]
    return None
def get_text(node):
    """Return all text below `node` as one whitespace-normalized string.

    Args:
        node: Object exposing ``xpath``; ``.//text()`` is evaluated on it
            and must yield strings.

    Returns:
        The text fragments joined with single spaces, with every run of
        whitespace collapsed to one space. Leading/trailing whitespace is
        not stripped.
    """
    textnodes = node.xpath(".//text()")
    # ' '.join is the portable spelling of the old string.join(words), whose
    # default separator was a single space; string.join is gone in Python 3.
    joined = ' '.join(textnodes)
    return re.sub(r'\s+', ' ', joined)
def get_bbox(node):
    """Return the node's ``bbox`` property as a tuple of ints.

    The property text is looked up with ``get_prop`` and split on
    whitespace; every field is converted with ``int``.

    Returns:
        A tuple of ints, or None when ``get_prop`` yields nothing (or an
        empty string) for ``bbox``.
    """
    raw = get_prop(node, 'bbox')
    if raw:
        return tuple(int(field) for field in raw.split())
    return None
def guess_lower_boundary(freq, min):
    """Return the most frequent key of `freq` that is at least `min`.

    Args:
        freq: Mapping from a value to the number of times it was seen.
        min: Smallest key that may be returned.

    Returns:
        The key with the highest count among the keys ``>= min``; among
        equal counts the key that comes first when iterating `freq` wins.

    Raises:
        IndexError: If no key is ``>= min``.
    """
    # Filter first, then rank; the sort is stable, so ties keep the
    # mapping's iteration order.
    eligible = [value for value in freq if value >= min]
    ranked = sorted(eligible, key=freq.get, reverse=True)
    return ranked[0]
# ---------------------------------------------------------------------------
# Script body.
#
# Reads an hOCR file and the matching page image, finds the most common
# (normalized) text line width, outlines lines of that width in blue and all
# other lines in red, draws a line through the right edges of the first and
# last common-width lines, and writes the result to "rectangle.png".
#
# NOTE(review): the source this was taken from had lost its indentation; the
# nesting below (everything up to the summary prints runs once per page, the
# image is saved once at the end) is a reconstruction - confirm.
# ---------------------------------------------------------------------------


def _normalized_width(coords, amount):
    """Return the bbox width rounded down to a multiple of `amount`."""
    # Floor division keeps the rounding behaviour on Python 2 and Python 3.
    return (coords[2] - coords[0]) // amount * amount


parser = argparse.ArgumentParser(description="Find print space of a page")
parser.add_argument("-n", "--normalize_amount", type=int, default=10,
                    help="Amount to normalize coordinates with, default: %(default)s")
# NOTE: --doublepage is accepted but its value is not read anywhere below.
parser.add_argument("-D", "--doublepage", default=False, action="store_true",
                    help="Whether pages are single or double, default: %(default)s")
parser.add_argument("-i", "--inverse_vertically", default=False, action="store_true",
                    help="Change to measure the vertical coordinate from the bottom instead of the top")
parser.add_argument("hocrfile", help="hOCR file to detect print space in")
parser.add_argument("imagefile", help="corresponding image file")
args = parser.parse_args()

NORMALIZE_AMOUNT = args.normalize_amount
if NORMALIZE_AMOUNT == 0:
    # Widths are divided by this amount; report 0 as a usage error instead of
    # failing later with ZeroDivisionError.
    parser.error("--normalize_amount must not be 0")

# Lines lying completely inside a strip of this many pixels at the left or
# right image border are treated as margin notes and ignored.
MARGIN_WIDTH = 100

# Read the hOCR as bytes and let the parser deal with the encoding; the file
# is closed as soon as it has been parsed.
with open(args.hocrfile, "rb") as stream:
    doc = html.fromstring(stream.read())

im = Image.open(args.imagefile)
dr = ImageDraw.Draw(im)
width, height = im.size

pages = doc.xpath("//*[@class='ocr_page']")
for page in pages:
    print('Page "%s"' % page.get('title'))
    # The leading '.' keeps the query relative to this page; a bare '//'
    # would select the lines of every page in the document.
    lines = page.xpath(".//*[@class='ocr_line']")

    coord_freq = [{}, {}, {}, {}]   # per bbox component: value -> count
    coord_list = []                 # bboxes of all lines that were kept
    length_freq = {}                # normalized width -> count
    empty_lines = 0
    marginal_lines = 0

    for line in lines:
        if not get_text(line).strip():
            empty_lines += 1
            continue

        coords = get_bbox(line)
        if coords is None or len(coords) != 4:
            # Without a usable bbox the line cannot be placed on the page.
            continue

        if args.inverse_vertically:
            coords = (coords[0], height - coords[1],
                      coords[2], height - coords[3])

        # Discard small text blocks on the margins.
        if coords[2] < MARGIN_WIDTH or coords[0] > width - MARGIN_WIDTH:
            marginal_lines += 1
            continue

        coord_list.append(coords)

        length = _normalized_width(coords, NORMALIZE_AMOUNT)
        length_freq[length] = length_freq.get(length, 0) + 1
        for freq, value in zip(coord_freq, coords):
            freq[value] = freq.get(value, 0) + 1

    if not coord_list:
        # Nothing left to analyse on this page: no common width, no boundary.
        print("%s empty lines skipped" % empty_lines)
        print("%s marginal lines skipped" % marginal_lines)
        continue

    # Most frequent normalized width; on ties the first one encountered wins.
    common_length = max(length_freq, key=length_freq.get)

    # Right/top corner of the first and of the last line of common width.
    first = None
    last = None
    for coords in coord_list:
        if _normalized_width(coords, NORMALIZE_AMOUNT) == common_length:
            dr.rectangle(coords, fill=None, outline="blue")
            corner = (coords[2], coords[1])
            if first is None:
                print("defining first")
                first = corner
                print(first)
            else:
                last = corner
        else:
            dr.rectangle(coords, fill=None, outline="red")
    print(first)
    print(last)

    # Draw the line through `first` and `last`, extended across the image.
    if last is None or first[0] == last[0]:
        # A single common-width line, or identical right edges: vertical line.
        dr.line([first[0], 0, first[0], height], fill=0, width=3)
    else:
        print("else")
        dx = last[0] - first[0]
        dy = last[1] - first[1]
        if dy == 0:
            # Both corners at the same height: the line through them is
            # horizontal and never reaches the top or bottom edge.
            dr.line([0, first[1], width, first[1]], fill=0, width=3)
        else:
            # x as a function of y, computed in floating point so the slope
            # is neither truncated nor able to become zero.
            x_per_y = dx / float(dy)
            x_top = first[0] - first[1] * x_per_y
            x_bottom = first[0] + (height - first[1]) * x_per_y
            dr.line([int(round(x_top)), 0, int(round(x_bottom)), height],
                    fill=0, width=3)

    # Disabled alternative: bounding rectangle guessed from the coordinate
    # frequencies collected above.
    if False:
        x0 = guess_lower_boundary(coord_freq[0], 0)
        y0 = sorted(coord_freq[1], key=coord_freq[1].get)[0]
        x1 = guess_lower_boundary(coord_freq[2], 0)
        y1 = sorted(coord_freq[3], key=coord_freq[3].get, reverse=True)[0]
        dr.rectangle([x0, y0, x1, y1], fill=None, outline="red")

    print("%s empty lines skipped" % empty_lines)
    print("%s marginal lines skipped" % marginal_lines)
    print(common_length)

im.save("rectangle.png")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Here are three examples with bounding boxes: