Skip to content

Instantly share code, notes, and snippets.

@bennokr
Created February 19, 2020 07:40
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bennokr/0dab1293e99bb1b9aef7532ddd408913 to your computer and use it in GitHub Desktop.
Save bennokr/0dab1293e99bb1b9aef7532ddd408913 to your computer and use it in GitHub Desktop.
PDF table extraction
import collections, itertools
import fitz
def is_inside(inner, outer):
    """Return True when the two boxes overlap (touching edges count).

    Despite the name this is an intersection test, not a containment test:
    ``inner`` only has to share at least one point with ``outer``.

    Both arguments are mappings with numeric ``x1``, ``y1``, ``x2`` and
    ``y2`` entries, where ``x1 <= x2`` and ``y1 <= y2`` is assumed.
    """
    # Two intervals overlap when each one starts before the other one ends.
    overlaps_x = outer['x1'] <= inner['x2'] and inner['x1'] <= outer['x2']
    overlaps_y = outer['y1'] <= inner['y2'] and inner['y1'] <= outer['y2']
    return overlaps_x and overlaps_y
def merge_words_hor(words, bound_x=1, bound_y=1):
    """Merge horizontally adjacent words into single phrases.

    A word ``b`` is attached to a word ``a`` when ``b`` starts where ``a``
    ends: their top edges (``y1``) differ by less than ``bound_y`` and
    ``a``'s right edge (``x2``) is less than ``bound_x`` away from ``b``'s
    left edge (``x1``). Attachment is transitive, so a chain ``a b c`` always
    ends up in one phrase regardless of the order of ``words``.

    Args:
        words: iterable of ``(bbox, text)`` pairs, where ``bbox`` is a dict
            with at least ``x1``, ``y1``, ``x2`` and ``y2``.
        bound_x: maximum horizontal gap (exclusive) between joined words.
        bound_y: maximum difference (exclusive) between the ``y1`` values.

    Yields:
        ``(bbox, text)`` per phrase. ``bbox`` is the union of the member
        boxes plus its centre as ``xc``/``yc``; ``text`` is the member texts
        joined with single spaces from left to right. Phrases are emitted in
        the order of their earliest member in ``words``.
    """
    words = list(words)
    parent = list(range(len(words)))

    def find(i):
        # Representative of i's group, shortening the path on the way up.
        while parent[i] != i:
            parent[i] = parent[parent[i]]
            i = parent[i]
        return i

    for i1, (bb1, _) in enumerate(words):
        for i2, (bb2, _) in enumerate(words):
            if i2 == i1:
                continue
            same_row = abs(bb1['y1'] - bb2['y1']) < bound_y
            adjacent = abs(bb1['x2'] - bb2['x1']) < bound_x
            if same_row and adjacent:
                root1, root2 = find(i1), find(i2)
                if root1 != root2:
                    # The smallest index stays the representative so the
                    # output order follows the earliest member of a group.
                    parent[max(root1, root2)] = min(root1, root2)

    groups = {}
    for i, word in enumerate(words):
        groups.setdefault(find(i), []).append(word)

    for root in sorted(groups):
        # Join left to right even if the input was not in reading order.
        merged = sorted(groups[root], key=lambda item: item[0]['x1'])
        bb = dict(
            x1=min(b['x1'] for b, _ in merged),
            y1=min(b['y1'] for b, _ in merged),
            x2=max(b['x2'] for b, _ in merged),
            y2=max(b['y2'] for b, _ in merged),
        )
        bb['xc'] = (bb['x1'] + bb['x2']) / 2
        bb['yc'] = (bb['y1'] + bb['y2']) / 2
        yield bb, ' '.join(text for _, text in merged)
def get_lines(words, rounding=1):
    """Find alignment lines shared by two or more word boxes.

    Every entry of every bbox is snapped to a bucket with
    ``int(value * rounding) / rounding``. Whenever at least two boxes fall in
    the same bucket for the same bbox key, one line is produced for them.

    Args:
        words: iterable of ``(bbox, text)`` pairs; every bbox key is bucketed,
            and ``x1``, ``y1``, ``x2``, ``y2`` must be present.
        rounding: bucket resolution; larger values give narrower buckets.

    Yields:
        ``(key, (position, segment, boxes))`` where ``key`` is the bbox key
        the boxes agree on, ``position`` is the mean of that key over the
        boxes, ``segment`` is a dict ``x1/y1/x2/y2`` describing the line
        (vertical for keys starting with ``x``, horizontal otherwise), and
        ``boxes`` is the list of aligned bboxes.
    """
    buckets = {}
    for box, _text in words:
        for coord, value in box.items():
            snapped = int(value * rounding) / rounding
            buckets.setdefault(coord, {}).setdefault(snapped, []).append(box)

    for coord, by_value in buckets.items():
        vertical = coord[0] == 'x'
        for members in by_value.values():
            if len(members) < 2:
                # A single box does not define an alignment.
                continue
            mean = sum(box[coord] for box in members) / len(members)
            if vertical:
                segment = dict(
                    x1=mean,
                    y1=min(box['y1'] for box in members),
                    x2=mean,
                    y2=max(box['y2'] for box in members),
                )
            else:
                segment = dict(
                    x1=min(box['x1'] for box in members),
                    y1=mean,
                    x2=max(box['x2'] for box in members),
                    y2=mean,
                )
            yield coord, (mean, segment, members)
def get_word_clusters(lines, words, bound_x=1, bound_y=1):
    """Group words into columns and rows using the alignment lines.

    For each axis (``'x'`` for lines whose key starts with ``x``, ``'y'``
    for the others) all words crossed by the same line are put in one
    cluster. Clusters are merged transitively: if one line links words A and
    B and another links B and C, then A, B and C share a cluster regardless
    of the order in which the lines are processed.

    Args:
        lines: iterable of ``(key, (position, segment, boxes))`` items, where
            ``segment`` is a dict with ``x1``, ``y1``, ``x2``, ``y2``.
        words: sequence of ``(bbox, text)`` pairs; clusters refer to words by
            their index in this sequence.
        bound_x: horizontal margin added to each word box before testing.
        bound_y: vertical margin added to each word box before testing.

    Yields:
        ``(axis, clusters)`` for every axis that has at least one line.
        ``clusters`` is a sorted list of ``(position, word_indices)`` where
        ``position`` is the mean ``x1`` (or ``y1``) of the member words and
        ``word_indices`` is an ascending tuple. Words not crossed by any
        line form singleton clusters.
    """
    def intersects(bb, p):
        # Grow the word box by the tolerances, then test the segment.
        bx, by = bound_x, bound_y
        bb = dict(x1=bb['x1'] - bx, y1=bb['y1'] - by,
                  x2=bb['x2'] + bx, y2=bb['y2'] + by)
        if p['x1'] == p['x2']:  # vertical (also a zero-length segment)
            return (p['x1'] >= bb['x1'] and p['x1'] <= bb['x2'] and
                    p['y1'] <= bb['y2'] and p['y2'] >= bb['y1'])
        else:
            return (p['y1'] >= bb['y1'] and p['y1'] <= bb['y2'] and
                    p['x1'] <= bb['x2'] and p['x2'] >= bb['x1'])

    def find(parent, i):
        # Representative of i's cluster, shortening the path on the way up.
        while parent[i] != i:
            parent[i] = parent[parent[i]]
            i = parent[i]
        return i

    def axis_of(line):
        return line[0][0]

    # Sort on key and position only: comparing whole line tuples would fall
    # through to comparing dicts (a TypeError) whenever two lines tie.
    ordered = sorted(lines, key=lambda line: (line[0], line[1][0]))
    for k, klines in itertools.groupby(ordered, axis_of):
        parent = list(range(len(words)))
        for _, (_, p, _) in klines:
            intersecting = [i for i, (bb, w) in enumerate(words)
                            if intersects(bb, p)]
            for i in intersecting[1:]:
                root_a = find(parent, intersecting[0])
                root_b = find(parent, i)
                if root_a != root_b:
                    parent[max(root_a, root_b)] = min(root_a, root_b)

        members = {}
        for i in range(len(words)):
            members.setdefault(find(parent, i), []).append(i)

        clusters = []
        for wis in members.values():
            wis = tuple(wis)
            v = sum(words[i][0][k + '1'] for i in wis) / len(wis)
            clusters.append((v, wis))
        yield k, sorted(clusters)
def make_grid(pos_clusters, words):
    """Lay the words out in a 2-D grid of strings.

    Args:
        pos_clusters: mapping with optional keys ``'x'`` (columns) and
            ``'y'`` (rows); each value is an ordered list of
            ``(position, word_indices)`` clusters. The cluster's index in the
            list is the column/row number of its words.
        words: sequence of ``(bbox, text)`` pairs indexed by the clusters.

    Returns:
        A list of rows, each a list of cell strings. Several words landing
        in the same cell are joined with single spaces in ``words`` order;
        cells without words are empty strings.

    When an axis is missing from ``pos_clusters`` (no alignment line was
    found for it), every word gets its own column/row on that axis, ordered
    by its ``x1``/``y1``, instead of failing with ``KeyError``.
    """
    cell_index = {}
    size = {}
    for axis in ('x', 'y'):
        clusters = pos_clusters.get(axis)
        if clusters is None:
            # Fallback: one singleton cluster per word, in coordinate order.
            coord = axis + '1'
            order = sorted(range(len(words)),
                           key=lambda i: words[i][0][coord])
            clusters = [(words[i][0][coord], (i,)) for i in order]
        size[axis] = len(clusters)
        cell_index[axis] = {
            wi: ci
            for ci, (_, wis) in enumerate(clusters)
            for wi in wis
        }

    grid = [['' for _ in range(size['x'])] for _ in range(size['y'])]
    for i, (_, w) in enumerate(words):
        xi, yi = cell_index['x'][i], cell_index['y'][i]
        current = grid[yi][xi]
        grid[yi][xi] = w if not current else current + ' ' + w
    return grid
def extract(pdf_fname, pagenr, fig_bbox):
    """Extract the table inside ``fig_bbox`` on one PDF page as a grid.

    Args:
        pdf_fname: path of the PDF file, passed to ``fitz.open``.
        pagenr: index of the page, used as ``doc[pagenr]``.
        fig_bbox: dict with ``x1``, ``y1``, ``x2``, ``y2``; only words whose
            box overlaps this region are considered.

    Returns:
        A list of rows, each a list of cell strings (see ``make_grid``).
    """
    doc = fitz.open(pdf_fname)
    try:
        page = doc[pagenr]
        # Newer PyMuPDF releases expose word extraction as get_text("words");
        # keep the legacy camelCase call for installations that predate it.
        if hasattr(page, 'get_text'):
            raw_words = page.get_text('words')
        else:
            raw_words = page.getTextWords()
    finally:
        # The word tuples are plain Python data, so the file can be released
        # before any further processing.
        doc.close()

    words = []
    for x1, y1, x2, y2, word, _, _, _ in raw_words:
        bbox = dict(x1=x1, y1=y1, x2=x2, y2=y2,
                    xc=(x1 + x2) / 2, yc=(y1 + y2) / 2)
        if is_inside(bbox, fig_bbox):
            words.append((bbox, word))

    # Pipeline: glue adjacent words into phrases, detect shared alignment
    # lines, cluster phrases per axis along those lines, then build the grid.
    words = list(merge_words_hor(words, bound_x=4, bound_y=2))
    lines = list(get_lines(words, rounding=.5))
    pos_clusters = dict(get_word_clusters(lines, words, bound_x=1, bound_y=0))
    grid = make_grid(pos_clusters, words)
    return grid
if __name__ == '__main__':
    # Command-line entry point: print the extracted table as CSV on stdout.
    #   argv[1]  path of the PDF file
    #   argv[2]  page index, passed to extract() as an int
    #   argv[3]  region as "x1,y1,x2,y2" in page coordinates
    import csv
    import sys

    if len(sys.argv) != 4:
        sys.exit('usage: %s PDF_FILE PAGE_NUMBER X1,Y1,X2,Y2' % sys.argv[0])
    _, pdf_fname, pagenr, bbox_string = sys.argv
    # float() accepts everything int() did, and fractional coordinates too.
    x1, y1, x2, y2 = (float(part) for part in bbox_string.split(','))
    grid = extract(pdf_fname, int(pagenr), dict(x1=x1, y1=y1, x2=x2, y2=y2))
    cw = csv.writer(sys.stdout)
    cw.writerows(grid)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment