Skip to content

Instantly share code, notes, and snippets.

@bf4648
Forked from wolframalpha/doc_to_pdf.py
Created July 3, 2020 16:21
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bf4648/626a2066f50f8b9a8f9613b38cf6c6f9 to your computer and use it in GitHub Desktop.
Save bf4648/626a2066f50f8b9a8f9613b38cf6c6f9 to your computer and use it in GitHub Desktop.
Please add this to your code and report!
import os
import subprocess
import shlex
import time
def convert_doc_to_pdf(ipfile_path, opfile_path):
    """Convert a document to PDF with headless LibreOffice.

    Args:
        ipfile_path: Path of the document to convert.
        opfile_path: Value handed to LibreOffice's ``--outdir`` option, i.e.
            the directory the PDF is written into.

    Returns:
        The exit status of the ``libreoffice`` process.

    Raises:
        OSError: If the ``libreoffice`` executable cannot be started.
    """
    # Build argv directly.  The command used to be formatted into one string
    # and re-split with shlex, so a path containing a double quote or a
    # backslash was split into the wrong arguments.
    command = [
        'libreoffice', '--headless',
        '--convert-to', 'pdf', ipfile_path,
        '--outdir', opfile_path,
    ]
    # Wait for the conversion: the child is reaped and the conversion has
    # finished by the time this function returns.  Standard output is
    # inherited from the parent.
    return subprocess.call(command)
# A line counts as horizontal (vertical) when the height (width) of its
# bounding box is below this value.  Not accurate - the tolerable slope
# really decreases with the length of the line!
threshold_slope_in_pixels = 10
# Consecutive lines whose leading coordinate differs by less than this are
# considered to lie on the same row / column.
tolerance = 5


def _group_by_coordinate(sorted_lines, index, max_gap):
    """Split already-sorted lines into runs of near-equal ``bbox[index]``.

    A line joins the current run when its ``bbox[index]`` is within
    ``max_gap`` of the previous line's value; otherwise it opens a new run.
    Returns a list of lists, which is empty when there are no lines (the
    hand-written loops used to produce ``[[]]`` in that case).
    """
    groups = []
    for line in sorted_lines:
        if groups and abs(groups[-1][-1].bbox[index] - line.bbox[index]) < max_gap:
            groups[-1].append(line)
        else:
            groups.append([line])
    return groups


# bbox is (x0, y0, x1, y1): a small y-extent means horizontal, a small
# x-extent means vertical.  Each set is ordered by the midpoint of the
# coordinate that is (nearly) constant along the line.
horizontal_lines = sorted(
    (ln for ln in lines_list
     if abs(ln.bbox[1] - ln.bbox[3]) < threshold_slope_in_pixels),
    key=lambda ln: (ln.bbox[1] + ln.bbox[3]) / 2)
vertical_lines = sorted(
    (ln for ln in lines_list
     if abs(ln.bbox[0] - ln.bbox[2]) < threshold_slope_in_pixels),
    key=lambda ln: (ln.bbox[0] + ln.bbox[2]) / 2)

# Vertical lines are grouped by x0 (one group per column), horizontal lines
# by y0 (one group per row).
grouped_vertical_lines = _group_by_coordinate(vertical_lines, 0, tolerance)
grouped_horizontal_lines = _group_by_coordinate(horizontal_lines, 1, tolerance)
def sort_along_axis(lines, axis):
    """Return ``lines`` sorted by where each one starts along its own axis.

    Args:
        lines: Iterable of objects exposing ``bbox`` as ``(x0, y0, x1, y1)``.
        axis: ``'h'`` to sort by ``bbox[0]`` or ``'v'`` to sort by
            ``bbox[1]``.  Upper-case spellings are accepted too, because
            ``stitch_lines`` names its axes ``'H'`` / ``'V'``.

    Returns:
        A new, sorted list.

    Raises:
        ValueError: If ``axis`` is neither horizontal nor vertical.
    """
    start_index = {'h': 0, 'v': 1}.get(str(axis).lower())
    if start_index is None:
        # This used to fall through and return None, which callers would then
        # store as if it were a group of lines.
        raise ValueError("axis must be 'h' or 'v', got %r" % (axis,))
    return sorted(lines, key=lambda line: line.bbox[start_index])
# Inside every row (column) group, order the pieces by their starting x (y)
# so that neighbouring pieces end up next to each other.
grouped_horizontal_lines = [sort_along_axis(row, 'h') for row in grouped_horizontal_lines]
grouped_vertical_lines = [sort_along_axis(column, 'v') for column in grouped_vertical_lines]
def stitch_lines(lines, axis, threshold=10):
    """Merge collinear line pieces that overlap or nearly touch.

    Args:
        lines: Objects exposing ``bbox`` as ``(x0, y0, x1, y1)``, belonging
            to one row (``'H'``) or one column (``'V'``) and already sorted
            by their starting coordinate along that axis.
        axis: ``'H'`` or ``'V'`` (case-insensitive).
        threshold: A piece is merged into the running segment when the gap
            between the segment's end and the piece's start is smaller than
            this value.

    Returns:
        A list of ``(x0, y0, x1, y1, axis)`` tuples, one per stitched
        segment, in input order.  The list is empty when ``lines`` is empty.

    Raises:
        ValueError: If ``axis`` is neither ``'H'`` nor ``'V'``.
    """
    tag = str(axis).upper()
    if tag == 'H':
        start, end = 0, 2   # horizontal pieces run from x0 to x1
    elif tag == 'V':
        start, end = 1, 3   # vertical pieces run from y0 to y1
    else:
        raise ValueError("axis must be 'H' or 'V', got %r" % (axis,))

    segments = []
    current = None
    for line in lines:
        piece = list(line.bbox)
        if current is None:
            current = piece
            continue
        if current[end] > piece[start] or piece[start] - current[end] < threshold:
            # Overlapping or close enough: extend the running segment.  Its
            # far end never moves backwards, so a short piece lying inside
            # the segment cannot shrink it.
            far_end = max(current[end], piece[end])
            current[2], current[3] = piece[2], piece[3]
            current[end] = far_end
        else:
            # Real gap: close the running segment and start a new one from
            # this piece (previously the piece was dropped and the old
            # segment was appended again on every later gap).
            segments.append(tuple(current) + (tag,))
            current = piece
    if current is not None:
        # The last running segment is always emitted; before, it was only
        # kept when no other segment had been found.
        segments.append(tuple(current) + (tag,))
    return segments
# Stitch the pieces of every column / row group and flatten the results into
# one list of (x0, y0, x1, y1, axis) segments per orientation.
stitched_v = [segment for column in grouped_vertical_lines for segment in stitch_lines(column, 'V')]
stitched_h = [segment for row in grouped_horizontal_lines for segment in stitch_lines(row, 'H')]
# The vertical list used to be bound to the misspelt name ``stiched_v``;
# keep that name as an alias so existing references still resolve.
stiched_v = stitched_v
# coding: utf-8
from __future__ import division
import pdfminer
import math
import logging
import traceback
import os
import csv
from collections import defaultdict
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator
logging.basicConfig(filename='pdf_table_extraction.log', level=logging.ERROR)
# Entry point for one PDF: lays out every page of `example_file`, appends the
# result of boxes_to_table() for each page to `table_pages`, and returns that
# list.
def main(example_file):
# Accumulator for the per-page tables; this is the value main() returns.
table_pages = []
def extract_layout_by_page(pdf_path):
    """Run pdfminer's layout analysis over every page of a PDF.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A list holding, for each page in document order, the object returned
        by ``PDFPageAggregator.get_result()`` after that page was processed.

    Raises:
        PDFTextExtractionNotAllowed: If the document reports that it is not
            extractable.
    """
    # The file used to be opened and never closed, leaking one descriptor per
    # processed PDF.  All pages are interpreted inside the ``with`` block, so
    # the handle is no longer needed once the layouts have been collected.
    with open(pdf_path, 'rb') as fp:
        document = PDFDocument(PDFParser(fp))
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        layouts = []
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            # The aggregator exposes the layout of the page it saw last.
            layouts.append(device.get_result())
    return layouts
#
#
# example_file = "Emails/CV_Hardik_Parmar.pdf"
# example_file = "docs_converted_to_pdfs/1048889Rahul.pdf"
# # example_file = "Emails/Aakarsh Prasad_resume (final)_2.pdf"
# Lay out every page of the requested document.  The set of element types
# that used to be computed here (``objects_on_page``) was never read again
# and has been dropped together with its stray expression statement.
page_layouts = extract_layout_by_page(example_file)
# pdfminer layout classes that extract_characters() treats as text
# containers: an element that is an instance of any of them is iterated and
# searched recursively for characters.
TEXT_ELEMENTS = [
pdfminer.layout.LTTextBox,
pdfminer.layout.LTTextBoxHorizontal,
pdfminer.layout.LTTextLine,
pdfminer.layout.LTTextLineHorizontal
]
def flatten(lst):
    """Concatenate the items of every element of ``lst`` into one list.

    Only one level of nesting is removed.
    """
    flat = []
    for chunk in lst:
        flat.extend(chunk)
    return flat
def extract_characters(element):
    """Collect every ``LTChar`` found in ``element``, depth first.

    ``element`` may be a single character (returned as a one-item list), one
    of the container types listed in ``TEXT_ELEMENTS``, or a plain list of
    such elements.  Anything else contributes no characters.
    """
    if isinstance(element, pdfminer.layout.LTChar):
        return [element]

    # Text containers and plain lists are walked the same way.
    is_container = isinstance(element, tuple(TEXT_ELEMENTS)) or isinstance(element, list)
    if not is_container:
        return []

    found = []
    for child in element:
        found.extend(extract_characters(child))
    return found
for current_page in page_layouts:
    # Bucket the top-level layout elements of this page by kind.
    texts = []
    rects = []
    lines_list = []
    for e in current_page:
        if isinstance(e, pdfminer.layout.LTTextBoxHorizontal):
            texts.append(e)
        elif isinstance(e, pdfminer.layout.LTRect):
            # Must stay ahead of the LTCurve test: in pdfminer LTRect is
            # itself an LTCurve subclass.
            rects.append(e)
        elif isinstance(e, pdfminer.layout.LTCurve):
            # The same isinstance test used to be written twice, joined by
            # ``or``.  One test is enough; it also matches LTLine, which
            # derives from LTCurve.
            lines_list.append(e)
    characters = extract_characters(texts)
# -----------------------------------------------------------------------------------------------------------
# A line counts as horizontal (vertical) when the height (width) of its
# bounding box is below this value.  Not accurate - the tolerable slope
# really decreases with the length of the line!
threshold_slope_in_pixels = 10
# Consecutive lines whose leading coordinate differs by less than this are
# considered to lie on the same row / column.
tolerance = 5


def _group_by_coordinate(sorted_lines, index, max_gap):
    """Split already-sorted lines into runs of near-equal ``bbox[index]``.

    A line joins the current run when its ``bbox[index]`` is within
    ``max_gap`` of the previous line's value; otherwise it opens a new run.
    Returns a list of lists, which is empty when there are no lines (the
    hand-written loops used to produce ``[[]]`` in that case).
    """
    groups = []
    for line in sorted_lines:
        if groups and abs(groups[-1][-1].bbox[index] - line.bbox[index]) < max_gap:
            groups[-1].append(line)
        else:
            groups.append([line])
    return groups


# bbox is (x0, y0, x1, y1): a small y-extent means horizontal, a small
# x-extent means vertical.  Each set is ordered by the midpoint of the
# coordinate that is (nearly) constant along the line.
horizontal_lines = sorted(
    (ln for ln in lines_list
     if abs(ln.bbox[1] - ln.bbox[3]) < threshold_slope_in_pixels),
    key=lambda ln: (ln.bbox[1] + ln.bbox[3]) / 2)
vertical_lines = sorted(
    (ln for ln in lines_list
     if abs(ln.bbox[0] - ln.bbox[2]) < threshold_slope_in_pixels),
    key=lambda ln: (ln.bbox[0] + ln.bbox[2]) / 2)

# Vertical lines are grouped by x0 (one group per column), horizontal lines
# by y0 (one group per row).  The per-line debug print is gone: it was a
# Python-2-only statement and wrote one line of output per vertical line.
grouped_vertical_lines = _group_by_coordinate(vertical_lines, 0, tolerance)
grouped_horizontal_lines = _group_by_coordinate(horizontal_lines, 1, tolerance)
def sort_along_axis(lines, axis):
    """Return ``lines`` sorted by where each one starts along its own axis.

    Args:
        lines: Iterable of objects exposing ``bbox`` as ``(x0, y0, x1, y1)``.
        axis: ``'h'`` to sort by ``bbox[0]`` or ``'v'`` to sort by
            ``bbox[1]``.  Upper-case spellings are accepted too, because
            ``stitch_lines`` names its axes ``'H'`` / ``'V'``.

    Returns:
        A new, sorted list.

    Raises:
        ValueError: If ``axis`` is neither horizontal nor vertical.
    """
    start_index = {'h': 0, 'v': 1}.get(str(axis).lower())
    if start_index is None:
        # This used to fall through and return None, which callers would then
        # store as if it were a group of lines.
        raise ValueError("axis must be 'h' or 'v', got %r" % (axis,))
    return sorted(lines, key=lambda line: line.bbox[start_index])
# Inside every row (column) group, order the pieces by their starting x (y)
# so that neighbouring pieces end up next to each other.
grouped_horizontal_lines = [sort_along_axis(row, 'h') for row in grouped_horizontal_lines]
grouped_vertical_lines = [sort_along_axis(column, 'v') for column in grouped_vertical_lines]
def stitch_lines(lines, axis, threshold=10):
    """Merge collinear line pieces that overlap or nearly touch.

    Args:
        lines: Objects exposing ``bbox`` as ``(x0, y0, x1, y1)``, belonging
            to one row (``'H'``) or one column (``'V'``) and already sorted
            by their starting coordinate along that axis.
        axis: ``'H'`` or ``'V'`` (case-insensitive).
        threshold: A piece is merged into the running segment when the gap
            between the segment's end and the piece's start is smaller than
            this value.

    Returns:
        A list of ``(x0, y0, x1, y1, axis)`` tuples, one per stitched
        segment, in input order.  The list is empty when ``lines`` is empty
        (it used to be ``[None]``).

    Raises:
        ValueError: If ``axis`` is neither ``'H'`` nor ``'V'``.
    """
    tag = str(axis).upper()
    if tag == 'H':
        start, end = 0, 2   # horizontal pieces run from x0 to x1
    elif tag == 'V':
        start, end = 1, 3   # vertical pieces run from y0 to y1
    else:
        raise ValueError("axis must be 'H' or 'V', got %r" % (axis,))

    segments = []
    current = None
    for line in lines:
        piece = list(line.bbox)
        if current is None:
            current = piece
            continue
        if current[end] > piece[start] or piece[start] - current[end] < threshold:
            # Overlapping or close enough: extend the running segment.  Its
            # far end never moves backwards, so a short piece lying inside
            # the segment cannot shrink it.
            far_end = max(current[end], piece[end])
            current[2], current[3] = piece[2], piece[3]
            current[end] = far_end
        else:
            # Real gap: close the running segment and start a new one from
            # this piece (previously the piece was dropped and the old
            # segment was appended again on every later gap).
            segments.append(tuple(current) + (tag,))
            current = piece
    if current is not None:
        # The last running segment is always emitted; before, it was only
        # kept when no other segment had been found.
        segments.append(tuple(current) + (tag,))
    return segments
# Stitch the pieces of every column / row group and flatten the results into
# one list of segments per orientation.
stitched_v = [segment for column in grouped_vertical_lines for segment in stitch_lines(column, 'V')]
stitched_h = [segment for row in grouped_horizontal_lines for segment in stitch_lines(row, 'H')]
# A list without a single truthy entry (for instance one that only holds a
# None placeholder) is treated as "no lines of that orientation".
stitched_h = stitched_h if any(stitched_h) else []
stitched_v = stitched_v if any(stitched_v) else []
# ----------------------------------------------------------------------------------------------
def does_it_intersect(x, bounds):
    """Return True when ``x`` lies inside the closed interval ``bounds``.

    ``bounds`` is an ``(xmin, xmax)`` pair.  It is unpacked in the body
    because tuple parameters in a signature are a syntax error on Python 3
    (PEP 3113); callers keep passing the pair positionally.
    """
    xmin, xmax = bounds
    return xmin <= x <= xmax


def find_bounding_rectangle(point, lines):
    """Find the table cell that encloses a point.

    Args:
        point: ``(x, y)`` pair to locate.
        lines: Iterable of ``(x0, y0, x1, y1, axis)`` tuples where ``axis``
            is ``"H"`` or ``"V"``.

    Returns:
        ``(x0, y0, x1, y1)`` built from the nearest vertical line strictly to
        the left and to the right of the point and the nearest horizontal
        line strictly below and above it, or ``None`` when the point is not
        enclosed on all four sides.  Only vertical lines whose y-range covers
        ``y`` and horizontal lines whose x-range covers ``x`` are considered.
    """
    x, y = point

    verticals = [l for l in lines
                 if l[4] == "V" and does_it_intersect(y, (l[1], l[3]))]
    horizontals = [l for l in lines
                   if l[4] == "H" and does_it_intersect(x, (l[0], l[2]))]
    if len(verticals) < 2 or len(horizontals) < 2:
        return None

    # A line passing exactly through the point counts for neither side.
    left = [v[0] for v in verticals if v[0] < x]
    right = [v[0] for v in verticals if v[0] > x]
    if not left or not right:
        return None

    below = [h[1] for h in horizontals if h[1] < y]
    above = [h[1] for h in horizontals if h[1] > y]
    if not below or not above:
        return None

    return (max(left), max(below), min(right), min(above))
def width(rect):
    """Return the shorter side of ``rect.bbox``, i.e. its thickness."""
    x_lo, y_lo, x_hi, y_hi = rect.bbox
    x_extent = x_hi - x_lo
    y_extent = y_hi - y_lo
    return min(x_extent, y_extent)
def area(rect):
    """Return the area covered by ``rect.bbox`` (x-extent times y-extent)."""
    x_lo, y_lo, x_hi, y_hi = rect.bbox
    x_extent = x_hi - x_lo
    y_extent = y_hi - y_lo
    return x_extent * y_extent
def cast_as_line(rect):
    """Replace a rectangle with a line along its longest dimension.

    Returns ``(x0, y0, x1, y0, "H")`` when the rectangle is wider than it is
    tall, otherwise ``(x0, y0, x0, y1, "V")``.  A square therefore becomes a
    vertical line.
    """
    x0, y0, x1, y1 = rect.bbox
    wider_than_tall = (x1 - x0) > (y1 - y0)
    if wider_than_tall:
        return (x0, y0, x1, y0, "H")
    return (x0, y0, x0, y1, "V")
# Ruled lines of the page: rectangles thinner than 2 units (and with an area
# above 1) are reduced to the line they represent, then the stitched
# vertical and horizontal segments are added.
lines = [cast_as_line(r) for r in rects if width(r) < 2 and area(r) > 1]
lines.extend(stitched_v)
lines.extend(stitched_h)
# Maps a cell (x0, y0, x1, y1) to the characters that fall inside it.
box_char_dict = {}
for c in characters:
    # Probe three points of the glyph - lower-left corner, centre (rounded
    # down) and upper-right corner - and look up the cell enclosing each.
    bbox_l = find_bounding_rectangle((c.bbox[0], c.bbox[1]), lines)
    centre = (math.floor((c.bbox[0] + c.bbox[2]) / 2),
              math.floor((c.bbox[1] + c.bbox[3]) / 2))
    bbox_c = find_bounding_rectangle(centre, lines)
    bbox_u = find_bounding_rectangle((c.bbox[2], c.bbox[3]), lines)

    votes = defaultdict(int)
    for candidate in (bbox_l, bbox_c, bbox_u):
        votes[candidate] += 1

    if max(votes.values()) == 1:
        # All three probes disagree: trust the centre of the glyph.
        bbox = bbox_c
    else:
        # Otherwise take the cell at least two probes agree on.
        bbox = max(votes.items(), key=lambda vote: vote[1])[0]

    # None means the winning answer was "not inside any cell".
    if bbox is None:
        continue
    box_char_dict.setdefault(bbox, []).append(c)
# Probe the page on a 10-unit grid so that cells holding no characters still
# get an (empty) entry; cells already present are left untouched.
xmin, ymin, xmax, ymax = current_page.bbox
for x in range(int(xmin), int(xmax), 10):
    for y in range(int(ymin), int(ymax), 10):
        bbox = find_bounding_rectangle((x, y), lines)
        if bbox is not None:
            box_char_dict.setdefault(bbox, [])
def chars_to_string(chars):
    """Join the text of a collection of characters in reading order.

    Characters are grouped into rows by identical ``bbox[1]``.  Rows are
    emitted from the largest ``bbox[1]`` to the smallest and, inside a row,
    characters are ordered by ascending ``bbox[0]``.  Nothing is inserted
    between rows.

    Args:
        chars: Objects exposing ``bbox`` and ``get_text()``.

    Returns:
        The concatenated text, or ``""`` when ``chars`` is empty.
    """
    if not chars:
        return ""

    # Bucket the characters once instead of rescanning the whole collection
    # for every row.
    rows = {}
    for char in chars:
        rows.setdefault(char.bbox[1], []).append(char)

    pieces = []
    for baseline in sorted(rows, reverse=True):
        # sorted() is stable, so characters sharing an x keep input order.
        ordered = sorted(rows[baseline], key=lambda ch: ch.bbox[0])
        pieces.extend(ch.get_text() for ch in ordered)
    return "".join(pieces)
def boxes_to_table(box_record_dict):
    """Turn a cell -> characters mapping into a list of rows of strings.

    Cells are ``(x0, y0, x1, y1)`` tuples.  Cells sharing the same ``y0``
    form one row; rows are ordered from the largest ``y0`` to the smallest
    and the cells of a row by ascending ``x0``.  Each cell is rendered with
    ``chars_to_string``.

    Args:
        box_record_dict: Mapping of cell tuple to the characters inside it.

    Returns:
        A list of lists of strings; empty when the mapping is empty.
    """
    # Bucket the cells once instead of rescanning every key for every row.
    rows = {}
    for box in box_record_dict:
        rows.setdefault(box[1], []).append(box)

    table = []
    for bottom in sorted(rows, reverse=True):
        ordered = sorted(rows[bottom], key=lambda b: b[0])
        table.append([chars_to_string(box_record_dict[b]) for b in ordered])
    return table
# Store the table built from this page's cell -> characters mapping.
table_pages.append(boxes_to_table(box_char_dict))
# Hand back everything accumulated in table_pages.
return table_pages
def get_all_pdf_file_paths(root_path):
    """Lazily produce the absolute path of every PDF below ``root_path``.

    The tree is walked with ``os.walk``; a file qualifies when its name ends
    with the exact, lower-case suffix ``.pdf``.  A generator is returned.
    """
    return (
        os.path.abspath(os.path.join(folder, file_name))
        for folder, _subdirs, file_names in os.walk(root_path)
        for file_name in file_names
        if file_name.endswith('.pdf')
    )
#
# print main('docs_converted_to_pdfs/(498771424) GANESH SHANKAR JADHAWARnew.pdf')
# print main('/home/wolfram/project@work/resume-intent-classification/resumes_data/Lalit Kumar Pati_270716.pdf')
# Batch driver: extract the tables of every PDF under ``resumes_data`` and
# write one CSV row (file path, extracted tables) per file.
with open('tables_from_pdf_revised.csv', 'w') as f:
    writer = csv.writer(f)
    for file_p in get_all_pdf_file_paths('resumes_data'):
        try:
            table = main(file_p)
            writer.writerow([file_p, table])
            # Single-argument print() calls behave the same on Python 2 and
            # Python 3; the former print statements were a SyntaxError on
            # Python 3.
            print('%s %s' % (file_p, table))
        except Exception as e:
            # Top-level boundary: a PDF that cannot be processed is reported
            # and logged, then the batch carries on with the next file.
            trace = traceback.format_exc()
            print('errror %s %s %s' % (file_p, e, trace))
            logging.error('FILE_PATH %s \n %s', file_p, trace)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment