Please add this to your code and report!
import os | |
import subprocess | |
import shlex | |
import time | |
def convert_doc_to_pdf(ipfile_path, opfile_path):
    """Start a headless LibreOffice process that converts a document to PDF.

    The conversion runs asynchronously: the process is launched and this
    function returns without waiting for it to finish.

    Args:
        ipfile_path: Path of the document to convert.
        opfile_path: Value handed to LibreOffice's ``--outdir`` option.

    Returns:
        The ``subprocess.Popen`` handle, so a caller that needs the result
        can ``wait()`` on it. Callers that ignore the return value behave
        exactly as before.
    """
    # Build argv directly instead of formatting a shell-like string and
    # re-splitting it with shlex: a path containing a double quote or a
    # backslash would otherwise be split into the wrong arguments.
    command = [
        'libreoffice', '--headless',
        '--convert-to', 'pdf', ipfile_path,
        '--outdir', opfile_path,
    ]
    # The child inherits our stdout. The original passed ``stdout=True``,
    # which Popen treats as the integer file descriptor 1, i.e. the same
    # stream, so omitting the argument keeps the behaviour.
    return subprocess.Popen(command)
# NOTE(review): this module-level fragment reads ``lines_list``, which is not
# defined anywhere above this point in the file -- confirm where it is meant
# to come from before relying on it.
threshold_slope_in_pixels = 10  # Not accurate - since pixel tolerance decreases with the length of the line!
tolerance = 5


def _chain_by_coordinate(sorted_lines, index, max_gap):
    """Split position-sorted lines into runs of near-equal ``bbox[index]``.

    A line joins the current run when its ``bbox[index]`` differs from the
    previous line's by less than ``max_gap``; otherwise it starts a new run.
    An empty input produces no runs at all.
    """
    runs = []
    for candidate in sorted_lines:
        if runs and abs(runs[-1][-1].bbox[index] - candidate.bbox[index]) < max_gap:
            runs[-1].append(candidate)
        else:
            runs.append([candidate])
    return runs


# A line counts as horizontal (vertical) when its bounding box is less than
# ``threshold_slope_in_pixels`` tall (wide). Each set is ordered by the
# midpoint of the line on the perpendicular axis.
horizontal_lines = sorted(
    (x for x in lines_list
     if abs(x.bbox[1] - x.bbox[3]) < threshold_slope_in_pixels),
    key=lambda x: (x.bbox[1] + x.bbox[3]) / 2.0)
vertical_lines = sorted(
    (x for x in lines_list
     if abs(x.bbox[0] - x.bbox[2]) < threshold_slope_in_pixels),
    key=lambda x: (x.bbox[0] + x.bbox[2]) / 2.0)

# Vertical lines are grouped by their x0, horizontal lines by their y0.
grouped_vertical_lines = _chain_by_coordinate(vertical_lines, 0, tolerance)
grouped_horizontal_lines = _chain_by_coordinate(horizontal_lines, 1, tolerance)
def sort_along_axis(lines, axis):
    """Return ``lines`` ordered by where each one starts along ``axis``.

    Args:
        lines: Iterable of objects exposing ``bbox`` as an indexable
            ``(x0, y0, x1, y1)`` sequence.
        axis: ``'h'`` orders by ``bbox[0]``; ``'v'`` orders by ``bbox[1]``.
            Upper-case ``'H'`` / ``'V'`` (the spelling ``stitch_lines``
            uses) are accepted as well.

    Returns:
        A new sorted list; the input is not modified.

    Raises:
        ValueError: If ``axis`` is not one of the accepted letters. The
            original silently returned ``None`` here, which only failed
            later when the result was iterated.
    """
    sort_index = {'h': 0, 'v': 1}.get(str(axis).lower())
    if sort_index is None:
        raise ValueError("axis must be 'h' or 'v', got %r" % (axis,))
    return sorted(lines, key=lambda x: x.bbox[sort_index])
# Within every group, order the segments by their starting coordinate along
# the group's own direction (x for horizontal groups, y for vertical ones).
grouped_horizontal_lines = list(map(
    lambda h_group: sort_along_axis(h_group, axis='h'),
    grouped_horizontal_lines))
grouped_vertical_lines = list(map(
    lambda v_group: sort_along_axis(v_group, axis='v'),
    grouped_vertical_lines))
def stitch_lines(lines, axis, threshold=10):
    """Merge collinear segments that overlap or nearly touch.

    The segments are walked in the order given, so ``lines`` should already
    be ordered by start coordinate along ``axis``.

    Args:
        lines: Iterable of objects exposing ``bbox`` as ``(x0, y0, x1, y1)``.
        axis: ``'H'`` to stitch along x (compares ``bbox[0]`` with the
            running ``x1``) or ``'V'`` to stitch along y (compares
            ``bbox[1]`` with the running ``y1``).
        threshold: A segment is merged into the running one when it starts
            before the running end, or less than ``threshold`` past it.

    Returns:
        A list of ``(x0, y0, x1, y1, axis)`` tuples, one per stitched
        segment, in input order. Empty input gives an empty list.

    Raises:
        ValueError: If ``axis`` is neither ``'H'`` nor ``'V'``.
    """
    if axis == 'H':
        start, end = 0, 2
    elif axis == 'V':
        start, end = 1, 3
    else:
        raise ValueError("axis must be 'H' or 'V', got %r" % (axis,))

    sub_groups = []
    current = None  # running merged segment as a mutable [x0, y0, x1, y1]
    for line in lines:
        box = line.bbox
        if current is None:
            current = [box[0], box[1], box[2], box[3]]
            continue
        gap = box[start] - current[end]
        if gap < 0 or gap < threshold:
            # Extend the running segment. ``max`` keeps the far end from
            # shrinking when this segment lies inside the running one.
            far_end = max(current[end], box[end])
            current[2], current[3] = box[2], box[3]
            current[end] = far_end
        else:
            # Gap too large: emit the running segment and restart from this
            # line (the original kept the stale segment and lost this one).
            sub_groups.append(tuple(current) + (axis,))
            current = [box[0], box[1], box[2], box[3]]
    # Always emit the last running segment; the original only did so when no
    # gap had been seen, dropping the tail otherwise.
    if current is not None:
        sub_groups.append(tuple(current) + (axis,))
    return sub_groups
# Flatten the per-group stitching results into one list per orientation.
stitched_v = [line for lines in grouped_vertical_lines for line in stitch_lines(lines, 'V')]
# Backward-compatible alias: this list was originally bound only under the
# misspelled name ``stiched_v``.
stiched_v = stitched_v
stitched_h = [line for lines in grouped_horizontal_lines for line in stitch_lines(lines, 'H')]
# print stiched_v, stitched_h | |
# coding: utf-8 | |
from __future__ import division | |
import pdfminer | |
import math | |
import logging | |
import traceback | |
import os | |
import csv | |
from collections import defaultdict | |
from pdfminer.pdfparser import PDFParser | |
from pdfminer.pdfdocument import PDFDocument | |
from pdfminer.pdfpage import PDFPage | |
from pdfminer.pdfpage import PDFTextExtractionNotAllowed | |
from pdfminer.pdfinterp import PDFResourceManager | |
from pdfminer.pdfinterp import PDFPageInterpreter | |
from pdfminer.layout import LAParams | |
from pdfminer.converter import PDFPageAggregator | |
# Send records of level ERROR and above to a log file; the relative file name
# is resolved against the current working directory.
logging.basicConfig(level=logging.ERROR, filename='pdf_table_extraction.log')
def main(example_file):
    """Extract ruled tables from every page of a PDF.

    For each page the text boxes, rectangles and curves are collected, the
    ruling lines are reconstructed, every character is assigned to the cell
    (rectangle bounded by ruling lines) that contains it, and the cells are
    laid out as rows of strings.

    Args:
        example_file: Path of the PDF file to read.

    Returns:
        A list with one entry per page. Each entry is a list of rows (ordered
        by descending cell ``y0``), and each row is a list of cell strings
        (ordered by ascending cell ``x0``).

    Raises:
        PDFTextExtractionNotAllowed: If the document does not allow
            extraction.
    """
    table_pages = []
    for current_page in _extract_layout_by_page(example_file):
        texts = []
        rects = []
        lines_list = []
        for e in current_page:
            if isinstance(e, pdfminer.layout.LTTextBoxHorizontal):
                texts.append(e)
            elif isinstance(e, pdfminer.layout.LTRect):
                rects.append(e)
            # The original tested LTCurve twice in this branch; a single
            # check is equivalent.
            elif isinstance(e, pdfminer.layout.LTCurve):
                lines_list.append(e)

        characters = _extract_characters(texts)
        stitched_v, stitched_h = _collect_ruling_lines(lines_list)
        lines = _rects_as_lines(rects) + stitched_v + stitched_h

        box_char_dict = {}
        for c in characters:
            bbox = _cell_for_character(c, lines)
            if bbox is not None:
                box_char_dict.setdefault(bbox, []).append(c)
        _add_empty_cells(box_char_dict, current_page.bbox, lines)

        table_pages.append(_boxes_to_table(box_char_dict))
    return table_pages


def _extract_layout_by_page(pdf_path):
    """Run pdfminer layout analysis and return one layout object per page."""
    # All parsing happens inside the ``with`` block so the file handle is
    # closed once the layouts are built (the original never closed it).
    with open(pdf_path, 'rb') as fp:
        document = PDFDocument(PDFParser(fp))
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        layouts = []
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layouts.append(device.get_result())
    return layouts


def _extract_characters(element):
    """Recursively collect the LTChar objects inside text elements or lists."""
    layout = pdfminer.layout
    if isinstance(element, layout.LTChar):
        return [element]
    containers = (
        layout.LTTextBox,
        layout.LTTextBoxHorizontal,
        layout.LTTextLine,
        layout.LTTextLineHorizontal,
        list,
    )
    if isinstance(element, containers):
        return [char for child in element
                for char in _extract_characters(child)]
    return []


def _group_by_position(sorted_lines, index, tolerance):
    """Split position-sorted lines into runs of near-equal ``bbox[index]``."""
    groups = []
    for line in sorted_lines:
        if groups and abs(groups[-1][-1].bbox[index] - line.bbox[index]) < tolerance:
            groups[-1].append(line)
        else:
            groups.append([line])
    return groups


def _stitch_lines(lines, axis, threshold=10):
    """Merge start-ordered collinear segments that overlap or nearly touch.

    Returns ``(x0, y0, x1, y1, axis)`` tuples. Unlike the original nested
    helper, the segment that follows a gap is kept, the final segment is
    always emitted, and an empty input gives an empty list.
    """
    start, end = (0, 2) if axis == 'H' else (1, 3)
    stitched = []
    current = None
    for line in lines:
        box = line.bbox
        if current is None:
            current = [box[0], box[1], box[2], box[3]]
            continue
        gap = box[start] - current[end]
        if gap < 0 or gap < threshold:
            # ``max`` stops a contained segment from shrinking the extent.
            far_end = max(current[end], box[end])
            current[2], current[3] = box[2], box[3]
            current[end] = far_end
        else:
            stitched.append(tuple(current) + (axis,))
            current = [box[0], box[1], box[2], box[3]]
    if current is not None:
        stitched.append(tuple(current) + (axis,))
    return stitched


def _collect_ruling_lines(lines_list, slope_tolerance=10, group_tolerance=5):
    """Turn raw curve objects into stitched vertical and horizontal lines.

    Returns a ``(stitched_v, stitched_h)`` pair of tuple lists.
    """
    # Not accurate - the pixel tolerance should shrink with line length.
    horizontal = sorted(
        (l for l in lines_list if abs(l.bbox[1] - l.bbox[3]) < slope_tolerance),
        key=lambda l: (l.bbox[1] + l.bbox[3]) / 2.0)
    vertical = sorted(
        (l for l in lines_list if abs(l.bbox[0] - l.bbox[2]) < slope_tolerance),
        key=lambda l: (l.bbox[0] + l.bbox[2]) / 2.0)

    stitched_v = []
    for group in _group_by_position(vertical, 0, group_tolerance):
        group.sort(key=lambda l: l.bbox[1])
        stitched_v.extend(_stitch_lines(group, 'V'))

    stitched_h = []
    for group in _group_by_position(horizontal, 1, group_tolerance):
        group.sort(key=lambda l: l.bbox[0])
        stitched_h.extend(_stitch_lines(group, 'H'))

    return stitched_v, stitched_h


def _rects_as_lines(rects):
    """Replace thin rectangles with a line along their longest dimension."""
    lines = []
    for rect in rects:
        x0, y0, x1, y1 = rect.bbox
        dx, dy = x1 - x0, y1 - y0
        # Only rectangles thinner than 2 units with an area above 1 qualify.
        if min(dx, dy) < 2 and dx * dy > 1:
            if dx > dy:
                lines.append((x0, y0, x1, y0, "H"))
            else:
                lines.append((x0, y0, x0, y1, "V"))
    return lines


def _find_bounding_rectangle(point, lines):
    """Return the ``(x0, y0, x1, y1)`` cell around ``point``, or ``None``.

    The cell is bounded by the nearest vertical line on each side whose
    y-range covers the point, and the nearest horizontal line above and below
    whose x-range covers the point.
    """
    x, y = point
    v_intersects = [l for l in lines if l[4] == "V" and l[1] <= y <= l[3]]
    h_intersects = [l for l in lines if l[4] == "H" and l[0] <= x <= l[2]]
    if len(v_intersects) < 2 or len(h_intersects) < 2:
        return None

    v_left = [v[0] for v in v_intersects if v[0] < x]
    v_right = [v[0] for v in v_intersects if v[0] > x]
    if not v_left or not v_right:
        return None

    h_down = [h[1] for h in h_intersects if h[1] < y]
    h_up = [h[1] for h in h_intersects if h[1] > y]
    if not h_down or not h_up:
        return None

    return (max(v_left), max(h_down), min(v_right), min(h_up))


def _cell_for_character(char, lines):
    """Pick the cell for a character by majority vote of three probe points.

    The lower-left corner, the (floored) centre and the upper-right corner of
    the character box each vote for the cell containing them. If all three
    disagree the centre wins.
    """
    x0, y0, x1, y1 = char.bbox
    lower = _find_bounding_rectangle((x0, y0), lines)
    center = _find_bounding_rectangle(
        (math.floor((x0 + x1) / 2.0), math.floor((y0 + y1) / 2.0)), lines)
    upper = _find_bounding_rectangle((x1, y1), lines)
    # With three votes, a majority exists only if two agree. If the two
    # corners agree they are the majority; otherwise the centre is either
    # part of the majority or the tie-break default.
    return lower if lower == upper else center


def _add_empty_cells(box_char_dict, page_bbox, lines):
    """Register cells that contain no characters by probing a 10-unit grid."""
    xmin, ymin, xmax, ymax = page_bbox
    for x in range(int(xmin), int(xmax), 10):
        for y in range(int(ymin), int(ymax), 10):
            bbox = _find_bounding_rectangle((x, y), lines)
            if bbox is not None:
                box_char_dict.setdefault(bbox, [])


def _chars_to_string(chars):
    """Join characters left to right within a row, rows by descending y0."""
    rows = defaultdict(list)
    for c in chars:
        rows[c.bbox[1]].append(c)
    return "".join(
        c.get_text()
        for y in sorted(rows, reverse=True)
        for c in sorted(rows[y], key=lambda c: c.bbox[0]))


def _boxes_to_table(box_record_dict):
    """Convert a cell -> characters mapping into a list of rows of strings.

    Cells sharing the same ``y0`` form a row; rows are ordered by descending
    ``y0`` and the cells of a row by ascending ``x0``.
    """
    rows = defaultdict(list)
    for box in sorted(box_record_dict, key=lambda b: b[0]):
        rows[box[1]].append(box)
    return [[_chars_to_string(box_record_dict[b]) for b in rows[y]]
            for y in sorted(rows, reverse=True)]
def get_all_pdf_file_paths(root_path):
    """Lazily yield the absolute path of every PDF file under ``root_path``.

    The directory tree is walked with ``os.walk``. A file is considered a PDF
    when its name ends in ``.pdf``, compared case-insensitively so that names
    such as ``SCAN.PDF`` are no longer skipped.

    Args:
        root_path: Directory at which the walk starts.

    Returns:
        An iterator of absolute path strings.
    """
    for root_, _dirs, files in os.walk(root_path):
        for file_name in files:
            if file_name.lower().endswith('.pdf'):
                yield os.path.abspath(os.path.join(root_, file_name))
# | |
# print main('docs_converted_to_pdfs/(498771424) GANESH SHANKAR JADHAWARnew.pdf') | |
# print main('/home/wolfram/project@work/resume-intent-classification/resumes_data/Lalit Kumar Pati_270716.pdf') | |
# Batch driver: run the extractor over every PDF below ``resumes_data`` and
# write one CSV row per file (the path, then the extracted tables). A failure
# on one file is printed and logged, and the loop moves on to the next file.
with open('tables_from_pdf_revised.csv', 'w') as csv_file:
    writer = csv.writer(csv_file)
    for file_p in get_all_pdf_file_paths('resumes_data'):
        try:
            table = main(file_p)
            writer.writerow([file_p, table])
            # Single-argument print with %-formatting gives the same
            # space-separated output as the multi-item print statement.
            print("%s %s" % (file_p, table))
        except Exception as err:
            trace_text = traceback.format_exc()
            print("errror %s %s %s" % (file_p, err, trace_text))
            logging.error('FILE_PATH %s \n %s', file_p, trace_text)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment