-
-
Save bf4648/626a2066f50f8b9a8f9613b38cf6c6f9 to your computer and use it in GitHub Desktop.
Please add this to your code and report!
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import subprocess | |
import shlex | |
import time | |
def convert_doc_to_pdf(ipfile_path, opfile_path):
    """Convert a document to PDF with headless LibreOffice.

    Args:
        ipfile_path: Path of the document to convert.
        opfile_path: Value passed to LibreOffice as ``--outdir``, i.e. the
            directory that receives the generated PDF.

    Returns:
        The exit status of the ``libreoffice`` process.
    """
    # Build argv directly instead of formatting a command string and
    # re-splitting it with shlex: paths that contain quotes or backslashes
    # are then handed to LibreOffice unchanged.
    command = [
        'libreoffice', '--headless',
        '--convert-to', 'pdf', ipfile_path,
        '--outdir', opfile_path,
    ]
    # Wait for the conversion so the PDF exists once this function returns
    # and no unreaped child process is left behind. The child inherits our
    # stdout, which is what the former ``stdout=True`` (fd 1) amounted to.
    return subprocess.call(command)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Largest extent a segment may have across its axis and still be classified
# as horizontal / vertical.
threshold_slope_in_pixels = 10  # Not accurate - since pixel tolerance decreases with the length of the line!

# bbox is (x0, y0, x1, y1). A segment whose y-extent is below the threshold is
# taken as horizontal, one whose x-extent is below it as vertical; a short
# segment can pass both tests and then appears in both lists.
# Each list is ordered by the midpoint of the segment across its axis.
horizontal_lines = sorted(
    (seg for seg in lines_list
     if abs(seg.bbox[1] - seg.bbox[3]) < threshold_slope_in_pixels),
    key=lambda seg: (seg.bbox[1] + seg.bbox[3]) / 2)
vertical_lines = sorted(
    (seg for seg in lines_list
     if abs(seg.bbox[0] - seg.bbox[2]) < threshold_slope_in_pixels),
    key=lambda seg: (seg.bbox[0] + seg.bbox[2]) / 2)
# def group_lines(vertical_lines, horizontal_lines): | |
# verticals = [] | |
# horizontals = [] | |
# diff_vertical_lines = (t.bbox[0] - s.bbox[2] for s, t in zip(vertical_lines, vertical_lines[1:])) | |
# diff_horizontal_lines = (t.bbox[1] - s.bbox[3] for s, t in zip(horizontal_lines, horizontal_lines[1:])) | |
# print diff_vertical_lines, diff_horizontal_lines | |
# Two consecutive segments belong to the same ruling when their positions
# differ by less than this amount.
tolerance = 5


def _group_by_position(sorted_lines, coord, tolerance):
    """Chain consecutive segments whose ``bbox[coord]`` values are close.

    ``sorted_lines`` is walked in order. A segment joins the current group
    when its ``bbox[coord]`` differs from that of the previous segment by
    less than ``tolerance``; otherwise it opens a new group.

    Returns:
        A list of groups (lists of segments). Empty input gives an empty
        list rather than a list holding one empty group.
    """
    groups = []
    previous = None
    for segment in sorted_lines:
        close = (previous is not None and
                 abs(previous.bbox[coord] - segment.bbox[coord]) < tolerance)
        if close:
            groups[-1].append(segment)
        else:
            groups.append([segment])
        previous = segment
    return groups


# Vertical segments are grouped on x0, horizontal segments on y0.
grouped_vertical_lines = _group_by_position(vertical_lines, 0, tolerance)
grouped_horizontal_lines = _group_by_position(horizontal_lines, 1, tolerance)
def sort_along_axis(lines, axis):
    """Return ``lines`` sorted by their start coordinate along ``axis``.

    Args:
        lines: Iterable of objects exposing ``bbox`` as (x0, y0, x1, y1).
        axis: 'h' to order by x0, 'v' to order by y0. Upper-case letters
            are accepted too, matching the 'H'/'V' markers that
            ``stitch_lines`` uses.

    Returns:
        A new list in ascending order of the chosen coordinate.

    Raises:
        ValueError: If ``axis`` names neither orientation. (Previously the
            function silently returned None in that case.)
    """
    coord = {'h': 0, 'v': 1}.get(str(axis).lower())
    if coord is None:
        raise ValueError("axis must be 'h' or 'v', got %r" % (axis,))
    return sorted(lines, key=lambda segment: segment.bbox[coord])
# Order every group along its own axis (horizontal groups by x0, vertical
# groups by y0) so neighbouring segments end up next to each other.
grouped_horizontal_lines = [sort_along_axis(group, axis='h')
                            for group in grouped_horizontal_lines]
grouped_vertical_lines = [sort_along_axis(group, axis='v')
                          for group in grouped_vertical_lines]
def stitch_lines(lines, axis, threshold=10):
    """Merge collinear segments that overlap or nearly touch.

    Args:
        lines: Segments of one group, each exposing ``bbox`` as
            (x0, y0, x1, y1), already ordered by their start coordinate
            along ``axis``.
        axis: 'H' for horizontal or 'V' for vertical segments (lower-case
            letters are accepted as well).
        threshold: Largest gap between the end of the current run and the
            start of the next segment that is still bridged.

    Returns:
        A list of ``(x0, y0, x1, y1, axis)`` tuples, one per stitched run,
        in input order. Empty input gives an empty list.

    Raises:
        ValueError: If ``axis`` is neither 'H' nor 'V'.
    """
    tag = str(axis).upper()
    if tag == 'H':
        start, end = 0, 2   # runs extend along x
    elif tag == 'V':
        start, end = 1, 3   # runs extend along y
    else:
        raise ValueError("axis must be 'H' or 'V', got %r" % (axis,))

    stitched = []
    current = None
    for line in lines:
        box = tuple(line.bbox)
        if current is None:
            current = box
            continue
        if current[end] > box[start] or (box[start] - current[end]) < threshold:
            # Overlapping or close enough: extend the run. The far end never
            # moves backwards, so a segment lying inside the run cannot
            # shrink it.
            merged = [current[0], current[1], box[2], box[3]]
            merged[end] = max(current[end], box[end])
            current = tuple(merged)
        else:
            # Gap: close the current run and start a new one at this segment
            # (the original kept comparing against the old run and dropped
            # the segment).
            stitched.append(current + (tag,))
            current = box
    if current is not None:
        # The last open run must be emitted as well.
        stitched.append(current + (tag,))
    return stitched
# Flatten the per-group results into one list of stitched segments per
# orientation.
stitched_v = [segment
              for group in grouped_vertical_lines
              for segment in stitch_lines(group, 'V')]
stiched_v = stitched_v  # original (misspelled) name, kept for existing references
stitched_h = [segment
              for group in grouped_horizontal_lines
              for segment in stitch_lines(group, 'H')]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: utf-8 | |
from __future__ import division | |
import pdfminer | |
import math | |
import logging | |
import traceback | |
import os | |
import csv | |
from collections import defaultdict | |
from pdfminer.pdfparser import PDFParser | |
from pdfminer.pdfdocument import PDFDocument | |
from pdfminer.pdfpage import PDFPage | |
from pdfminer.pdfpage import PDFTextExtractionNotAllowed | |
from pdfminer.pdfinterp import PDFResourceManager | |
from pdfminer.pdfinterp import PDFPageInterpreter | |
from pdfminer.layout import LAParams | |
from pdfminer.converter import PDFPageAggregator | |
logging.basicConfig(filename='pdf_table_extraction.log', level=logging.ERROR) | |
def main(example_file):
    """Extract the ruled tables found on each page of a PDF.

    Args:
        example_file: Path of the PDF file to process.

    Returns:
        A list with one entry per page; each entry is the table that
        ``boxes_to_table`` builds for that page (a list of rows, each a
        list of cell strings).
    """
    # One table is appended here for every processed page.
    table_pages = []
def extract_layout_by_page(pdf_path):
    """Run pdfminer layout analysis on every page of ``pdf_path``.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A list holding, for each page in document order, the result that
        ``PDFPageAggregator.get_result()`` reports for that page.

    Raises:
        PDFTextExtractionNotAllowed: If the document is not extractable.
    """
    laparams = LAParams()
    layouts = []
    # The file has to stay open while the pages are interpreted; the
    # ``with`` block closes it afterwards (the handle used to leak).
    with open(pdf_path, 'rb') as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layouts.append(device.get_result())
    return layouts
# | |
# | |
# example_file = "Emails/CV_Hardik_Parmar.pdf" | |
# example_file = "docs_converted_to_pdfs/1048889Rahul.pdf" | |
# # example_file = "Emails/Aakarsh Prasad_resume (final)_2.pdf" | |
# Analyse the whole document up front: one layout object per page.
page_layouts = extract_layout_by_page(example_file)

# Layout classes that extract_characters descends into when it collects
# LTChar objects. Kept as a tuple so it can be handed to isinstance().
TEXT_ELEMENTS = (
    pdfminer.layout.LTTextBox,
    pdfminer.layout.LTTextBoxHorizontal,
    pdfminer.layout.LTTextLine,
    pdfminer.layout.LTTextLineHorizontal,
)
def flatten(lst):
    """Flatten one level of nesting: a list of lists becomes one list."""
    flat = []
    for elem in lst:
        flat.extend(elem)
    return flat
def extract_characters(element):
    """Recursively collect the individual characters of a text element.

    Args:
        element: An ``LTChar``, one of the ``TEXT_ELEMENTS`` containers, or
            a plain list of such objects.

    Returns:
        A flat list of ``LTChar`` objects in traversal order; an empty list
        for any other kind of object.
    """
    if isinstance(element, pdfminer.layout.LTChar):
        return [element]
    # Text containers and plain lists are walked child by child in the same
    # way, so both cases share one branch.
    if isinstance(element, tuple(TEXT_ELEMENTS)) or isinstance(element, list):
        return flatten([extract_characters(child) for child in element])
    return []
for current_page in page_layouts:
    # Sort the page's top-level layout objects into text boxes, rectangles
    # and remaining curves.
    texts = []
    rects = []
    lines_list = []
    for e in current_page:
        if isinstance(e, pdfminer.layout.LTTextBoxHorizontal):
            texts.append(e)
        elif isinstance(e, pdfminer.layout.LTRect):
            rects.append(e)
        elif isinstance(e, pdfminer.layout.LTCurve):
            # The original tested LTCurve twice. In pdfminer LTLine is a
            # subclass of LTCurve, so one test covers straight lines too.
            lines_list.append(e)

    characters = extract_characters(texts)
# ----------------------------------------------------------------------------------------------------------- | |
# Largest extent a segment may have across its axis and still be classified
# as horizontal / vertical.
threshold_slope_in_pixels = 10  # Not accurate - since pixel tolerance decreases with the length of the line!

# bbox is (x0, y0, x1, y1). A segment whose y-extent is below the threshold is
# taken as horizontal, one whose x-extent is below it as vertical; a short
# segment can pass both tests and then appears in both lists.
# Each list is ordered by the midpoint of the segment across its axis.
horizontal_lines = sorted(
    (seg for seg in lines_list
     if abs(seg.bbox[1] - seg.bbox[3]) < threshold_slope_in_pixels),
    key=lambda seg: (seg.bbox[1] + seg.bbox[3]) / 2)
vertical_lines = sorted(
    (seg for seg in lines_list
     if abs(seg.bbox[0] - seg.bbox[2]) < threshold_slope_in_pixels),
    key=lambda seg: (seg.bbox[0] + seg.bbox[2]) / 2)
# def group_lines(vertical_lines, horizontal_lines): | |
# verticals = [] | |
# horizontals = [] | |
# diff_vertical_lines = (t.bbox[0] - s.bbox[2] for s, t in zip(vertical_lines, vertical_lines[1:])) | |
# diff_horizontal_lines = (t.bbox[1] - s.bbox[3] for s, t in zip(horizontal_lines, horizontal_lines[1:])) | |
# print diff_vertical_lines, diff_horizontal_lines | |
# Two consecutive segments belong to the same ruling when their positions
# differ by less than this amount.
tolerance = 5


def _group_by_position(sorted_lines, coord, tolerance):
    """Chain consecutive segments whose ``bbox[coord]`` values are close.

    ``sorted_lines`` is walked in order. A segment joins the current group
    when its ``bbox[coord]`` differs from that of the previous segment by
    less than ``tolerance``; otherwise it opens a new group.

    Returns:
        A list of groups (lists of segments). Empty input gives an empty
        list rather than a list holding one empty group.
    """
    groups = []
    previous = None
    for segment in sorted_lines:
        close = (previous is not None and
                 abs(previous.bbox[coord] - segment.bbox[coord]) < tolerance)
        if close:
            groups[-1].append(segment)
        else:
            groups.append([segment])
        previous = segment
    return groups


# Vertical segments are grouped on x0, horizontal segments on y0.
grouped_vertical_lines = _group_by_position(vertical_lines, 0, tolerance)
grouped_horizontal_lines = _group_by_position(horizontal_lines, 1, tolerance)
def sort_along_axis(lines, axis):
    """Return ``lines`` sorted by their start coordinate along ``axis``.

    Args:
        lines: Iterable of objects exposing ``bbox`` as (x0, y0, x1, y1).
        axis: 'h' to order by x0, 'v' to order by y0. Upper-case letters
            are accepted too, matching the 'H'/'V' markers that
            ``stitch_lines`` uses.

    Returns:
        A new list in ascending order of the chosen coordinate.

    Raises:
        ValueError: If ``axis`` names neither orientation. (Previously the
            function silently returned None in that case.)
    """
    coord = {'h': 0, 'v': 1}.get(str(axis).lower())
    if coord is None:
        raise ValueError("axis must be 'h' or 'v', got %r" % (axis,))
    return sorted(lines, key=lambda segment: segment.bbox[coord])
# Order every group along its own axis (horizontal groups by x0, vertical
# groups by y0) so neighbouring segments end up next to each other.
grouped_horizontal_lines = [sort_along_axis(group, axis='h')
                            for group in grouped_horizontal_lines]
grouped_vertical_lines = [sort_along_axis(group, axis='v')
                          for group in grouped_vertical_lines]
def stitch_lines(lines, axis, threshold=10):
    """Merge collinear segments that overlap or nearly touch.

    Args:
        lines: Segments of one group, each exposing ``bbox`` as
            (x0, y0, x1, y1), already ordered by their start coordinate
            along ``axis``.
        axis: 'H' for horizontal or 'V' for vertical segments (lower-case
            letters are accepted as well).
        threshold: Largest gap between the end of the current run and the
            start of the next segment that is still bridged.

    Returns:
        A list of ``(x0, y0, x1, y1, axis)`` tuples, one per stitched run,
        in input order. Empty input gives an empty list.

    Raises:
        ValueError: If ``axis`` is neither 'H' nor 'V'.
    """
    tag = str(axis).upper()
    if tag == 'H':
        start, end = 0, 2   # runs extend along x
    elif tag == 'V':
        start, end = 1, 3   # runs extend along y
    else:
        raise ValueError("axis must be 'H' or 'V', got %r" % (axis,))

    stitched = []
    current = None
    for line in lines:
        box = tuple(line.bbox)
        if current is None:
            current = box
            continue
        if current[end] > box[start] or (box[start] - current[end]) < threshold:
            # Overlapping or close enough: extend the run. The far end never
            # moves backwards, so a segment lying inside the run cannot
            # shrink it.
            merged = [current[0], current[1], box[2], box[3]]
            merged[end] = max(current[end], box[end])
            current = tuple(merged)
        else:
            # Gap: close the current run and start a new one at this segment
            # (the original kept comparing against the old run and dropped
            # the segment).
            stitched.append(current + (tag,))
            current = box
    if current is not None:
        # The last open run must be emitted as well.
        stitched.append(current + (tag,))
    return stitched
# Flatten the per-group results into one list per orientation. Any None
# placeholder (reported for a group that holds no segment) is dropped entry
# by entry, so only real (x0, y0, x1, y1, axis) tuples remain and a page
# without lines simply yields empty lists.
stitched_v = [segment
              for group in grouped_vertical_lines
              for segment in stitch_lines(group, 'V')
              if segment is not None]
stitched_h = [segment
              for group in grouped_horizontal_lines
              for segment in stitch_lines(group, 'H')
              if segment is not None]
# ---------------------------------------------------------------------------------------------- | |
def does_it_intersect(x, bounds):
    """Return True when ``x`` lies inside the closed interval ``bounds``.

    Args:
        x: Coordinate to test.
        bounds: ``(xmin, xmax)`` pair; both ends are inclusive.
    """
    # Unpacked here because tuple parameters in the signature are Python 2
    # only syntax.
    xmin, xmax = bounds
    return xmin <= x <= xmax
def find_bounding_rectangle(point, lines):
    """Return the cell ``(x0, y0, x1, y1)`` that encloses ``point``.

    Args:
        point: ``(x, y)`` pair to locate.
        lines: Iterable of ``(x0, y0, x1, y1, axis)`` tuples whose ``axis``
            entry is "H" or "V".

    Returns:
        The rectangle bounded by the nearest vertical line on either side
        of ``x`` and the nearest horizontal line on either side of ``y``,
        or None when the point is not enclosed on all four sides.
    """
    # Unpacked here because tuple parameters in the signature are Python 2
    # only syntax.
    x, y = point

    # Only lines whose span covers the point can bound it.
    verticals = [seg for seg in lines
                 if seg[4] == "V" and does_it_intersect(y, (seg[1], seg[3]))]
    horizontals = [seg for seg in lines
                   if seg[4] == "H" and does_it_intersect(x, (seg[0], seg[2]))]
    if len(verticals) < 2 or len(horizontals) < 2:
        return None

    # Strict comparisons: a line running exactly through the point counts
    # for neither side.
    left = [seg[0] for seg in verticals if seg[0] < x]
    right = [seg[0] for seg in verticals if seg[0] > x]
    if not left or not right:
        return None

    lower = [seg[1] for seg in horizontals if seg[1] < y]
    upper = [seg[1] for seg in horizontals if seg[1] > y]
    if not lower or not upper:
        return None

    return (max(left), max(lower), min(right), min(upper))
def width(rect):
    """Return the shorter side of ``rect.bbox`` (x0, y0, x1, y1)."""
    x0, y0, x1, y1 = rect.bbox
    return min(x1 - x0, y1 - y0)
def area(rect):
    """Return the area of ``rect.bbox`` (x0, y0, x1, y1)."""
    x0, y0, x1, y1 = rect.bbox
    return (x1 - x0) * (y1 - y0)
def cast_as_line(rect):
    """Replace a rectangle with a line along its longest dimension.

    Returns:
        ``(x0, y0, x1, y0, "H")`` when the box is wider than tall, else
        ``(x0, y0, x0, y1, "V")``; a square therefore becomes a vertical
        line.
    """
    x0, y0, x1, y1 = rect.bbox
    if x1 - x0 > y1 - y0:
        return (x0, y0, x1, y0, "H")
    return (x0, y0, x0, y1, "V")
# Thin rectangles (shorter side under 2, area over 1) are treated as lines;
# they are combined with the stitched vertical and horizontal segments.
lines = [cast_as_line(r) for r in rects
         if width(r) < 2 and area(r) > 1]
lines = lines + stitched_v + stitched_h
# Maps a cell (x0, y0, x1, y1) to the list of characters assigned to it.
box_char_dict = {}
for c in characters:
    # Probe three points of the character box - the (x0, y0) corner, the
    # centre and the (x1, y1) corner - and look up the cell of each.
    bbox_l = find_bounding_rectangle((c.bbox[0], c.bbox[1]), lines)
    c_x = math.floor((c.bbox[0] + c.bbox[2]) / 2)
    c_y = math.floor((c.bbox[1] + c.bbox[3]) / 2)
    bbox_c = find_bounding_rectangle((c_x, c_y), lines)
    bbox_u = find_bounding_rectangle((c.bbox[2], c.bbox[3]), lines)

    # choose the bounding box that occurs the majority of times
    bboxes = defaultdict(int)
    for candidate in (bbox_l, bbox_c, bbox_u):
        bboxes[candidate] += 1

    # if all values are in different boxes, default to character center.
    # otherwise choose the majority.
    if max(bboxes.values()) == 1:
        bbox = bbox_c
    else:
        bbox = max(bboxes.items(), key=lambda item: item[1])[0]

    # None means the winning probe result was "not inside any cell".
    if bbox is None:
        continue
    # setdefault avoids the linear ``in d.keys()`` scan Python 2 performs.
    box_char_dict.setdefault(bbox, []).append(c)
# Sample the page on a grid with a step of 10 so that cells containing no
# characters still get an (empty) entry. Cells already known keep their
# characters.
xmin, ymin, xmax, ymax = current_page.bbox
for x in range(int(xmin), int(xmax), 10):
    for y in range(int(ymin), int(ymax), 10):
        bbox = find_bounding_rectangle((x, y), lines)
        if bbox is not None:
            box_char_dict.setdefault(bbox, [])
def chars_to_string(chars):
    """Join a collection of characters into one string.

    Characters sharing the same ``bbox[1]`` form a row. Rows are emitted in
    descending ``bbox[1]`` order and, within a row, characters in ascending
    ``bbox[0]`` order. No separator is inserted between rows.

    Args:
        chars: Objects exposing ``bbox`` and ``get_text()``.

    Returns:
        The concatenated text, or "" when ``chars`` is empty.
    """
    if not chars:
        return ""
    # Bucket the characters by row once instead of rescanning the whole
    # collection for every row.
    rows = {}
    for char in chars:
        rows.setdefault(char.bbox[1], []).append(char)
    pieces = []
    for row_y in sorted(rows, reverse=True):
        ordered = sorted(rows[row_y], key=lambda char: char.bbox[0])
        pieces.extend(char.get_text() for char in ordered)
    return "".join(pieces)
def boxes_to_table(box_record_dict):
    """Convert a cell -> characters mapping into a list of rows of strings.

    Cells sharing the same ``y0`` (second tuple entry) form a row. Rows are
    emitted in descending ``y0`` order and, within a row, cells in ascending
    ``x0`` order.

    Args:
        box_record_dict: Maps ``(x0, y0, x1, y1)`` cells to the characters
            they contain.

    Returns:
        A list of rows, each a list with the text of every cell.
    """
    # Bucket the cells by row once instead of rescanning all of them for
    # every row.
    rows = {}
    for box in box_record_dict:
        rows.setdefault(box[1], []).append(box)
    table = []
    for row_y in sorted(rows, reverse=True):
        ordered = sorted(rows[row_y], key=lambda box: box[0])
        table.append([chars_to_string(box_record_dict[box]) for box in ordered])
    return table
table_pages.append(boxes_to_table(box_char_dict)) | |
return table_pages | |
def get_all_pdf_file_paths(root_path):
    """Lazily produce the absolute path of every PDF below ``root_path``.

    Args:
        root_path: Directory that is walked recursively with ``os.walk``.

    Returns:
        A generator of absolute paths of the files whose name ends with
        ``.pdf`` (the match is case-sensitive).
    """
    return (
        os.path.abspath(os.path.join(directory, file_name))
        for directory, _subdirs, file_names in os.walk(root_path)
        for file_name in file_names
        if file_name.endswith('.pdf')
    )
# | |
# print main('docs_converted_to_pdfs/(498771424) GANESH SHANKAR JADHAWARnew.pdf') | |
# print main('/home/wolfram/project@work/resume-intent-classification/resumes_data/Lalit Kumar Pati_270716.pdf') | |
# Batch driver: extract the tables of every PDF below ``resumes_data`` and
# write one CSV row (path, tables) per file.
# NOTE(review): this still runs at import time, as before; consider moving
# it under an ``if __name__ == "__main__":`` guard.
with open('tables_from_pdf_revised.csv', 'w') as f:
    writer = csv.writer(f)
    for file_p in get_all_pdf_file_paths('resumes_data'):
        try:
            table = main(file_p)
            writer.writerow([file_p, table])
            # Single-argument print() yields the same output on Python 2
            # and Python 3.
            print("%s %s" % (file_p, table))
        except Exception as e:
            # Best effort: a PDF that cannot be processed is reported and
            # logged, then the batch carries on with the next file.
            details = traceback.format_exc()
            print("errror %s %s %s" % (file_p, e, details))
            logging.error('FILE_PATH %s \n %s', file_p, details)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment