bf4648/doc_to_pdf.py

## doc_to_pdf.py
import os
import subprocess
import shlex
import time
def convert_doc_to_pdf(ipfile_path, opfile_path):
    """Launch a headless LibreOffice conversion of a document to PDF.

    The command is started with ``subprocess.Popen``; this function does
    not wait for it to finish.

    Args:
        ipfile_path: Path of the document to convert.
        opfile_path: Value passed to LibreOffice's ``--outdir`` option.

    Returns:
        The ``subprocess.Popen`` handle of the started process, so callers
        can ``wait()`` on it or inspect its ``returncode``.
    """
    # Pass an argument list instead of formatting a quoted string and
    # re-splitting it with shlex: paths containing quotes or backslashes
    # were split into the wrong arguments.
    command = [
        'libreoffice', '--headless', '--convert-to', 'pdf',
        ipfile_path, '--outdir', opfile_path,
    ]
    # stdout is left at its default (inherited from this process); the
    # previous ``stdout=True`` is not a documented Popen value.
    return subprocess.Popen(command)

## line_table_creator.py
# Maximum bounding-box thickness for a curve to be treated as axis-aligned.
# Not accurate - since pixel tolerance decreases with the length of the line!
threshold_slope_in_pixels = 10

# Keep curves whose bounding box is thin along one axis, ordered by the
# midpoint of their position on that axis.
horizontal_lines = [
    curve for curve in lines_list
    if abs(curve.bbox[1] - curve.bbox[3]) < threshold_slope_in_pixels
]
vertical_lines = [
    curve for curve in lines_list
    if abs(curve.bbox[0] - curve.bbox[2]) < threshold_slope_in_pixels
]
horizontal_lines.sort(key=lambda x: (x.bbox[1] + x.bbox[3])/2)
vertical_lines.sort(key=lambda x: (x.bbox[0] + x.bbox[2])/2)

# Consecutive lines whose leading coordinate differs by less than
# ``tolerance`` are collected into the same group.
tolerance = 5
grouped_vertical_lines = []
grouped_horizontal_lines = []

prev = None
tmp = []
for line in vertical_lines:
    print(line)
    # A jump in x0 relative to the previous line closes the current group.
    if prev and not abs(prev.bbox[0] - line.bbox[0]) < tolerance:
        grouped_vertical_lines.append(tmp)
        tmp = []
    tmp.append(line)
    prev = line
grouped_vertical_lines.append(tmp)

prev = None
tmp = []
for line in horizontal_lines:
    # A jump in y0 relative to the previous line closes the current group.
    if prev and not abs(prev.bbox[1] - line.bbox[1]) < tolerance:
        grouped_horizontal_lines.append(tmp)
        tmp = []
    tmp.append(line)
    prev = line
grouped_horizontal_lines.append(tmp)

def sort_along_axis(lines, axis):
    """Return ``lines`` sorted by their starting coordinate along ``axis``.

    Args:
        lines: Iterable of objects exposing a ``bbox`` sequence
            ``(x0, y0, x1, y1)``.
        axis: ``'h'`` sorts by ``bbox[0]``; ``'v'`` sorts by ``bbox[1]``.
            Matching is case-insensitive so the ``'H'``/``'V'`` tags used by
            ``stitch_lines`` are accepted as well.

    Returns:
        A new sorted list, or ``None`` when ``axis`` is not recognised.
    """
    key_index = {'h': 0, 'v': 1}.get(str(axis).lower())
    if key_index is None:
        return None
    return sorted(lines, key=lambda x: x.bbox[key_index])

# Order the members of every group along the group's own axis
# (horizontal groups by bbox[0], vertical groups by bbox[1]).
grouped_horizontal_lines = [
    sort_along_axis(members, axis='h') for members in grouped_horizontal_lines
]
grouped_vertical_lines = [
    sort_along_axis(members, axis='v') for members in grouped_vertical_lines
]

def stitch_lines(lines, axis, threshold=10):
    """Merge collinear line pieces that overlap or nearly touch.

    Args:
        lines: Objects exposing ``bbox`` as ``(x0, y0, x1, y1)``, already
            sorted by their start coordinate along ``axis``.
        axis: ``'H'`` to stitch along x, ``'V'`` to stitch along y.
        threshold: A gap smaller than this between the end of the running
            segment and the start of the next piece is bridged.

    Returns:
        A list of ``(x0, y0, x1, y1, axis)`` tuples, one per stitched
        segment; an empty list when ``lines`` is empty.

    Raises:
        ValueError: If ``axis`` is neither ``'H'`` nor ``'V'``.
    """
    if axis == 'H':
        start_idx, end_idx = 0, 2
    elif axis == 'V':
        start_idx, end_idx = 1, 3
    else:
        raise ValueError("axis must be 'H' or 'V', got %r" % (axis,))

    stitched = []
    current = None
    for line in lines:
        box = tuple(line.bbox[:4])
        if current is None:
            current = box
            continue
        gap = box[start_idx] - current[end_idx]
        if current[end_idx] > box[start_idx] or gap < threshold:
            # Only extend: a piece lying inside the running segment must
            # not pull the segment's end backwards.
            if box[end_idx] > current[end_idx]:
                current = (current[0], current[1], box[2], box[3])
        else:
            # Gap too wide: close the running segment and start a new one
            # from this piece so it is not lost.
            stitched.append(current + (axis,))
            current = box
    if current is not None:
        # The trailing segment is always emitted.
        stitched.append(current + (axis,))
    return stitched
# Flatten the stitched segments of every group into one list per axis.
stiched_v = [
    segment
    for members in grouped_vertical_lines
    for segment in stitch_lines(members, 'V')
]
stitched_h = [
    segment
    for members in grouped_horizontal_lines
    for segment in stitch_lines(members, 'H')
]

# print stiched_v, stitched_h


## table_extraction_complete.py
# coding: utf-8
from __future__ import division
import pdfminer
import math
import logging
import traceback
import os
import csv
from collections import defaultdict
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator

logging.basicConfig(filename='pdf_table_extraction.log', level=logging.ERROR)

def _extract_layout_by_page(pdf_path):
    """Return one pdfminer layout object per page of the PDF at ``pdf_path``.

    Raises:
        PDFTextExtractionNotAllowed: If the document is not extractable.
    """
    # The file stays open while pages are interpreted and is closed (even
    # on error) once every layout has been collected.
    with open(pdf_path, 'rb') as fp:
        document = PDFDocument(PDFParser(fp))
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        layouts = []
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layouts.append(device.get_result())
    return layouts


def _extract_characters(element):
    """Recursively collect the ``LTChar`` objects below a text element or list."""
    if isinstance(element, pdfminer.layout.LTChar):
        return [element]
    containers = (
        pdfminer.layout.LTTextBox,
        pdfminer.layout.LTTextBoxHorizontal,
        pdfminer.layout.LTTextLine,
        pdfminer.layout.LTTextLineHorizontal,
        list,
    )
    if isinstance(element, containers):
        return [ch for child in element for ch in _extract_characters(child)]
    return []


def _group_lines(lines, coord, tolerance):
    """Split ``lines`` into runs whose consecutive ``bbox[coord]`` differ by < ``tolerance``."""
    groups = []
    for line in lines:
        if groups and abs(groups[-1][-1].bbox[coord] - line.bbox[coord]) < tolerance:
            groups[-1].append(line)
        else:
            groups.append([line])
    return groups


def _stitch_lines(lines, axis, threshold=10):
    """Merge start-sorted collinear pieces into ``(x0, y0, x1, y1, axis)`` segments.

    Pieces that overlap the running segment, or start less than
    ``threshold`` past its end, are merged into it; any other piece starts
    a new segment. ``axis`` is ``'H'`` (stitch along x) or ``'V'`` (along y).
    """
    start_idx, end_idx = (0, 2) if axis == 'H' else (1, 3)
    stitched = []
    current = None
    for line in lines:
        box = tuple(line.bbox[:4])
        if current is None:
            current = box
            continue
        gap = box[start_idx] - current[end_idx]
        if current[end_idx] > box[start_idx] or gap < threshold:
            # Only extend; a contained piece must not shrink the segment.
            if box[end_idx] > current[end_idx]:
                current = (current[0], current[1], box[2], box[3])
        else:
            stitched.append(current + (axis,))
            current = box
    if current is not None:
        stitched.append(current + (axis,))
    return stitched


def _thin_rects_as_lines(rects):
    """Replace thin rectangles (short side < 2, area > 1) by a line along their longer side."""
    lines = []
    for rect in rects:
        x0, y0, x1, y1 = rect.bbox
        span_x, span_y = x1 - x0, y1 - y0
        if min(span_x, span_y) < 2 and span_x * span_y > 1:
            if span_x > span_y:
                lines.append((x0, y0, x1, y0, "H"))
            else:
                lines.append((x0, y0, x0, y1, "V"))
    return lines


def _find_bounding_rectangle(point, lines):
    """Return the cell ``(x0, y0, x1, y1)`` enclosing ``point``, or ``None``.

    Only vertical lines whose y-range contains the point's y, and horizontal
    lines whose x-range contains the point's x, are considered. The cell is
    bounded by the nearest such line strictly on each of the four sides.
    """
    x, y = point
    v_xs = [l[0] for l in lines if l[4] == "V" and l[1] <= y <= l[3]]
    h_ys = [l[1] for l in lines if l[4] == "H" and l[0] <= x <= l[2]]
    if len(v_xs) < 2 or len(h_ys) < 2:
        return None

    left = [vx for vx in v_xs if vx < x]
    right = [vx for vx in v_xs if vx > x]
    below = [hy for hy in h_ys if hy < y]
    above = [hy for hy in h_ys if hy > y]
    if not (left and right and below and above):
        return None
    return (max(left), max(below), min(right), min(above))


def _cell_for_character(char, lines):
    """Pick the cell for ``char`` by majority vote of its lower corner, centre and upper corner.

    When all three points disagree, the centre's cell is used. The result
    may be ``None`` (no enclosing cell).
    """
    x0, y0, x1, y1 = char.bbox[:4]
    centre = (math.floor((x0 + x1) / 2.0), math.floor((y0 + y1) / 2.0))
    cell_lower = _find_bounding_rectangle((x0, y0), lines)
    cell_centre = _find_bounding_rectangle(centre, lines)
    cell_upper = _find_bounding_rectangle((x1, y1), lines)

    votes = defaultdict(int)
    for cell in (cell_lower, cell_centre, cell_upper):
        votes[cell] += 1
    if max(votes.values()) == 1:
        return cell_centre
    return max(votes.items(), key=lambda item: item[1])[0]


def _chars_to_string(chars):
    """Join characters row by row (descending ``bbox[1]``), left to right within a row."""
    rows = defaultdict(list)
    for c in chars:
        rows[c.bbox[1]].append(c)
    return "".join(
        c.get_text()
        for row in sorted(rows, reverse=True)
        for c in sorted(rows[row], key=lambda c: c.bbox[0])
    )


def _boxes_to_table(box_record_dict):
    """Convert a ``cell -> characters`` mapping into a list of rows of cell strings.

    Cells sharing the same lower edge (``cell[1]``) form a row; rows are
    ordered by descending lower edge and cells within a row by ``cell[0]``.
    """
    rows = defaultdict(list)
    for box in box_record_dict:
        rows[box[1]].append(box)
    return [
        [_chars_to_string(box_record_dict[b])
         for b in sorted(rows[row], key=lambda b: b[0])]
        for row in sorted(rows, reverse=True)
    ]


def main(example_file):
    """Extract ruled-table cell text from every page of a PDF.

    For each page, thin rectangles and axis-aligned curves are turned into
    horizontal/vertical ruling lines, each character is assigned to the cell
    those lines enclose, and the cells are arranged into rows.

    Args:
        example_file: Path of the PDF file to read.

    Returns:
        A list with one entry per page; each entry is a list of rows, each
        row a list of cell strings (empty string for a cell without text).

    Raises:
        PDFTextExtractionNotAllowed: If the document is not extractable.
    """
    threshold_slope_in_pixels = 10  # max bbox thickness of an "axis-aligned" curve
    tolerance = 5                   # max coordinate jump inside one group of lines
    grid_step = 10                  # spacing of the probe grid used to find empty cells

    table_pages = []
    for current_page in _extract_layout_by_page(example_file):
        texts, rects, lines_list = [], [], []
        for e in current_page:
            if isinstance(e, pdfminer.layout.LTTextBoxHorizontal):
                texts.append(e)
            elif isinstance(e, pdfminer.layout.LTRect):
                rects.append(e)
            elif isinstance(e, pdfminer.layout.LTCurve):
                lines_list.append(e)
        characters = _extract_characters(texts)

        horizontal_lines = sorted(
            (l for l in lines_list
             if abs(l.bbox[1] - l.bbox[3]) < threshold_slope_in_pixels),
            key=lambda l: (l.bbox[1] + l.bbox[3]) / 2.0)
        vertical_lines = sorted(
            (l for l in lines_list
             if abs(l.bbox[0] - l.bbox[2]) < threshold_slope_in_pixels),
            key=lambda l: (l.bbox[0] + l.bbox[2]) / 2.0)

        # Group collinear pieces, order each group along its axis, then
        # stitch pieces that overlap or nearly touch.
        stitched = []
        for group in _group_lines(vertical_lines, 0, tolerance):
            group = sorted(group, key=lambda l: l.bbox[1])
            stitched.extend(_stitch_lines(group, 'V'))
        for group in _group_lines(horizontal_lines, 1, tolerance):
            group = sorted(group, key=lambda l: l.bbox[0])
            stitched.extend(_stitch_lines(group, 'H'))
        lines = _thin_rects_as_lines(rects) + stitched

        box_char_dict = {}
        for c in characters:
            bbox = _cell_for_character(c, lines)
            if bbox is not None:
                box_char_dict.setdefault(bbox, []).append(c)

        # Probe a regular grid so cells that contain no characters still
        # appear (as empty strings) in the table.
        xmin, ymin, xmax, ymax = current_page.bbox
        for x in range(int(xmin), int(xmax), grid_step):
            for y in range(int(ymin), int(ymax), grid_step):
                bbox = _find_bounding_rectangle((x, y), lines)
                if bbox is not None:
                    box_char_dict.setdefault(bbox, [])

        table_pages.append(_boxes_to_table(box_char_dict))

    return table_pages

def get_all_pdf_file_paths(root_path):
    """Lazily yield the absolute path of every ``.pdf`` file under ``root_path``.

    The directory tree is traversed with ``os.walk``; only file names ending
    in the exact suffix ``.pdf`` are reported.
    """
    tree = os.walk(root_path)
    return (
        os.path.abspath(os.path.join(folder, name))
        for folder, _subdirs, names in tree
        for name in names
        if name.endswith('.pdf')
    )
#
# print main('docs_converted_to_pdfs/(498771424) GANESH SHANKAR JADHAWARnew.pdf')
# print main('/home/wolfram/project@work/resume-intent-classification/resumes_data/Lalit Kumar Pati_270716.pdf')

# Batch driver: extract the tables of every PDF below ``resumes_data`` and
# write one CSV row (path, tables) per file. A failing file is reported on
# stdout and in the log, and processing continues with the next file.
with open('tables_from_pdf_revised.csv', 'w') as csv_file:
    csv_writer = csv.writer(csv_file)
    for pdf_path in get_all_pdf_file_paths('resumes_data'):
        try:
            tables = main(pdf_path)
            csv_writer.writerow([pdf_path, tables])
            print("%s %s" % (pdf_path, tables))
        except Exception as error:
            trace = traceback.format_exc()
            print("%s %s %s %s" % ("errror", pdf_path, error, trace))
            logging.error('FILE_PATH %s \n %s' % (pdf_path, trace))
	import os
	import subprocess
	import shlex
	import time
	def convert_doc_to_pdf(ipfile_path, opfile_path):
	    """Launch a headless LibreOffice conversion of a document to PDF.

	    The command is started with ``subprocess.Popen``; this function does
	    not wait for it to finish.

	    Args:
	        ipfile_path: Path of the document to convert.
	        opfile_path: Value passed to LibreOffice's ``--outdir`` option.

	    Returns:
	        The ``subprocess.Popen`` handle of the started process, so callers
	        can ``wait()`` on it or inspect its ``returncode``.
	    """
	    # Pass an argument list instead of formatting a quoted string and
	    # re-splitting it with shlex: paths containing quotes or backslashes
	    # were split into the wrong arguments.
	    command = [
	        'libreoffice', '--headless', '--convert-to', 'pdf',
	        ipfile_path, '--outdir', opfile_path,
	    ]
	    # stdout is left at its default (inherited from this process); the
	    # previous ``stdout=True`` is not a documented Popen value.
	    return subprocess.Popen(command)
	# Maximum bounding-box thickness for a curve to be treated as axis-aligned.
	# Not accurate - since pixel tolerance decreases with the length of the line!
	threshold_slope_in_pixels = 10

	# Keep curves whose bounding box is thin along one axis, ordered by the
	# midpoint of their position on that axis.
	horizontal_lines = [
	    curve for curve in lines_list
	    if abs(curve.bbox[1] - curve.bbox[3]) < threshold_slope_in_pixels
	]
	vertical_lines = [
	    curve for curve in lines_list
	    if abs(curve.bbox[0] - curve.bbox[2]) < threshold_slope_in_pixels
	]
	horizontal_lines.sort(key=lambda x: (x.bbox[1] + x.bbox[3])/2)
	vertical_lines.sort(key=lambda x: (x.bbox[0] + x.bbox[2])/2)

	# Consecutive lines whose leading coordinate differs by less than
	# ``tolerance`` are collected into the same group.
	tolerance = 5
	grouped_vertical_lines = []
	grouped_horizontal_lines = []

	prev = None
	tmp = []
	for line in vertical_lines:
	    print(line)
	    # A jump in x0 relative to the previous line closes the current group.
	    if prev and not abs(prev.bbox[0] - line.bbox[0]) < tolerance:
	        grouped_vertical_lines.append(tmp)
	        tmp = []
	    tmp.append(line)
	    prev = line
	grouped_vertical_lines.append(tmp)

	prev = None
	tmp = []
	for line in horizontal_lines:
	    # A jump in y0 relative to the previous line closes the current group.
	    if prev and not abs(prev.bbox[1] - line.bbox[1]) < tolerance:
	        grouped_horizontal_lines.append(tmp)
	        tmp = []
	    tmp.append(line)
	    prev = line
	grouped_horizontal_lines.append(tmp)

	def sort_along_axis(lines, axis):
	    """Return ``lines`` sorted by their starting coordinate along ``axis``.

	    Args:
	        lines: Iterable of objects exposing a ``bbox`` sequence
	            ``(x0, y0, x1, y1)``.
	        axis: ``'h'`` sorts by ``bbox[0]``; ``'v'`` sorts by ``bbox[1]``.
	            Matching is case-insensitive so the ``'H'``/``'V'`` tags used by
	            ``stitch_lines`` are accepted as well.

	    Returns:
	        A new sorted list, or ``None`` when ``axis`` is not recognised.
	    """
	    key_index = {'h': 0, 'v': 1}.get(str(axis).lower())
	    if key_index is None:
	        return None
	    return sorted(lines, key=lambda x: x.bbox[key_index])

	# Order the members of every group along the group's own axis
	# (horizontal groups by bbox[0], vertical groups by bbox[1]).
	grouped_horizontal_lines = [
	    sort_along_axis(members, axis='h') for members in grouped_horizontal_lines
	]
	grouped_vertical_lines = [
	    sort_along_axis(members, axis='v') for members in grouped_vertical_lines
	]

	def stitch_lines(lines, axis, threshold=10):
	    """Merge collinear line pieces that overlap or nearly touch.

	    Args:
	        lines: Objects exposing ``bbox`` as ``(x0, y0, x1, y1)``, already
	            sorted by their start coordinate along ``axis``.
	        axis: ``'H'`` to stitch along x, ``'V'`` to stitch along y.
	        threshold: A gap smaller than this between the end of the running
	            segment and the start of the next piece is bridged.

	    Returns:
	        A list of ``(x0, y0, x1, y1, axis)`` tuples, one per stitched
	        segment; an empty list when ``lines`` is empty.

	    Raises:
	        ValueError: If ``axis`` is neither ``'H'`` nor ``'V'``.
	    """
	    if axis == 'H':
	        start_idx, end_idx = 0, 2
	    elif axis == 'V':
	        start_idx, end_idx = 1, 3
	    else:
	        raise ValueError("axis must be 'H' or 'V', got %r" % (axis,))

	    stitched = []
	    current = None
	    for line in lines:
	        box = tuple(line.bbox[:4])
	        if current is None:
	            current = box
	            continue
	        gap = box[start_idx] - current[end_idx]
	        if current[end_idx] > box[start_idx] or gap < threshold:
	            # Only extend: a piece lying inside the running segment must
	            # not pull the segment's end backwards.
	            if box[end_idx] > current[end_idx]:
	                current = (current[0], current[1], box[2], box[3])
	        else:
	            # Gap too wide: close the running segment and start a new one
	            # from this piece so it is not lost.
	            stitched.append(current + (axis,))
	            current = box
	    if current is not None:
	        # The trailing segment is always emitted.
	        stitched.append(current + (axis,))
	    return stitched
	# Flatten the stitched segments of every group into one list per axis.
	stiched_v = [
	    segment
	    for members in grouped_vertical_lines
	    for segment in stitch_lines(members, 'V')
	]
	stitched_h = [
	    segment
	    for members in grouped_horizontal_lines
	    for segment in stitch_lines(members, 'H')
	]

	# print stiched_v, stitched_h
	# coding: utf-8
	from __future__ import division
	import pdfminer
	import math
	import logging
	import traceback
	import os
	import csv
	from collections import defaultdict
	from pdfminer.pdfparser import PDFParser
	from pdfminer.pdfdocument import PDFDocument
	from pdfminer.pdfpage import PDFPage
	from pdfminer.pdfpage import PDFTextExtractionNotAllowed
	from pdfminer.pdfinterp import PDFResourceManager
	from pdfminer.pdfinterp import PDFPageInterpreter
	from pdfminer.layout import LAParams
	from pdfminer.converter import PDFPageAggregator

	logging.basicConfig(filename='pdf_table_extraction.log', level=logging.ERROR)

	def _extract_layout_by_page(pdf_path):
	    """Return one pdfminer layout object per page of the PDF at ``pdf_path``.

	    Raises:
	        PDFTextExtractionNotAllowed: If the document is not extractable.
	    """
	    # The file stays open while pages are interpreted and is closed (even
	    # on error) once every layout has been collected.
	    with open(pdf_path, 'rb') as fp:
	        document = PDFDocument(PDFParser(fp))
	        if not document.is_extractable:
	            raise PDFTextExtractionNotAllowed

	        rsrcmgr = PDFResourceManager()
	        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
	        interpreter = PDFPageInterpreter(rsrcmgr, device)

	        layouts = []
	        for page in PDFPage.create_pages(document):
	            interpreter.process_page(page)
	            layouts.append(device.get_result())
	    return layouts


	def _extract_characters(element):
	    """Recursively collect the ``LTChar`` objects below a text element or list."""
	    if isinstance(element, pdfminer.layout.LTChar):
	        return [element]
	    containers = (
	        pdfminer.layout.LTTextBox,
	        pdfminer.layout.LTTextBoxHorizontal,
	        pdfminer.layout.LTTextLine,
	        pdfminer.layout.LTTextLineHorizontal,
	        list,
	    )
	    if isinstance(element, containers):
	        return [ch for child in element for ch in _extract_characters(child)]
	    return []


	def _group_lines(lines, coord, tolerance):
	    """Split ``lines`` into runs whose consecutive ``bbox[coord]`` differ by < ``tolerance``."""
	    groups = []
	    for line in lines:
	        if groups and abs(groups[-1][-1].bbox[coord] - line.bbox[coord]) < tolerance:
	            groups[-1].append(line)
	        else:
	            groups.append([line])
	    return groups


	def _stitch_lines(lines, axis, threshold=10):
	    """Merge start-sorted collinear pieces into ``(x0, y0, x1, y1, axis)`` segments.

	    Pieces that overlap the running segment, or start less than
	    ``threshold`` past its end, are merged into it; any other piece starts
	    a new segment. ``axis`` is ``'H'`` (stitch along x) or ``'V'`` (along y).
	    """
	    start_idx, end_idx = (0, 2) if axis == 'H' else (1, 3)
	    stitched = []
	    current = None
	    for line in lines:
	        box = tuple(line.bbox[:4])
	        if current is None:
	            current = box
	            continue
	        gap = box[start_idx] - current[end_idx]
	        if current[end_idx] > box[start_idx] or gap < threshold:
	            # Only extend; a contained piece must not shrink the segment.
	            if box[end_idx] > current[end_idx]:
	                current = (current[0], current[1], box[2], box[3])
	        else:
	            stitched.append(current + (axis,))
	            current = box
	    if current is not None:
	        stitched.append(current + (axis,))
	    return stitched


	def _thin_rects_as_lines(rects):
	    """Replace thin rectangles (short side < 2, area > 1) by a line along their longer side."""
	    lines = []
	    for rect in rects:
	        x0, y0, x1, y1 = rect.bbox
	        span_x, span_y = x1 - x0, y1 - y0
	        if min(span_x, span_y) < 2 and span_x * span_y > 1:
	            if span_x > span_y:
	                lines.append((x0, y0, x1, y0, "H"))
	            else:
	                lines.append((x0, y0, x0, y1, "V"))
	    return lines


	def _find_bounding_rectangle(point, lines):
	    """Return the cell ``(x0, y0, x1, y1)`` enclosing ``point``, or ``None``.

	    Only vertical lines whose y-range contains the point's y, and horizontal
	    lines whose x-range contains the point's x, are considered. The cell is
	    bounded by the nearest such line strictly on each of the four sides.
	    """
	    x, y = point
	    v_xs = [l[0] for l in lines if l[4] == "V" and l[1] <= y <= l[3]]
	    h_ys = [l[1] for l in lines if l[4] == "H" and l[0] <= x <= l[2]]
	    if len(v_xs) < 2 or len(h_ys) < 2:
	        return None

	    left = [vx for vx in v_xs if vx < x]
	    right = [vx for vx in v_xs if vx > x]
	    below = [hy for hy in h_ys if hy < y]
	    above = [hy for hy in h_ys if hy > y]
	    if not (left and right and below and above):
	        return None
	    return (max(left), max(below), min(right), min(above))


	def _cell_for_character(char, lines):
	    """Pick the cell for ``char`` by majority vote of its lower corner, centre and upper corner.

	    When all three points disagree, the centre's cell is used. The result
	    may be ``None`` (no enclosing cell).
	    """
	    x0, y0, x1, y1 = char.bbox[:4]
	    centre = (math.floor((x0 + x1) / 2.0), math.floor((y0 + y1) / 2.0))
	    cell_lower = _find_bounding_rectangle((x0, y0), lines)
	    cell_centre = _find_bounding_rectangle(centre, lines)
	    cell_upper = _find_bounding_rectangle((x1, y1), lines)

	    votes = defaultdict(int)
	    for cell in (cell_lower, cell_centre, cell_upper):
	        votes[cell] += 1
	    if max(votes.values()) == 1:
	        return cell_centre
	    return max(votes.items(), key=lambda item: item[1])[0]


	def _chars_to_string(chars):
	    """Join characters row by row (descending ``bbox[1]``), left to right within a row."""
	    rows = defaultdict(list)
	    for c in chars:
	        rows[c.bbox[1]].append(c)
	    return "".join(
	        c.get_text()
	        for row in sorted(rows, reverse=True)
	        for c in sorted(rows[row], key=lambda c: c.bbox[0])
	    )


	def _boxes_to_table(box_record_dict):
	    """Convert a ``cell -> characters`` mapping into a list of rows of cell strings.

	    Cells sharing the same lower edge (``cell[1]``) form a row; rows are
	    ordered by descending lower edge and cells within a row by ``cell[0]``.
	    """
	    rows = defaultdict(list)
	    for box in box_record_dict:
	        rows[box[1]].append(box)
	    return [
	        [_chars_to_string(box_record_dict[b])
	         for b in sorted(rows[row], key=lambda b: b[0])]
	        for row in sorted(rows, reverse=True)
	    ]


	def main(example_file):
	    """Extract ruled-table cell text from every page of a PDF.

	    For each page, thin rectangles and axis-aligned curves are turned into
	    horizontal/vertical ruling lines, each character is assigned to the cell
	    those lines enclose, and the cells are arranged into rows.

	    Args:
	        example_file: Path of the PDF file to read.

	    Returns:
	        A list with one entry per page; each entry is a list of rows, each
	        row a list of cell strings (empty string for a cell without text).

	    Raises:
	        PDFTextExtractionNotAllowed: If the document is not extractable.
	    """
	    threshold_slope_in_pixels = 10  # max bbox thickness of an "axis-aligned" curve
	    tolerance = 5                   # max coordinate jump inside one group of lines
	    grid_step = 10                  # spacing of the probe grid used to find empty cells

	    table_pages = []
	    for current_page in _extract_layout_by_page(example_file):
	        texts, rects, lines_list = [], [], []
	        for e in current_page:
	            if isinstance(e, pdfminer.layout.LTTextBoxHorizontal):
	                texts.append(e)
	            elif isinstance(e, pdfminer.layout.LTRect):
	                rects.append(e)
	            elif isinstance(e, pdfminer.layout.LTCurve):
	                lines_list.append(e)
	        characters = _extract_characters(texts)

	        horizontal_lines = sorted(
	            (l for l in lines_list
	             if abs(l.bbox[1] - l.bbox[3]) < threshold_slope_in_pixels),
	            key=lambda l: (l.bbox[1] + l.bbox[3]) / 2.0)
	        vertical_lines = sorted(
	            (l for l in lines_list
	             if abs(l.bbox[0] - l.bbox[2]) < threshold_slope_in_pixels),
	            key=lambda l: (l.bbox[0] + l.bbox[2]) / 2.0)

	        # Group collinear pieces, order each group along its axis, then
	        # stitch pieces that overlap or nearly touch.
	        stitched = []
	        for group in _group_lines(vertical_lines, 0, tolerance):
	            group = sorted(group, key=lambda l: l.bbox[1])
	            stitched.extend(_stitch_lines(group, 'V'))
	        for group in _group_lines(horizontal_lines, 1, tolerance):
	            group = sorted(group, key=lambda l: l.bbox[0])
	            stitched.extend(_stitch_lines(group, 'H'))
	        lines = _thin_rects_as_lines(rects) + stitched

	        box_char_dict = {}
	        for c in characters:
	            bbox = _cell_for_character(c, lines)
	            if bbox is not None:
	                box_char_dict.setdefault(bbox, []).append(c)

	        # Probe a regular grid so cells that contain no characters still
	        # appear (as empty strings) in the table.
	        xmin, ymin, xmax, ymax = current_page.bbox
	        for x in range(int(xmin), int(xmax), grid_step):
	            for y in range(int(ymin), int(ymax), grid_step):
	                bbox = _find_bounding_rectangle((x, y), lines)
	                if bbox is not None:
	                    box_char_dict.setdefault(bbox, [])

	        table_pages.append(_boxes_to_table(box_char_dict))

	    return table_pages

	def get_all_pdf_file_paths(root_path):
	    """Lazily yield the absolute path of every ``.pdf`` file under ``root_path``.

	    The directory tree is traversed with ``os.walk``; only file names ending
	    in the exact suffix ``.pdf`` are reported.
	    """
	    tree = os.walk(root_path)
	    return (
	        os.path.abspath(os.path.join(folder, name))
	        for folder, _subdirs, names in tree
	        for name in names
	        if name.endswith('.pdf')
	    )
	#
	# print main('docs_converted_to_pdfs/(498771424) GANESH SHANKAR JADHAWARnew.pdf')
	# print main('/home/wolfram/project@work/resume-intent-classification/resumes_data/Lalit Kumar Pati_270716.pdf')

	# Batch driver: extract the tables of every PDF below ``resumes_data`` and
	# write one CSV row (path, tables) per file. A failing file is reported on
	# stdout and in the log, and processing continues with the next file.
	with open('tables_from_pdf_revised.csv', 'w') as csv_file:
	    csv_writer = csv.writer(csv_file)
	    for pdf_path in get_all_pdf_file_paths('resumes_data'):
	        try:
	            tables = main(pdf_path)
	            csv_writer.writerow([pdf_path, tables])
	            print("%s %s" % (pdf_path, tables))
	        except Exception as error:
	            trace = traceback.format_exc()
	            print("%s %s %s %s" % ("errror", pdf_path, error, trace))
	            logging.error('FILE_PATH %s \n %s' % (pdf_path, trace))