atuyosi/extract_pdf2tsv.py

## extract_pdf2tsv.py
#! /usr/bin/env python
# encoding: utf-8
#

import sys
import math
import re
from operator import attrgetter

from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument

from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter, PDFPageAggregator
from pdfminer.layout import LAParams
from pdfminer.layout import LTTextContainer, LTTextBox, LTLine
from pdfminer.pdfpage import PDFPage

# Global switches: DEBUG enables diagnostic output on stderr,
# QUOTE wraps every emitted TSV field in double quotes.
DEBUG = False
QUOTE = False

# Magic numbers: x ranges compared against cell coordinates in
# parse_and_output() to decide which table column a text box belongs to.
range_violation_col = (320, 480)
range_first_col = (35, 179)
range_column_last = (705, 790)
# column_count = 6  # number of columns (currently unused)

rsrcmgr = PDFResourceManager()

# Layout-analysis parameters: only word_margin is changed from the defaults.
# Two further settings are left disabled here:
#   char_margin = 0.5
#   detect_vertical = True
laparams = LAParams(word_margin=0.07)


def format_list_cell(node):
    """Flatten a multi-line cell into a single comma-separated line.

    ``node`` only needs a ``get_text()`` method returning a string.
    Trailing newlines are dropped first.  If the text contains the
    character "条" (this is the handling for the column that lists the
    violated articles), a newline becomes a comma only when it directly
    follows "条" or a digit and is not followed by "の"; every other
    newline is simply removed.  Text without "条" has every newline
    replaced by a comma.
    """
    text = node.get_text().rstrip("\n")

    if "条" not in text:
        return text.replace("\n", ',')

    joined = re.sub(r'(?<=(条|\d))\n(?!(の|$))', ',', text)
    # Newlines the pattern left alone are wrapped lines of one entry.
    return joined.replace("\n", '')

def remove_returncode(node):
    """Return the text of ``node`` with every newline character removed.

    ``node`` only needs a ``get_text()`` method returning a string.
    When the module-level DEBUG flag is set, the result is also echoed
    to stderr.
    """
    flattened = "".join(node.get_text().split("\n"))

    if DEBUG:
        print(flattened, file=sys.stderr)

    return flattened


def remove_returncode_lastcolumn(node):
    """Flatten the text of a last-column cell into one line.

    ``node`` only needs a ``get_text()`` method returning a string.
    Steps, in order:
      1. a newline immediately followed by "H" becomes a comma
         (presumably each entry starts with an era-style date such as
         "H29..." -- confirm against the input PDFs);
      2. every half-width space is removed;
      3. every remaining newline is removed.
    When the module-level DEBUG flag is set, the result is also echoed
    to stderr.
    """
    text = re.sub(r'\n(?=H)', ',', node.get_text())
    text = text.replace(' ', '').replace("\n", '')

    if DEBUG:
        print(text, file=sys.stderr)

    return text

def listup_horizontal_range(nodelist):
    """Collect the y coordinate of every horizontal line in ``nodelist``.

    A node counts as horizontal when its ``y0`` and ``y1`` attributes are
    exactly equal.  The input order is preserved.  When the module-level
    DEBUG flag is set, the resulting list is also echoed to stderr.
    """
    y_positions = []
    for line in nodelist:
        if line.y0 == line.y1:
            y_positions.append(line.y0)

    if DEBUG:
        print(y_positions, file=sys.stderr)

    return y_positions


def _split_merged_cell(text):
    """Clean one collected cell string and split it if columns were merged.

    Returns a list holding one string per recovered column (at least one).
    """
    # Drop trailing full-width spaces.
    text = text.rstrip("\u3000")
    # A pair of consecutive half-width spaces becomes one full-width space.
    text = text.replace("  ", "\u3000")
    text = text.rstrip(" ")

    if " " not in text:
        return [text]

    # A remaining half-width space marks a cell that was not parsed cleanly
    # and holds several columns glued together.
    if re.search('(都|道|府|県).+(市|町|村)', text):
        if "条" in text:
            # Four columns are merged.
            return text.rsplit(" ", 3)
        if "H" in text:
            # Three columns are merged.
            return text.rsplit(" ", 2)
        # Two columns are merged.
        return text.rsplit(" ", 1)

    # Split on every space that does not follow a full-width ")".
    return re.split('(?<!）) ', text)


def parse_and_output(page, page_number):
    """Print one TSV line to stdout for each table row of the current page.

    The page content is taken from the module-level ``layout`` object; the
    ``page`` argument itself is not used.  ``page_number`` is only compared
    with the module-level ``debug_page`` to switch on diagnostic output for
    this call.

    Text boxes starting with "最終更新日" or ending with "労働局" update
    the module-level ``last_modified_date`` / ``dept_labor`` values, which
    are prepended to every emitted row.  Boxes ending with "公表事案" and
    boxes whose text is one of ``header_text`` are ignored.

    Remaining boxes are grouped into rows using the horizontal lines of the
    page (the band between the first two lines is skipped -- presumably the
    header row) and visited left to right.  A row is printed as soon as a
    cell falling into ``range_column_last`` is seen; cells collected
    afterwards without such a cell are discarded when the function returns.
    """
    # These are module-level state.  Without the declaration the assignments
    # below would make them function locals, and a page lacking either text
    # box would fail with UnboundLocalError when the row is printed.
    global dept_labor, last_modified_date

    # Debug output of this function is enabled per page; the helper
    # functions keep following the module-level DEBUG flag.
    debug = page_number == debug_page

    line_objects = [node for node in layout if isinstance(node, LTLine)]
    hpoint_range = listup_horizontal_range(line_objects)

    if debug:
        print(hpoint_range, file=sys.stderr)

    cells = []
    for node in layout:
        if not isinstance(node, (LTTextBox, LTTextContainer)):
            continue

        text = node.get_text().rstrip("\n")
        if text.startswith("最終更新日"):
            # Skip the five-character label plus the character after it
            # (presumably a separator).
            last_modified_date = text[6:]
            if debug:
                print(last_modified_date, file=sys.stderr)
            continue
        if text.endswith("労働局"):
            dept_labor = text
            continue
        if text.endswith("公表事案"):
            continue

        cells.append(node)

    body_cells = [c for c in cells if c.get_text().rstrip() not in header_text]
    if debug:
        print("filtered:", body_cells, file=sys.stderr)

    # Left-to-right order, so the columns of a row are visited in sequence.
    sorted_cells = sorted(body_cells, key=attrgetter('x0'))

    # Each pair of consecutive horizontal lines (from the second line on)
    # delimits one band; a cell belongs to every band it overlaps.
    node_group = [
        [node for node in sorted_cells if node.y0 < y_first and node.y1 > y_second]
        for y_first, y_second in zip(hpoint_range[1:], hpoint_range[2:])
    ]

    if debug:
        for group in node_group:
            print(group, file=sys.stderr)

    columns = []

    for row_cells in node_group:
        for cell in row_cells:

            if debug:
                print(cell, file=sys.stderr)

            center_x = (cell.x0 + cell.x1) / 2.0

            if center_x > range_first_col[0] and cell.x0 < range_first_col[1]:
                # First column.
                col_str = remove_returncode(cell)
                if cell.x1 < range_first_col[1] and ' ' in col_str:
                    col_str = re.sub(' +', '\u3000', col_str)
                    if debug:
                        print(col_str, file=sys.stderr)

                columns.insert(0, col_str)

            elif range_column_last[0] < center_x < range_column_last[1]:
                # Last column: the row is complete, so clean it up and print it.
                columns.append(remove_returncode_lastcolumn(cell))

                fields = []
                for col in columns:
                    if debug:
                        print(col, file=sys.stderr)
                    fields.extend(_split_merged_cell(col))

                output = dept_labor + "\t" + last_modified_date + "\t" + "\t".join(fields)
                if QUOTE:
                    output = "\t".join('"%s"' % field for field in output.split("\t"))

                print(output)
                columns = []

            else:
                # Any other cell (neither the first nor the last column).
                if range_violation_col[0] < center_x < range_violation_col[1]:
                    # The cell content spans several lines.
                    col_str = format_list_cell(cell)
                else:
                    col_str = remove_returncode(cell)

                columns.append(col_str)

#if __name__ == '__main__':
#    test()

device = PDFPageAggregator(rsrcmgr, laparams=laparams)

# Page id for which parse_and_output() prints its diagnostics; None disables it.
debug_page = None

if DEBUG:
    print(sys.argv, file=sys.stderr)

if len(sys.argv) > 1:
    filename = sys.argv[1]
else:
    print("Please specfy input pdf filename", file=sys.stderr)
    # sys.exit is always available; the bare exit() helper is only
    # injected by the site module.
    sys.exit(-1)

dept_labor = ""  # name of the prefectural labour bureau
last_modified_date = ""

# Column captions of the table; text boxes equal to one of these are
# treated as header cells by parse_and_output().
header_text = ["企業・事業場名称", "所在地", "公表日", "違反法条", "事案概要", "その他参考事項"  ]

interpreter = PDFPageInterpreter(rsrcmgr, device)

try:
    # Open the PDF to process; the context manager closes it even when
    # parsing fails part-way through.
    with open(filename, 'rb') as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser, '')

        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            # parse_and_output() reads this module-level name.
            layout = device.get_result()

            if DEBUG:
                print(layout.pageid, file=sys.stderr)

            # Skip the first page.
            page_number = layout.pageid
            if page_number == 1:
                continue

            parse_and_output(page, page_number)
finally:
    device.close()
	#! /usr/bin/env python
	# encoding: utf-8
	#

	import sys
	import math
	import re
	from operator import attrgetter

	from pdfminer.pdfparser import PDFParser
	from pdfminer.pdfdocument import PDFDocument

	from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
	from pdfminer.converter import TextConverter, PDFPageAggregator
	from pdfminer.layout import LAParams
	from pdfminer.layout import LTTextContainer, LTTextBox, LTLine
	from pdfminer.pdfpage import PDFPage

	# Global switches: DEBUG enables diagnostic output on stderr,
	# QUOTE wraps every emitted TSV field in double quotes.
	DEBUG = False
	QUOTE = False

	# Magic numbers: x ranges compared against cell coordinates in
	# parse_and_output() to decide which table column a text box belongs to.
	range_violation_col = (320, 480)
	range_first_col = (35, 179)
	range_column_last = (705, 790)
	# column_count = 6  # number of columns (currently unused)

	rsrcmgr = PDFResourceManager()

	# Layout-analysis parameters: only word_margin is changed from the defaults.
	# Two further settings are left disabled here:
	#   char_margin = 0.5
	#   detect_vertical = True
	laparams = LAParams(word_margin=0.07)




	def format_list_cell(node):
	    """Flatten a multi-line cell into a single comma-separated line.

	    ``node`` only needs a ``get_text()`` method returning a string.
	    Trailing newlines are dropped first.  If the text contains the
	    character "条" (this is the handling for the column that lists the
	    violated articles), a newline becomes a comma only when it directly
	    follows "条" or a digit and is not followed by "の"; every other
	    newline is simply removed.  Text without "条" has every newline
	    replaced by a comma.
	    """
	    text = node.get_text().rstrip("\n")

	    if "条" not in text:
	        return text.replace("\n", ',')

	    # The alternations must use a bare "|"; an escaped "\|" would only
	    # match a literal pipe character.
	    joined = re.sub(r'(?<=(条|\d))\n(?!(の|$))', ',', text)
	    # Newlines the pattern left alone are wrapped lines of one entry.
	    return joined.replace("\n", '')

	def remove_returncode(node):
	    """Return the text of ``node`` with every newline character removed.

	    ``node`` only needs a ``get_text()`` method returning a string.
	    When the module-level DEBUG flag is set, the result is also echoed
	    to stderr.
	    """
	    flattened = node.get_text().replace("\n", '')

	    if DEBUG:
	        print(flattened, file=sys.stderr)

	    return flattened


	def remove_returncode_lastcolumn(node):
	    """Flatten the text of a last-column cell into one line.

	    ``node`` only needs a ``get_text()`` method returning a string.
	    Steps, in order:
	      1. a newline immediately followed by "H" becomes a comma
	         (presumably each entry starts with an era-style date such as
	         "H29..." -- confirm against the input PDFs);
	      2. every half-width space is removed;
	      3. every remaining newline is removed.
	    When the module-level DEBUG flag is set, the result is also echoed
	    to stderr.
	    """
	    text = re.sub(r'\n(?=H)', ',', node.get_text())
	    text = text.replace(' ', '').replace("\n", '')

	    if DEBUG:
	        print(text, file=sys.stderr)

	    return text

	def listup_horizontal_range(nodelist):
	    """Collect the y coordinate of every horizontal line in ``nodelist``.

	    A node counts as horizontal when its ``y0`` and ``y1`` attributes are
	    exactly equal.  The input order is preserved.  When the module-level
	    DEBUG flag is set, the resulting list is also echoed to stderr.
	    """
	    y_positions = [line.y0 for line in nodelist if line.y0 == line.y1]

	    if DEBUG:
	        print(y_positions, file=sys.stderr)

	    return y_positions


	def _split_merged_cell(text):
	    """Clean one collected cell string and split it if columns were merged.

	    Returns a list holding one string per recovered column (at least one).
	    """
	    # Drop trailing full-width spaces.
	    text = text.rstrip("\u3000")
	    # A pair of consecutive half-width spaces becomes one full-width
	    # space; single spaces must survive so the split below can see them.
	    text = text.replace("  ", "\u3000")
	    text = text.rstrip(" ")

	    if " " not in text:
	        return [text]

	    # A remaining half-width space marks a cell that was not parsed cleanly
	    # and holds several columns glued together.  The alternations need a
	    # bare "|"; an escaped "\|" would only match a literal pipe.
	    if re.search('(都|道|府|県).+(市|町|村)', text):
	        if "条" in text:
	            # Four columns are merged.
	            return text.rsplit(" ", 3)
	        if "H" in text:
	            # Three columns are merged.
	            return text.rsplit(" ", 2)
	        # Two columns are merged.
	        return text.rsplit(" ", 1)

	    # Split on every space that does not follow a full-width ")".
	    return re.split('(?<!）) ', text)


	def parse_and_output(page, page_number):
	    """Print one TSV line to stdout for each table row of the current page.

	    The page content is taken from the module-level ``layout`` object; the
	    ``page`` argument itself is not used.  ``page_number`` is only compared
	    with the module-level ``debug_page`` to switch on diagnostic output for
	    this call.

	    Text boxes starting with "最終更新日" or ending with "労働局" update
	    the module-level ``last_modified_date`` / ``dept_labor`` values, which
	    are prepended to every emitted row.  Boxes ending with "公表事案" and
	    boxes whose text is one of ``header_text`` are ignored.

	    Remaining boxes are grouped into rows using the horizontal lines of the
	    page (the band between the first two lines is skipped -- presumably the
	    header row) and visited left to right.  A row is printed as soon as a
	    cell falling into ``range_column_last`` is seen; cells collected
	    afterwards without such a cell are discarded when the function returns.
	    """
	    # These are module-level state.  Without the declaration the assignments
	    # below would make them function locals, and a page lacking either text
	    # box would fail with UnboundLocalError when the row is printed.
	    global dept_labor, last_modified_date

	    # Debug output of this function is enabled per page; the helper
	    # functions keep following the module-level DEBUG flag.
	    debug = page_number == debug_page

	    line_objects = [node for node in layout if isinstance(node, LTLine)]
	    hpoint_range = listup_horizontal_range(line_objects)

	    if debug:
	        print(hpoint_range, file=sys.stderr)

	    cells = []
	    for node in layout:
	        if not isinstance(node, (LTTextBox, LTTextContainer)):
	            continue

	        text = node.get_text().rstrip("\n")
	        if text.startswith("最終更新日"):
	            # Skip the five-character label plus the character after it
	            # (presumably a separator).
	            last_modified_date = text[6:]
	            if debug:
	                print(last_modified_date, file=sys.stderr)
	            continue
	        if text.endswith("労働局"):
	            dept_labor = text
	            continue
	        if text.endswith("公表事案"):
	            continue

	        cells.append(node)

	    body_cells = [c for c in cells if c.get_text().rstrip() not in header_text]
	    if debug:
	        print("filtered:", body_cells, file=sys.stderr)

	    # Left-to-right order, so the columns of a row are visited in sequence.
	    sorted_cells = sorted(body_cells, key=attrgetter('x0'))

	    # Each pair of consecutive horizontal lines (from the second line on)
	    # delimits one band; a cell belongs to every band it overlaps.
	    node_group = [
	        [node for node in sorted_cells if node.y0 < y_first and node.y1 > y_second]
	        for y_first, y_second in zip(hpoint_range[1:], hpoint_range[2:])
	    ]

	    if debug:
	        for group in node_group:
	            print(group, file=sys.stderr)

	    columns = []

	    for row_cells in node_group:
	        for cell in row_cells:

	            if debug:
	                print(cell, file=sys.stderr)

	            center_x = (cell.x0 + cell.x1) / 2.0

	            if center_x > range_first_col[0] and cell.x0 < range_first_col[1]:
	                # First column.
	                col_str = remove_returncode(cell)
	                if cell.x1 < range_first_col[1] and ' ' in col_str:
	                    col_str = re.sub(' +', '\u3000', col_str)
	                    if debug:
	                        print(col_str, file=sys.stderr)

	                columns.insert(0, col_str)

	            elif range_column_last[0] < center_x < range_column_last[1]:
	                # Last column: the row is complete, so clean it up and print it.
	                columns.append(remove_returncode_lastcolumn(cell))

	                fields = []
	                for col in columns:
	                    if debug:
	                        print(col, file=sys.stderr)
	                    fields.extend(_split_merged_cell(col))

	                output = dept_labor + "\t" + last_modified_date + "\t" + "\t".join(fields)
	                if QUOTE:
	                    output = "\t".join('"%s"' % field for field in output.split("\t"))

	                print(output)
	                columns = []

	            else:
	                # Any other cell (neither the first nor the last column).
	                if range_violation_col[0] < center_x < range_violation_col[1]:
	                    # The cell content spans several lines.
	                    col_str = format_list_cell(cell)
	                else:
	                    col_str = remove_returncode(cell)

	                columns.append(col_str)

	#if __name__ == '__main__':
	# test()

	device = PDFPageAggregator(rsrcmgr, laparams=laparams)

	# Page id for which parse_and_output() prints its diagnostics; None disables it.
	debug_page = None

	if DEBUG:
	    print(sys.argv, file=sys.stderr)

	if len(sys.argv) > 1:
	    filename = sys.argv[1]
	else:
	    print("Please specfy input pdf filename", file=sys.stderr)
	    # sys.exit is always available; the bare exit() helper is only
	    # injected by the site module.
	    sys.exit(-1)

	dept_labor = ""  # name of the prefectural labour bureau
	last_modified_date = ""

	# Column captions of the table; text boxes equal to one of these are
	# treated as header cells by parse_and_output().
	header_text = ["企業・事業場名称", "所在地", "公表日", "違反法条", "事案概要", "その他参考事項" ]

	interpreter = PDFPageInterpreter(rsrcmgr, device)

	try:
	    # Open the PDF to process; the context manager closes it even when
	    # parsing fails part-way through.
	    with open(filename, 'rb') as fp:
	        parser = PDFParser(fp)
	        document = PDFDocument(parser, '')

	        for page in PDFPage.create_pages(document):
	            interpreter.process_page(page)
	            # parse_and_output() reads this module-level name.
	            layout = device.get_result()

	            if DEBUG:
	                print(layout.pageid, file=sys.stderr)

	            # Skip the first page.
	            page_number = layout.pageid
	            if page_number == 1:
	                continue

	            parse_and_output(page, page_number)
	finally:
	    device.close()