Last active
March 15, 2018 02:06
-
-
Save atuyosi/55eaa9861da95bff434685cb3180c4de to your computer and use it in GitHub Desktop.
Convert PDF to TSV (for the Japanese MHLW list of published labor-law violation cases), rev. 2
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python | |
# encoding: utf-8 | |
# | |
import sys | |
import math | |
import re | |
from operator import attrgetter | |
from pdfminer.pdfparser import PDFParser | |
from pdfminer.pdfdocument import PDFDocument | |
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter | |
from pdfminer.converter import TextConverter, PDFPageAggregator | |
from pdfminer.layout import LAParams | |
from pdfminer.layout import LTTextContainer, LTTextBox, LTLine | |
from pdfminer.pdfpage import PDFPage | |
# Global switches.
DEBUG = False  # when true, the helper functions echo intermediate values to stderr
QUOTE = False  # when true, every output field is wrapped in double quotes

# Magic numbers: x-coordinate windows used to decide which table column a
# text box belongs to (presumably PDF points -- confirm against the input PDF).
range_violation_col = (320, 480)
range_first_col = (35, 179)
range_column_last = (705, 790)
# column_count = 6  # number of columns

# Shared pdfminer objects.
rsrcmgr = PDFResourceManager()
# Only word_margin is overridden. Other settings that were tried and left
# disabled: char_margin = 0.5, detect_vertical = True.
laparams = LAParams(word_margin=0.07)
def format_list_cell(node):
    """Flatten a multi-line cell into a single line, using commas as separators.

    ``node`` only needs a ``get_text()`` method returning the cell text.

    Cells containing the character "条" (the column that enumerates the
    violated articles) get a comma only at line breaks that follow "条" or a
    digit and are not followed by "の" or the end of the text; every other
    line break is simply removed. In any other cell each line break becomes
    a comma.
    """
    text = node.get_text().rstrip("\n")

    if "条" not in text:
        return text.replace("\n", ",")

    # Column listing the violated articles.
    joined = re.sub(r'(?<=(条|\d))\n(?!(の|$))', ',', text)
    return joined.replace("\n", "")
def remove_returncode(node):
    """Return the text of ``node`` with every newline character removed.

    ``node`` only needs a ``get_text()`` method. When the module-level
    ``DEBUG`` flag is set, the result is also echoed to stderr.
    """
    flattened = "".join(node.get_text().split("\n"))
    if DEBUG:
        print(flattened, file=sys.stderr)
    return flattened
def remove_returncode_lastcolumn(node):
    """Flatten the text of a last-column cell into one line.

    A line break directly followed by "H" becomes a comma, all half-width
    spaces are dropped, and every remaining line break is removed. When the
    module-level ``DEBUG`` flag is set, the result is echoed to stderr.
    """
    text = node.get_text()
    # A new entry starts wherever a line begins with "H".
    text = re.sub(r'\n(?=H)', ',', text)
    # Remove half-width spaces, then whatever line breaks are left.
    flattened = text.replace(' ', '').replace("\n", '')
    if DEBUG:
        print(flattened, file=sys.stderr)
    return flattened
def listup_horizontal_range(nodelist):
    """Collect the ``y0`` value of every node whose ``y0`` equals its ``y1``.

    The input order is preserved. When the module-level ``DEBUG`` flag is
    set, the resulting list is echoed to stderr.
    """
    y_positions = []
    for segment in nodelist:
        # Equal start and end y means the segment is horizontal.
        if segment.y0 == segment.y1:
            y_positions.append(segment.y0)
    if DEBUG:
        print(y_positions, file=sys.stderr)
    return y_positions
def _split_merged_cell(st):
    """Normalise one collected cell string and split it if columns were merged.

    Returns a list with one string per recovered column (always at least one).
    """
    # Drop trailing full-width spaces.
    st = st.rstrip("\u3000")
    # A run of half-width spaces becomes one full-width space, so that only
    # single half-width spaces remain as candidate column separators.
    st = re.sub(r" {2,}", "\u3000", st)
    st = st.rstrip(" ")

    if " " not in st:
        return [st]

    # Cells that could not be parsed cleanly are split on half-width spaces.
    if re.search('(都|道|府|県).+(市|町|村)', st):
        if '条' in st:
            # Four columns were merged into one cell.
            return st.rsplit(" ", 3)
        if 'H' in st:
            # Three columns were merged into one cell.
            return st.rsplit(" ", 2)
        # Two columns were merged into one cell.
        return st.rsplit(" ", 1)

    # Do not split directly after a closing parenthesis (half- or full-width).
    return re.split(r'(?<![))]) ', st)


def parse_and_output(page, page_number):
    """Print the table rows of the current page as tab-separated lines.

    The page content is read from the module-level ``layout`` object; the
    ``page`` argument is accepted for interface compatibility but not used.
    ``page_number`` is compared with the module-level ``debug_page`` to turn
    on debug output (to stderr) for a single page; the module-level ``DEBUG``
    flag turns it on for every page.

    Side effects:
      * updates the module-level ``dept_labor`` / ``last_modified_date``
        whenever the page carries those header texts, so the values carry
        over to later pages that lack them;
      * prints one line per table row to stdout, prefixed with those two
        values, with every field double-quoted when ``QUOTE`` is set.
    """
    # Without this declaration the assignments below would create locals:
    # the values would never persist across pages, and a page without the
    # header texts would fail with UnboundLocalError when a row is printed.
    global dept_labor, last_modified_date

    debug = DEBUG or page_number == debug_page

    hpoint_range = listup_horizontal_range(
        [node for node in layout if isinstance(node, LTLine)]
    )
    if debug:
        print(hpoint_range, file=sys.stderr)

    # Collect the text boxes, peeling off the page-level header texts.
    cells = []
    for node in layout:
        if not isinstance(node, (LTTextBox, LTTextContainer)):
            continue
        text = node.get_text().rstrip("\n")
        if text.startswith("最終更新日"):
            # Skip the 5-character label plus the single character after it.
            last_modified_date = text[6:]
            if debug:
                print(last_modified_date, file=sys.stderr)
            continue
        if text.endswith("労働局"):
            dept_labor = text
            continue
        if text.endswith("公表事案"):
            continue
        cells.append(node)

    # Drop the column-header cells and order the rest from left to right.
    body_cells = [c for c in cells if c.get_text().rstrip() not in header_text]
    if debug:
        print("filtered:", body_cells, file=sys.stderr)
    body_cells.sort(key=attrgetter('x0'))

    # Group the cells by the band between consecutive horizontal rules; the
    # first rule is skipped (presumably it sits above the header row -- TODO
    # confirm against the PDF).
    node_group = [
        [node for node in body_cells if node.y0 < bound0 and node.y1 > bound1]
        for bound0, bound1 in zip(hpoint_range[1:], hpoint_range[2:])
    ]
    if debug:
        for group in node_group:
            print(group, file=sys.stderr)

    columns = []
    for row in node_group:
        for cell in row:
            if debug:
                print(cell, file=sys.stderr)
            center_x = (cell.x0 + cell.x1) / 2.0

            if center_x > range_first_col[0] and cell.x0 < range_first_col[1]:
                # First column.
                col_str = remove_returncode(cell)
                if cell.x1 < range_first_col[1] and ' ' in col_str:
                    # The cell lies entirely inside the first column, so its
                    # spaces belong to the value itself: turn them into
                    # full-width spaces so the clean-up step below does not
                    # mistake them for column separators.
                    col_str = re.sub(' +', "\u3000", col_str)
                    print(col_str, file=sys.stderr)
                columns.insert(0, col_str)

            elif range_column_last[0] < center_x < range_column_last[1]:
                # Last column: the row is complete, so clean up and emit it.
                columns.append(remove_returncode_lastcolumn(cell))

                purificated = []
                for st in columns:
                    if debug:
                        print(st, file=sys.stderr)
                    purificated.extend(_split_merged_cell(st))

                output = dept_labor + "\t" + last_modified_date + "\t" + "\t".join(purificated)
                if QUOTE:
                    output = "\t".join('"%s"' % field for field in output.split("\t"))
                print(output)
                columns = []

            else:
                # Any cell that is neither in the first nor the last column.
                if range_violation_col[0] < center_x < range_violation_col[1]:
                    # The cell content spans several lines.
                    col_str = format_list_cell(cell)
                else:
                    col_str = remove_returncode(cell)
                columns.append(col_str)
#if __name__ == '__main__':
#    test()

# Script body. It is deliberately kept at module level: parse_and_output()
# reads layout, debug_page and header_text as module globals, and reads
# dept_labor / last_modified_date from module scope as well.
device = PDFPageAggregator(rsrcmgr, laparams=laparams)

# Page id for which parse_and_output() prints debug output (None: no page).
debug_page = None

if DEBUG:
    print(sys.argv, file=sys.stderr)

if len(sys.argv) > 1:
    filename = sys.argv[1]
else:
    print("Please specfy input pdf filename", file=sys.stderr)
    # sys.exit rather than the site-provided exit(), which is not guaranteed
    # to exist (e.g. when Python runs with -S).
    sys.exit(-1)

interpreter = PDFPageInterpreter(rsrcmgr, device)

dept_labor = ""  # prefectural labor bureau
last_modified_date = ""
header_text = ["企業・事業場名称", "所在地", "公表日", "違反法条", "事案概要", "その他参考事項" ]

try:
    # Open the PDF to process; the with-block closes it even when parsing fails.
    with open(filename, 'rb') as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser, '')

        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            # page.contents
            layout = device.get_result()
            if DEBUG:
                print(layout.pageid, file=sys.stderr)

            # Skip the first page.
            page_number = layout.pageid
            if layout.pageid == 1:
                continue

            parse_and_output(page, page_number)
finally:
    # Release the aggregator device after the file, also on error.
    device.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment