Last active
May 14, 2022 01:32
-
-
Save takemikami/7ac487f664a72cc25b49229b535b4c9e to your computer and use it in GitHub Desktop.
PDFにoutlineをつけるスクリプト
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Script that adds an outline (bookmarks) to a PDF
#
# Overview:
#   Parses the table-of-contents text found in the body of the PDF and
#   uses the parsed result to set the outline of the PDF.
# Setup:
#   pip install pdfminer.six
#   pip install pdfrw
#   pip install reportlab
# Usage:
#   1. Place input.pdf in the current directory
#   2. Run the script
#      python add_outline_to_pdf.py
# Parameters:
#   Set through the variables that follow the "設定項目" (settings)
#   heading in this script
# References:
#   https://inudaisho.hatenablog.com/entry/20120611
#   https://buildersbox.corp-sansan.com/entry/2020/06/09/110000
import io | |
import re | |
from pdfminer.converter import TextConverter | |
from pdfminer.layout import LAParams | |
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter | |
from pdfminer.pdfpage import PDFPage | |
from pdfrw import PdfReader | |
from pdfrw.buildxobj import pagexobj | |
from pdfrw.toreportlab import makerl | |
from reportlab.pdfgen.canvas import Canvas | |
# Settings (設定項目)

# Input file name
input_pdf = "input.pdf"

# Output file name
output_pdf = "out.pdf"

# Offset applied when mapping table-of-contents page numbers to physical
# pages (use it when a cover etc. moves the position of page 1).
# An entry is attached to the physical page whose 0-based index `idx`
# satisfies `idx + page_shift == printed_page_number - 1`.
page_shift = 0

# Leader line of the table of contents: any page whose text contains this
# string is treated as a table-of-contents page.
leader_char = '....'

# Regular expressions used to parse table-of-contents lines, one per
# outline level (first entry = level 1). Each pattern must define exactly
# three groups: section number, title, page number.
re_list = [
    r'第\s+([0-9]*)\s+章\s+(.*)\s+\.\.\.\.+\s+([0-9]+)\s+',
    r'([0-9]+\.[0-9]+)\s+(.*)\s+\.\.\.\.+\s+([0-9]+)\s+',
    r'([0-9.]*)\s+(.*)\s+\.\.\.\.+\s+([0-9]+)\s+',
]
# Table-of-contents parsing
def _match_toc_line(line: str, re_levels: list):
    """Return ``[title, page_index, level]`` for one text line, or ``None``.

    The patterns in *re_levels* are tried in order with ``re.match``; the
    first one that matches wins and its 1-based position becomes the level.
    Each pattern must yield exactly three groups (number, title, page).
    """
    for level, pattern in enumerate(re_levels, start=1):
        m = re.match(pattern, line)
        if m is None:
            continue
        title_num, title_str, page_num = m.groups()
        title = "{} {}".format(title_num, title_str)
        # The captured page number is 1-based; store it 0-based.
        return [title, int(page_num) - 1, level]
    return None


def parse_outline(input_file: str, leader_str: str, re_levels: list) -> list:
    """Extract outline entries from the table-of-contents pages of a PDF.

    Pages are read in order. A page whose extracted text contains
    *leader_str* is treated as a table-of-contents page and each of its
    lines is matched against *re_levels*. Once at least one such page has
    been seen, the first page without *leader_str* ends the scan.

    Args:
        input_file: Path of the PDF to read.
        leader_str: Text that marks a page as a table-of-contents page.
        re_levels: Regular expressions, one per outline level, each with
            three groups: section number, title, page number.

    Returns:
        A list of ``[title, page_index, level]`` lists, where *page_index*
        is the captured page number minus one and *level* starts at 1.
    """
    rsrcmgr = PDFResourceManager()
    outlines = []
    index_page = False
    with open(input_file, "rb") as fp:
        for page in PDFPage.get_pages(fp):
            out_fp = io.StringIO()
            device = TextConverter(
                rsrcmgr,
                out_fp,
                laparams=LAParams(),
                imagewriter=None,
            )
            try:
                PDFPageInterpreter(rsrcmgr, device).process_page(page)
                page_str = out_fp.getvalue()
            finally:
                # Release the per-page converter even if extraction fails.
                device.close()

            if leader_str not in page_str:
                if index_page:
                    # The table of contents has ended; stop reading pages.
                    break
                continue

            index_page = True
            for ln in page_str.split('\n'):
                entry = _match_toc_line(ln, re_levels)
                if entry is not None:
                    outlines.append(entry)
    return outlines
# Outline writing
def add_outline(input_file: str, output_file: str, outlines: list, page_shift: int = 0):
    """Copy a PDF page by page and attach outline (bookmark) entries.

    Every page of *input_file* is drawn onto a new reportlab canvas whose
    page size is taken from the last two values of the page's ``BBox``.
    A top-level entry titled "目次" pointing at the first page is always
    added; the entries in *outlines* are added below it.

    Args:
        input_file: Path of the PDF to read.
        output_file: Path of the PDF to write.
        outlines: Sequence of ``[title, page_index, level]`` items. An item
            is attached to the physical page with 0-based index ``idx``
            when ``idx + page_shift == int(page_index)``.
        page_shift: Offset added to the physical page index before it is
            compared with ``page_index``.
    """
    # Group the entries by target page once instead of rescanning the whole
    # list for every page; the relative order of entries is preserved.
    by_page = {}
    for entry in outlines:
        by_page.setdefault(int(entry[1]), []).append(entry)

    pages = PdfReader(input_file, decompress=False).pages
    out_canvas = Canvas(output_file)
    out_canvas.bookmarkPage("0")
    out_canvas.addOutlineEntry(u"目次", "0")
    last_level = 0  # level of the most recently added outline entry

    for idx, page in enumerate(pages):
        out_page = pagexobj(page)
        out_canvas.setPageSize(tuple(out_page.BBox[2:]))
        out_canvas.doForm(makerl(out_canvas, out_page))

        for bookmark_idx, outline_data in enumerate(by_page.get(idx + page_shift, ())):
            out_bookmark = str(idx) + "p-" + str(bookmark_idx)
            out_canvas.bookmarkPage(out_bookmark)
            # reportlab raises ValueError when an entry is more than one
            # level deeper than the previous one, so cap the depth here
            # rather than abort on a table of contents with missing levels.
            level = min(outline_data[2], last_level + 1)
            out_canvas.addOutlineEntry(outline_data[0], out_bookmark, level)
            last_level = level

        out_canvas.showPage()

    out_canvas.showOutline()
    out_canvas.save()
if __name__ == '__main__':
    # Parse the table of contents of the input PDF, then write a copy of
    # the PDF with the parsed entries attached as its outline.
    outline_list = parse_outline(input_pdf, leader_char, re_list)
    add_outline(input_pdf, output_pdf, outline_list, page_shift)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment