Skip to content

Instantly share code, notes, and snippets.

@takemikami
Last active May 14, 2022 01:32
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save takemikami/7ac487f664a72cc25b49229b535b4c9e to your computer and use it in GitHub Desktop.
Save takemikami/7ac487f664a72cc25b49229b535b4c9e to your computer and use it in GitHub Desktop.
PDFにoutlineをつけるスクリプト
# PDFにoutlineをつけるスクリプト
#
# 概要:
# PDFファイルの本文中にある目次の文字列を解析し、
# 解析結果を元にPDFにアウトラインを設定する
# セットアップ:
# pip install pdfminer.six
# pip install pdfrw
# pip install reportlab
# 実行方法:
# 1. input.pdfファイルをカレントディレクトリに配置
# 2. スクリプトを実行
# python add_outline_to_pdf.py
# パラメータの設定:
# 本スクリプトの「設定項目」以降にある変数で指定する
# 参考:
# https://inudaisho.hatenablog.com/entry/20120611
# https://buildersbox.corp-sansan.com/entry/2020/06/09/110000
import io
import re
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfrw import PdfReader
from pdfrw.buildxobj import pagexobj
from pdfrw.toreportlab import makerl
from reportlab.pdfgen.canvas import Canvas
# Settings
# Input file name
input_pdf = "input.pdf"
# Output file name
output_pdf = "out.pdf"
# Start position of page 1 (shifts the page start position, e.g. to account
# for cover pages). add_outline attaches an entry to the physical page whose
# 0-based index plus page_shift equals the entry's parsed page index.
page_shift = 0
# Table-of-contents leader line. Pages whose extracted text contains this
# string are treated as table-of-contents pages by parse_outline.
leader_char = '....'
# Regular expressions used to parse table-of-contents lines, one per outline
# level. They are tried in order against each line and the first match wins;
# its position in this list (1-based) becomes the outline level. Each pattern
# must capture exactly three groups: heading number, heading text, page number.
re_list = [
r'第\s+([0-9]*)\s+章\s+(.*)\s+\.\.\.\.+\s+([0-9]+)\s+',
r'([0-9]+\.[0-9]+)\s+(.*)\s+\.\.\.\.+\s+([0-9]+)\s+',
r'([0-9.]*)\s+(.*)\s+\.\.\.\.+\s+([0-9]+)\s+',
]
# 目次の解析
def parse_outline(input_file: str, leader_str: str, re_levels: list):
    """Extract outline entries from the table-of-contents pages of a PDF.

    Pages are scanned in order. Every page whose extracted text contains
    ``leader_str`` is treated as a table-of-contents page and each of its
    lines is matched against ``re_levels``. Scanning stops at the first page
    without ``leader_str`` that follows a table-of-contents page.

    Args:
        input_file: Path of the PDF file to read.
        leader_str: Marker text (the dot leader) identifying
            table-of-contents pages.
        re_levels: Patterns, one per outline level, tried in order; the
            first one matching a line decides its level. Each pattern must
            capture exactly three groups: heading number, heading text and
            page number.

    Returns:
        A list of ``[title, page_index, level]`` lists, where ``title`` is
        ``"<number> <text>"``, ``page_index`` is the captured page number
        minus one, and ``level`` is the 1-based position of the matching
        pattern in ``re_levels``.
    """
    # Compile once up front instead of resolving every pattern for each line.
    level_patterns = [re.compile(pattern) for pattern in re_levels]
    rsrcmgr = PDFResourceManager()
    outlines = []
    with open(input_file, "rb") as fp:
        index_page = False
        for page in PDFPage.get_pages(fp):
            out_fp = io.StringIO()
            device = TextConverter(
                rsrcmgr,
                out_fp,
                laparams=LAParams(),
                imagewriter=None,
            )
            try:
                PDFPageInterpreter(rsrcmgr, device).process_page(page)
                # Read the text before closing the device so the result does
                # not depend on what close() does with the output stream.
                page_str = out_fp.getvalue()
            finally:
                # A new device is created per page; release each one.
                device.close()

            if leader_str not in page_str:
                if index_page:
                    # The table of contents has ended; skip the rest.
                    break
                continue

            index_page = True
            for ln in page_str.split('\n'):
                for lv, pattern in enumerate(level_patterns, start=1):
                    m = pattern.match(ln)
                    if m is None:
                        continue
                    title_num, title_str, page_num = m.groups()
                    title = "{} {}".format(title_num, title_str)
                    # Printed page numbers are 1-based; store a 0-based index.
                    outlines.append([title, int(page_num) - 1, lv])
                    break
    return outlines
# 目次の設定
def add_outline(input_file: str, output_file: str, outlines: list, page_shift: int = 0):
    """Copy a PDF page by page into a new file and attach outline entries.

    Args:
        input_file: Path of the PDF whose pages are copied.
        output_file: Path of the PDF to write.
        outlines: Entries indexable as ``entry[0]`` (title), ``entry[1]``
            (target page index, converted with ``int``) and ``entry[2]``
            (outline level).
        page_shift: Offset added to each physical page index before it is
            compared with an entry's target page index.
    """
    canvas = Canvas(output_file)
    # Root outline entry, bookmarked before any page content is drawn.
    canvas.bookmarkPage("0")
    canvas.addOutlineEntry("目次", "0")

    source_pages = PdfReader(input_file, decompress=False).pages
    for page_no, source_page in enumerate(source_pages):
        # Draw the source page as a form XObject on a same-sized page.
        form = pagexobj(source_page)
        canvas.setPageSize(tuple(form.BBox[2:]))
        canvas.doForm(makerl(canvas, form))

        shifted_no = page_no + page_shift
        entries_here = [entry for entry in outlines if shifted_no == int(entry[1])]
        for seq, entry in enumerate(entries_here):
            # Bookmark keys must be unique, so combine page and sequence.
            key = f"{page_no}p-{seq}"
            canvas.bookmarkPage(key)
            canvas.addOutlineEntry(entry[0], key, entry[2])

        canvas.showPage()

    canvas.showOutline()
    canvas.save()
if __name__ == '__main__':
    # Parse the table of contents of the input PDF, then write a copy of the
    # PDF that carries the parsed entries as its outline.
    outline_list = parse_outline(
        input_file=input_pdf,
        leader_str=leader_char,
        re_levels=re_list,
    )
    add_outline(
        input_file=input_pdf,
        output_file=output_pdf,
        outlines=outline_list,
        page_shift=page_shift,
    )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment