Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
pdftkコマンドの`dump_data_utf8`オプションの出力ファイルから目次(Bookmark)の情報を抽出してページ範囲を出力するスクリプト
#! /usr/local/bin/python3
# coding: utf-8
import re
import pathlib
def _dump_value(line):
    """Return the value part of a ``Key: value`` line from a pdftk dump."""
    # Split on the first separator only, so values that themselves contain
    # ': ' (for example the title "Chapter 1: Intro") are kept intact.
    return line.rstrip("\n").partition(': ')[2]


def _read_bookmarks(dump_path):
    """Read a pdftk dump file and return ``(last_page, bookmarks)``.

    ``bookmarks`` is a list of ``{'title': str, 'page_no': int}`` dicts in
    file order, restricted to bookmarks that have a non-empty title and a
    page number greater than zero.  ``last_page`` is the ``NumberOfPages``
    value, or 0 when the dump does not contain one.
    """
    last_page = 0
    collected = []
    current = {'title': "", 'page_no': None}
    # The dump is produced by `dump_data_utf8`, so decode it as UTF-8
    # regardless of the platform's default encoding.
    with dump_path.open('r', encoding='utf-8') as fp:
        for line in fp:
            if line.startswith("NumberOfPages"):
                last_page = int(_dump_value(line))
            elif line.startswith("PageMediaBegin"):
                # Parsing stops at the first page-media record; bookmark
                # records are only read before it.
                break
            elif line.startswith("BookmarkBegin"):
                collected.append(current)
                current = {'title': "", 'page_no': None}
            elif line.startswith("BookmarkTitle"):
                current['title'] = _dump_value(line)
            elif line.startswith("BookmarkPageNumber"):
                current['page_no'] = int(_dump_value(line))
    # Flush the bookmark that was still being read, whether the loop ended
    # on PageMediaBegin or simply at end of file.
    collected.append(current)

    bookmarks = [
        bm for bm in collected
        if bm['title'] and bm['page_no'] is not None and bm['page_no'] > 0
    ]
    return last_page, bookmarks


def _page_ranges(bookmarks, last_page):
    """Return ``(title, start_page, end_page)`` tuples ordered by start page."""
    ordered = sorted(bookmarks, key=lambda bm: bm['page_no'])
    ranges = []
    for pos, bm in enumerate(ordered):
        start = bm['page_no']
        if pos + 1 < len(ordered):
            end = ordered[pos + 1]['page_no'] - 1
        else:
            end = last_page
        # Two bookmarks on the same page (or a missing NumberOfPages) would
        # otherwise give an end page that lies before the start page.
        ranges.append((bm['title'], start, max(start, end)))
    return ranges


def parse_page_range(filename, output_path=pathlib.Path('.'), pdf_filename=None):
    """Extract bookmark page ranges from a pdftk ``dump_data_utf8`` file.

    Each bookmark with a non-empty title and a positive page number becomes
    one row of a tab-separated file ``index_<pdf stem>.tsv`` written into
    ``output_path``.  A row holds: PDF file name, bookmark title, first
    page, last page.  The last page is the page before the next bookmark
    (by page order), or ``NumberOfPages`` for the final bookmark, and is
    never smaller than the first page.

    Args:
        filename: Path of the pdftk dump file to read (UTF-8 text).
        output_path: Existing directory that receives the TSV file.
        pdf_filename: Name of the PDF the dump describes.  When omitted it
            is derived from the dump file name by dropping a leading
            ``dump_`` and replacing the extension with ``.pdf``.

    Returns:
        0 on completion.

    Raises:
        OSError: If the dump cannot be read or the TSV cannot be written.
        ValueError: If a page number in the dump is not an integer.
    """
    dump_path = pathlib.Path(filename)
    if pdf_filename is None:
        stem = dump_path.stem
        if stem.startswith('dump_'):
            stem = stem[len('dump_'):]
        pdf_filename = stem + '.pdf'

    last_page, bookmarks = _read_bookmarks(dump_path)
    rows = [
        [pdf_filename, title, str(start), str(end)]
        for title, start, end in _page_ranges(bookmarks, last_page)
    ]

    # Use the full stem so that dotted names such as "a.1.pdf" and "a.2.pdf"
    # do not collapse onto the same output file.
    tsv_name = 'index_' + pathlib.PurePath(pdf_filename).stem + '.tsv'
    output = pathlib.Path(output_path) / tsv_name
    with output.open(mode='wt', encoding='utf-8') as fp:
        fp.writelines("\t".join(row) + "\n" for row in rows)
    return 0
if __name__ == '__main__':
    # Batch driver: every ./data/output_log/dump_<name>.txt is converted into
    # ./data/page_index/index_<name>.tsv, with <name>.pdf recorded as the
    # source PDF in each row.
    data_dir = pathlib.Path('./data')
    output_dir = data_dir / 'page_index'
    output_dir.mkdir(parents=True, exist_ok=True)
    dump_prefix = 'dump_'
    # sorted() only makes the processing order deterministic.
    for dump_file in sorted(data_dir.glob('output_log/dump_*.txt')):
        # Derive the PDF name from the file name alone; matching a regex
        # against the whole path would misfire on directories that contain
        # "dump_" and fail outright on a non-match.
        pdf_filename = dump_file.stem[len(dump_prefix):] + '.pdf'
        parse_page_range(str(dump_file), output_path=output_dir, pdf_filename=pdf_filename)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment