Skip to content

Instantly share code, notes, and snippets.

@karlrwjohnson
Created April 12, 2021 00:22
Show Gist options
  • Save karlrwjohnson/36db59ab8f4506d770b521eb8f71b233 to your computer and use it in GitHub Desktop.
Save karlrwjohnson/36db59ab8f4506d770b521eb8f71b233 to your computer and use it in GitHub Desktop.
Script to edit a PDF file's bookmarks using a text editor (uses pdftk-java and Python3)
#!/usr/bin/python3
##
## PDF Bookmark Patcher script
##
## This script allows you to use a regular text editor to edit the bookmarks
## in a PDF file. It is a wrapper around another tool called PDFtk-java.
##
## Usage:
## 1. < replace somefile.pdf with the name of your file >
## 2. python3 ../extract_bookmarks.py somefile.pdf --export-text bookmarks.txt
## 3. < edit bookmarks.txt >
## 4. python3 ../extract_bookmarks.py somefile.pdf --import-text bookmarks.txt
## 5. < somefile.bookmarked.pdf is a copy of the file with updated bookmarks >
##
## Requires:
## - PDFtk-java (https://gitlab.com/pdftk-java/pdftk, https://linuxhint.com/install_pdftk_ubuntu/)
## - Make sure that `pdftk` is present in your PATH
## - Python 3
##
import html
import re
import sys
from argparse import ArgumentParser
from bs4 import BeautifulSoup
from subprocess import Popen, PIPE
from logging import getLogger, DEBUG, basicConfig
from textwrap import dedent
from typing import List, NamedTuple, Union
# Install a handler on the root logger that prints "<LEVEL> - <message>",
# then keep a module-level handle on that same root logger; every function
# in this script logs through it.
basicConfig(format='%(levelname)s - %(message)s')
logger = getLogger()
class PdfBookmark(NamedTuple):
    """One bookmark in the flat (non-nested) form exchanged with pdftk.

    The three fields correspond to the BookmarkTitle, BookmarkLevel and
    BookmarkPageNumber lines that this script reads and writes.
    """

    # Text displayed for the bookmark.
    title: str
    # Nesting depth; this script treats 1 as the outermost level.
    level: int
    # Page the bookmark points at; this script stores 0 when none was given.
    page_number: int
class BookmarkTreeNode(NamedTuple):
    """A bookmark together with the bookmarks nested directly beneath it."""

    # Text displayed for the bookmark.
    title: str
    # Nesting depth recorded for the node (1 is the outermost level).
    level: int
    # Page the bookmark points at.
    page_number: int
    # Direct children, in document order. The list is mutated in place while
    # the tree is being built, even though the tuple itself is immutable.
    children: List['BookmarkTreeNode']
class TextTreeNode(NamedTuple):
    """One non-comment line of the bookmarks text file, plus its nested lines."""

    # The line's text with its leading whitespace removed.
    content: str
    # The exact leading whitespace of the line; nesting is decided by
    # comparing these prefixes.
    indent: str
    # Position of the line in the file it was read from.
    line_number: int
    # Lines indented beneath this one, in file order (mutated while parsing).
    children: List['TextTreeNode']
def main():
    """Parse the command line and run the requested export or import.

    ``--export-text FILE`` writes the PDF's bookmarks to FILE for editing;
    ``--import-text FILE`` reads an edited FILE and writes a patched copy of
    the PDF (to ``--output-pdf`` when given). If both are supplied, the
    export is performed and the import is ignored. If neither is supplied,
    argparse reports a usage error and exits with status 2.

    Debug-level logging is only enabled with ``--verbose``; otherwise the
    informational messages are still shown.
    """
    # Imported locally because the file-level logging import does not bring
    # INFO into scope.
    from logging import INFO

    parser = ArgumentParser(description='Use pdftk to edit bookmarks on a PDF')
    parser.add_argument('pdf_file', help='PDF file to process')
    parser.add_argument('--export-text', help='Text file to export the bookmarks to')
    parser.add_argument('--import-text', help='Text file of bookmarks to import')
    parser.add_argument('--output-pdf', help='PDF file to output to')
    parser.add_argument('--verbose', help='Enable verbose logging', action='store_true')
    args = parser.parse_args()

    # Honor --verbose instead of unconditionally forcing DEBUG output.
    logger.setLevel(DEBUG if args.verbose else INFO)

    if args.export_text:
        export_text(args.pdf_file, args.export_text)
    elif args.import_text:
        import_text(args.pdf_file, args.import_text, args.output_pdf)
    else:
        # Previously this case fell through and silently did nothing.
        parser.error('nothing to do: specify --export-text or --import-text')
def export_text(pdf_filename: str, bookmark_filename: str):
    """Write the bookmarks of ``pdf_filename`` to ``bookmark_filename`` as editable text.

    The text is fully generated before the output file is opened, so a
    failure while running pdftk or parsing its output does not truncate an
    existing bookmarks file.

    Args:
        pdf_filename: PDF whose bookmarks are read (via pdftk).
        bookmark_filename: Text file to create or overwrite, written as UTF-8
            to match the UTF-8 data that pdftk is asked to dump.
    """
    data_lines: List[str] = dump_pdf_data(pdf_filename)
    bookmarks: List[PdfBookmark] = parse_bookmarks_from_pdf_data(data_lines)
    tree: List[BookmarkTreeNode] = build_bookmark_tree(bookmarks)
    # Named "text" rather than "html" so the html module is not shadowed.
    text: str = export_bookmarks_text(tree, pdf_filename, bookmark_filename)

    with open(bookmark_filename, 'w', encoding='utf-8') as outfile:
        outfile.write(text)
def import_text(pdf_filename: str, bookmark_filename: str, output_filename: Union[str, None]):
    """Create a copy of ``pdf_filename`` whose bookmarks come from ``bookmark_filename``.

    Args:
        pdf_filename: Source PDF; it is read but not modified.
        bookmark_filename: Indented text file of bookmarks, read as UTF-8.
        output_filename: Where to write the patched PDF. When empty or None,
            a name is derived by replacing a trailing ``.pdf`` (any case) of
            ``pdf_filename`` with ``.bookmarked.pdf``, or by appending
            ``.bookmarked.pdf`` if there is no such suffix.
    """
    if not output_filename:
        # Only the final extension is rewritten. An unanchored substitution
        # would also alter ".pdf" inside directory names, and would leave a
        # name such as "DOC.PDF" unchanged so that output equalled input.
        suffix_match = re.search(r'\.pdf\Z', pdf_filename, flags=re.IGNORECASE)
        stem = pdf_filename[:suffix_match.start()] if suffix_match else pdf_filename
        output_filename = f'{stem}.bookmarked.pdf'
        logger.info('Automatically generating output filename %s', output_filename)

    with open(bookmark_filename, encoding='utf-8') as infile:
        file_lines: List[str] = infile.read().split('\n')

    text_tree: List[TextTreeNode] = import_indented_text(file_lines)
    bookmarks: List[PdfBookmark] = build_pdf_bookmark_list(text_tree)
    logger.info(f'Loaded {len(bookmarks)} bookmarks with {sum(1 for x in bookmarks if x.page_number != 0)} page numbers')

    original_data_lines: List[str] = dump_pdf_data(pdf_filename)
    patched_data_lines: List[str] = patch_bookmarks_into_pdf_data(original_data_lines, bookmarks)
    update_pdf_data(pdf_filename, output_filename, patched_data_lines)
def dump_pdf_data(filename: str) -> List[str]:
    """Run ``pdftk <filename> dump_data_utf8`` and return its output as lines.

    Args:
        filename: PDF file to inspect.

    Returns:
        The command's standard output, decoded as UTF-8 and split on ``\\n``
        (so output ending in a newline yields a trailing empty string).

    Raises:
        RuntimeError: If pdftk exits with a non-zero status. This is a real
            exception rather than an ``assert`` so the check still runs
            under ``python -O``.
    """
    command = ['pdftk', filename, 'dump_data_utf8']
    logger.info(f'Running command: {command!r}')

    # Only stdout is piped; pdftk's stderr goes wherever this process's does.
    popen = Popen(command, stdout=PIPE)
    stdout_data, _ = popen.communicate()
    if popen.returncode != 0:
        raise RuntimeError(f'{command} failed (exit code {popen.returncode})')

    ret = stdout_data.decode('utf-8').split('\n')
    logger.debug(f'Command returned {len(ret)} lines')
    return ret
def update_pdf_data(input_filename: str, output_filename: str, contents: List[str]) -> None:
    """Run pdftk ``update_info_utf8`` to write a copy of a PDF with new metadata.

    ``contents`` is joined with newlines, encoded as UTF-8 and fed to pdftk
    on standard input. Whatever pdftk prints on standard output is echoed
    with ``print`` before the exit status is checked.

    Args:
        input_filename: PDF to read.
        output_filename: PDF to create.
        contents: Lines of pdftk info data to apply.

    Raises:
        RuntimeError: If pdftk exits with a non-zero status. This is a real
            exception rather than an ``assert`` so the check still runs
            under ``python -O``.
    """
    command = ['pdftk', input_filename, 'update_info_utf8', '-', 'output', output_filename, 'verbose']
    content_bytes = '\n'.join(contents).encode('utf-8')
    logger.info(f'Running command with {len(contents)} lines of input: {command!r}')

    popen = Popen(command, stdin=PIPE, stdout=PIPE)
    stdout_data, _ = popen.communicate(content_bytes)
    print(stdout_data.decode())
    if popen.returncode != 0:
        raise RuntimeError(f'{command} failed (exit code {popen.returncode})')
def parse_bookmarks_from_pdf_data(lines: List[str]) -> List[PdfBookmark]:
    """Extract the bookmark records from lines of pdftk info data.

    A record starts at a ``BookmarkBegin`` line and is described by the
    ``BookmarkTitle``, ``BookmarkLevel`` and ``BookmarkPageNumber`` lines
    that follow it. Titles are HTML-unescaped. All other lines are ignored.

    Malformed input is never fatal. A field with a missing or non-integer
    value is skipped with a warning, and a record that ends up without all
    three fields is dropped with a warning.

    Args:
        lines: pdftk info data, one entry per line.

    Returns:
        The complete bookmarks, in the order they appear in ``lines``.
    """
    ret: List[PdfBookmark] = []
    line_number = 0
    current_bookmark_fields: dict = {}
    # A real set, so the set arithmetic below does not rely on dict views'
    # reflected operators.
    expected_keys = set(PdfBookmark._fields)
    # pdftk property name -> PdfBookmark field, for the integer-valued fields.
    integer_fields = {
        'BookmarkLevel': 'level',
        'BookmarkPageNumber': 'page_number',
    }

    def flush():
        # Snapshot and reset first, so every early return below leaves the
        # accumulator clean for the next record.
        flushed_state = dict(current_bookmark_fields)
        current_bookmark_fields.clear()

        # The first BookmarkBegin flushes an empty record; that is not an error.
        if not flushed_state:
            return

        # Drop an incomplete record, but keep parsing the rest.
        missing_keys = expected_keys - flushed_state.keys()
        if missing_keys:
            logger.warning('Missing keys %s when flushing bookmark on line %s', missing_keys, line_number)
            return

        # Unexpected keys are reported but do not prevent the bookmark
        # from being created.
        extra_keys = flushed_state.keys() - expected_keys
        if extra_keys:
            logger.warning('Extra keys %s when flushing bookmark on line %s', extra_keys, line_number)

        ret.append(PdfBookmark(**{key: flushed_state[key] for key in expected_keys}))

    for line_number, line in enumerate(lines, start=1):
        # "Name: value" lines carry a value; bare lines such as
        # "BookmarkBegin" do not.
        prop, separator, value = line.partition(': ')
        if not separator:
            value = None

        if prop == 'BookmarkBegin':
            flush()
            continue

        if prop != 'BookmarkTitle' and prop not in integer_fields:
            continue

        if value is None:
            logger.warning('Missing value on line %s', line_number)
            continue

        if prop == 'BookmarkTitle':
            current_bookmark_fields['title'] = html.unescape(value)
            continue

        try:
            current_bookmark_fields[integer_fields[prop]] = int(value)
        except ValueError:
            logger.warning('Cannot parse %s as an integer on line %s', value, line_number)

    # The last record has no following BookmarkBegin to flush it.
    flush()
    return ret
def patch_bookmarks_into_pdf_data(pdftk_data: List[str], bookmarks: List[PdfBookmark]) -> List[str]:
    """Return a copy of ``pdftk_data`` with its bookmark records replaced.

    Every line starting with BookmarkBegin, BookmarkTitle, BookmarkLevel or
    BookmarkPageNumber is removed. The records for ``bookmarks`` are
    inserted where the first BookmarkBegin line stood, or appended at the
    end when the input had none. Titles are HTML-escaped on the way out.
    """
    # Render the replacement records once, four lines per bookmark.
    replacement: List[str] = []
    for bookmark in bookmarks:
        replacement.extend((
            'BookmarkBegin',
            f'BookmarkTitle: {html.escape(bookmark.title)}',
            f'BookmarkLevel: {bookmark.level}',
            f'BookmarkPageNumber: {bookmark.page_number}',
        ))

    stale_field_prefixes = ('BookmarkTitle', 'BookmarkLevel', 'BookmarkPageNumber')
    patched: List[str] = []
    inserted = False

    for line in pdftk_data:
        if line.startswith('BookmarkBegin'):
            # The first old record marks the insertion point; the old
            # record lines themselves are always discarded.
            if not inserted:
                patched.extend(replacement)
                inserted = True
        elif not line.startswith(stale_field_prefixes):
            patched.append(line)

    # No bookmarks in the original data: put the new ones at the end.
    if not inserted:
        patched.extend(replacement)

    return patched
def build_bookmark_tree(bookmarks: List[PdfBookmark]) -> List[BookmarkTreeNode]:
    """Turn a flat bookmark list into a tree using each bookmark's ``level``.

    A bookmark becomes a child of the closest preceding bookmark whose level
    is exactly one less. When the level jumps down by more than one (e.g.
    from 3 straight to 5), ``<placeholder>`` nodes with page number 0 are
    inserted for the skipped levels. Each placeholder carries the level it
    actually occupies.

    Args:
        bookmarks: Bookmarks in document order.

    Returns:
        The level-1 nodes; deeper nodes are reachable through ``children``.

    Raises:
        RuntimeError: If a bookmark's level is below 1.
    """
    logger.debug('converting list of %s bookmarks into a tree based on their "level" properties', len(bookmarks))

    # The value that we'll return -- the list of nodes with level=1.
    # Elements are added to it through ancestor_child_lists.
    root_node_list: List[BookmarkTreeNode] = []

    # Stack of the current node's ancestors -- or rather, of the ancestors'
    # child lists. Entry 0 is the list of root (level=1) nodes, entry 1 is
    # the child list of the current level=1 node (holding level=2 nodes),
    # and so on. Its length is therefore the level the next pushed node
    # will sit at.
    ancestor_child_lists: List[List[BookmarkTreeNode]] = [root_node_list]

    def get_current_level():
        return len(ancestor_child_lists)

    def push_node(node: BookmarkTreeNode):
        # Attach the node to its direct ancestor ...
        ancestor_child_lists[-1].append(node)
        # ... and make it the innermost ancestor for whatever comes next.
        ancestor_child_lists.append(node.children)

    def pop_node():
        if get_current_level() <= 1:
            raise RuntimeError("Bookmark level should not go below 1")
        ancestor_child_lists.pop()

    for bookmark in bookmarks:
        # The parent must be exactly one level above the child. If the
        # stack is deeper than this bookmark, peel it back to that parent.
        while get_current_level() > bookmark.level:
            pop_node()

        # If the bookmark is deeper than the stack, fill the skipped levels
        # with placeholders.
        while get_current_level() < bookmark.level:
            push_node(BookmarkTreeNode(
                title="<placeholder>",
                # The level this placeholder occupies, not the level of the
                # bookmark that triggered it.
                level=get_current_level(),
                page_number=0,
                children=[],
            ))

        push_node(BookmarkTreeNode(
            title=bookmark.title,
            level=bookmark.level,
            page_number=bookmark.page_number,
            children=[],
        ))

    logger.debug('finished generating bookmark tree')
    return root_node_list
def export_bookmarks_text(bookmark_tree: List[BookmarkTreeNode], pdf_filename: str, bookmark_filename: str) -> str:
    """Render a bookmark tree as the editable text format, with instructions.

    The result starts with a block of ``#`` comment lines explaining the
    format and the command to import the file again. One line per bookmark
    follows, in the form ``<page number>. <title>``, indented with one tab
    per nesting level.

    Args:
        bookmark_tree: Root nodes of the tree to render.
        pdf_filename: Name of the PDF; only used in the instructions.
        bookmark_filename: Name of the text file; only used in the instructions.

    Returns:
        The complete file contents, without a trailing newline.
    """
    logger.debug('generating text file from tree')

    # The import command shown to the user names the real --import-text flag.
    instructions = dedent(f"""\
        # Bookmarks extracted from {pdf_filename}
        #
        # Instructions:
        #
        # Using a text editor, update the following list of bookmarks.
        # Each bookmark should appear on its own line, and it should follow the format
        # <page number>. <title>
        # To make some bookmarks appear as children of others, use spaces or tabs to indent.
        #
        # Blank lines and lines starting with a "#" (like this one) are comments and will not be interpreted as a bookmark
        #
        # Then, import them back into the PDF file using the command:
        # python {sys.argv[0]} {pdf_filename} --import-text {bookmark_filename}
        #
        # E.g.:
        #
        # 1. Introduction
        # 3. Episode IV: A New Hope
        # 4. Scene 1: Vader captures Leia
        # 10. Scene 2: Luke on Tattoine
        # 105. Episode V: The Empire Strikes Back
        # 192. Episode VI: Return of the Jedi
        """)

    lines = [instructions, '']

    def print_tree_node(nodes: List[BookmarkTreeNode], indent: str = ''):
        # Depth-first, so each node's children directly follow it.
        next_indent = indent + '\t'
        for current in nodes:
            lines.append(f'{indent}{current.page_number}. {current.title}')
            print_tree_node(current.children, next_indent)

    print_tree_node(bookmark_tree)

    ret = '\n'.join(lines)
    logger.debug('finished generating text file')
    return ret
def import_indented_text(file_lines: List[str]) -> List[TextTreeNode]:
    """Parse indented text lines into a tree based on their leading whitespace.

    Blank lines and lines whose first non-whitespace character is ``#`` are
    skipped. A line is nested under the closest preceding line whose
    indentation is a strict prefix of its own.

    Args:
        file_lines: The file's lines, without line terminators.

    Returns:
        The top-level nodes. Each node records its 1-based line number, so
        later messages point at the line a text editor would show.

    Raises:
        RuntimeError: If a line's indentation does not extend its parent's
            indentation, which happens when tabs and spaces are mixed.
    """
    # The value that we'll return -- the list of top-level lines.
    # Elements are added to it through ancestor_child_lists.
    root_node_list: List[TextTreeNode] = []

    # Stack of the current node's ancestors -- or rather, of the ancestors'
    # child lists. Entry 0 is the list of root nodes, entry 1 is the child
    # list of the current root node, and so on. The ancestor that owns the
    # top entry is therefore the last element of the entry below it.
    ancestor_child_lists: List[List[TextTreeNode]] = [root_node_list]

    def get_current_level():
        return len(ancestor_child_lists)

    def push_node(node: TextTreeNode):
        # Attach the node to its direct ancestor ...
        ancestor_child_lists[-1].append(node)
        # ... and make it the innermost ancestor for whatever comes next.
        ancestor_child_lists.append(node.children)

    def pop_node():
        if get_current_level() <= 1:
            raise RuntimeError("Bookmark level should not go below 1")
        ancestor_child_lists.pop()

    # Number lines from 1 so the messages match what an editor displays.
    for line_number, line in enumerate(file_lines, start=1):
        # Skip blank lines and comments
        if re.match(r'^\s*#', line) or re.match(r'^\s*$', line):
            continue

        indent, content = re.match(r'^(\s*)(.+)$', line).groups()

        # Find the most-direct ancestor. If the innermost ancestor's indent
        # starts with this line's indent, this line is a peer or an
        # aunt/uncle of that ancestor. Pop until an ancestor with a
        # shorter indent is on top.
        while len(ancestor_child_lists) > 1 and ancestor_child_lists[-2][-1].indent.startswith(indent):
            pop_node()

        # The child's indent must extend the ancestor's indent. If it does
        # not, tabs and spaces were mixed and the nesting is ambiguous.
        if len(ancestor_child_lists) > 1 and not indent.startswith(ancestor_child_lists[-2][-1].indent):
            raise RuntimeError(f"Inconsistent indentation whitespace at line {line_number}. Make sure you're using either tabs or spaces, not both!")

        push_node(TextTreeNode(
            content=content,
            indent=indent,
            line_number=line_number,
            children=[],
        ))

    return root_node_list
def build_pdf_bookmark_list(text_tree: List[TextTreeNode], level=1) -> List[PdfBookmark]:
    """Flatten a tree of text lines into bookmarks, depth-first.

    Each line's content is expected to look like ``<page number>. <title>``.
    The page number and the dot are optional. A line without a page number
    gets page 0 and a warning is logged.

    Args:
        text_tree: Nodes to convert; their children are converted recursively.
        level: Bookmark level assigned to the nodes of ``text_tree``;
            children get ``level + 1``.

    Returns:
        The bookmarks in document order, each parent followed by its descendants.
    """
    ret: List[PdfBookmark] = []
    for line in text_tree:
        page_number, title = re.match(r'^\s*(\d+)?(?:\.?)\s*(.+)$', line.content).groups()
        if page_number is None:
            # Exactly two arguments for the two placeholders; a surplus
            # argument makes logging report a formatting error instead of
            # printing the message.
            logger.warning('No page number for bookmark named %s on line %s', title, line.line_number)
            page_number = 0
        ret.append(PdfBookmark(title=title, level=level, page_number=int(page_number)))
        ret += build_pdf_bookmark_list(line.children, level=(level + 1))
    return ret
if __name__ == '__main__':
    # Use sys.exit rather than the bare exit() builtin: exit() is injected by
    # the site module and is absent when Python is started with -S.
    sys.exit(main())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment