Skip to content

Instantly share code, notes, and snippets.

@rwirth
Created May 27, 2019 21:19
Show Gist options
  • Save rwirth/82bdd2925dc449666f168b0e4d175e57 to your computer and use it in GitHub Desktop.
Save rwirth/82bdd2925dc449666f168b0e4d175e57 to your computer and use it in GitHub Desktop.
Script to add bookmarks to a PDF file
#!/usr/bin/env python3
import collections
import io
import re
import PyPDF2.pdf as pdf
# One node of the bookmark tree: the display name, the zero-based page index
# in the PDF, and the child nodes keyed by their integer section number.
ChapterTreeNode = collections.namedtuple('ChapterTreeNode', 'name page children')


def read_bookmarks(bmfile, offset=1, strip=False):
    """Parse a bookmark description file into a list of top-level nodes.

    Each non-empty line is split on whitespace and is one of:

    * ``>> shift N`` -- add ``N`` to the running page offset for all
      following lines.
    * ``x.y.z Title words PAGE`` -- a numbered entry.  Its parent ``x.y``
      must already have been defined on an earlier line.
    * ``Title words PAGE`` -- an unnumbered entry, always top level.

    Args:
        bmfile: Path of the UTF-8 encoded bookmark file (a leading BOM is
            tolerated).
        offset: One-based page of the PDF on which printed page 1 appears.
        strip: If true, drop the chapter number from numbered entry names.

    Returns:
        A list of ``ChapterTreeNode`` for the top-level entries, in file
        order.  Nested entries are reachable through ``children``.

    Raises:
        RuntimeError: A ``>>`` line is incomplete or names an unknown command.
        KeyError: A numbered entry refers to a parent that was not defined.
        ValueError: A page number or shift amount is not an integer.
    """
    # The whole first field must be dotted digits; matching only a prefix
    # would misread titles such as "3D Graphics" as chapter numbers.
    chapter_number = re.compile(r'(\d+\.)*\d+')
    bookmarks = []
    chapters = ChapterTreeNode(None, None, {})
    # utf-8-sig decodes plain UTF-8 as well, but also skips a leading BOM
    # that would otherwise stick to the first field of the first line.
    with io.open(bmfile, encoding='utf-8-sig') as f:
        for line in f:
            fields = line.split()
            if not fields:
                continue

            # Commands are handled before any page parsing so that a bad
            # command is reported as such rather than as a bad page number.
            if fields[0] == '>>':
                if len(fields) < 2:
                    raise RuntimeError('missing command after >>')
                if fields[1] != 'shift':
                    raise RuntimeError('unknown command {}'.format(fields[1]))
                if len(fields) < 3:
                    raise RuntimeError('shift command requires an amount')
                offset += int(fields[2])
                continue

            # Correction factor because PDF page indexing starts at zero and
            # the offset is for page 1
            page = offset + int(fields[-1]) - 2

            if chapter_number.fullmatch(fields[0]) is None:
                # Unnumbered entries always live at the top level.
                bookmarks.append(ChapterTreeNode(' '.join(fields[:-1]), page, {}))
                continue

            name = ' '.join(fields[1:-1] if strip else fields[:-1])
            entry = ChapterTreeNode(name, page, {})
            *ancestors, own = [int(num) for num in fields[0].split('.')]

            # Walk down from the synthetic root to the direct parent.
            parent = chapters
            for num in ancestors:
                try:
                    parent = parent.children[num]
                except KeyError:
                    print('Offending line: {}'.format(line))
                    raise
            parent.children[own] = entry
            if not ancestors:
                bookmarks.append(entry)
    return bookmarks
def add_bookmarks(writer, bookmarks):
    """Attach a tree of bookmark nodes to ``writer``.

    Nodes are visited depth first: every node is added before its children,
    and children are added in ascending order of their key.  Each node is
    passed to ``writer.addBookmark`` with its name, its page and the handle
    returned for its parent.  A node that has no parent handle and no
    children is added in italics.

    Args:
        writer: Object exposing ``addBookmark(name, page, parent=, italic=)``.
        bookmarks: Iterable of root nodes having ``name``, ``page`` and
            ``children`` (a mapping of sortable keys to child nodes).
    """
    # Explicit stack of (node, parent handle).  Items are pushed in reverse
    # so that popping yields them in their original order.
    pending = [(root, None) for root in reversed(list(bookmarks))]
    while pending:
        node, parent_handle = pending.pop()
        childless_root = parent_handle is None and len(node.children) == 0
        handle = writer.addBookmark(
            node.name, node.page, parent=parent_handle, italic=childless_root)
        for key in sorted(node.children, reverse=True):
            pending.append((node.children[key], handle))
def _main():
    """Command-line entry point: add bookmarks to a PDF file.

    Parses the command line, reads the bookmark file, clones the input PDF
    into a writer, adds the bookmarks and writes the result either to the
    file given with ``--outfile`` or, as raw bytes, to standard output.
    """
    import argparse
    import sys

    parser = argparse.ArgumentParser(description='Add bookmarks to a PDF file.')
    parser.add_argument('-f', '--offset', type=int, default=1,
                        help='Page number of page 1 in the file.')
    parser.add_argument('-o', '--outfile',
                        help='Output file name. Writes to stdout if none given.')
    parser.add_argument('-s', '--strip', action='store_true',
                        help='Strip chapter number from entry.')
    parser.add_argument('pdf', help='Input PDF file.')
    parser.add_argument('bookmarkfile', help='Bookmark file. Lines consist of '
                        'optionally a chapter number x.y.z, followed by the title '
                        'and the page number. Fields are separated by whitespace, '
                        'all runs of whitespace are replaced by single spaces in '
                        'the bookmarks. Upper levels must precede their children. '
                        'Chapterless entries can only be at the top level. '
                        'Encoding is UTF-8.')
    args = parser.parse_args()

    bookmarks = read_bookmarks(args.bookmarkfile, args.offset, args.strip)

    pdffile = pdf.PdfFileReader(args.pdf)
    writer = pdf.PdfFileWriter()
    writer.cloneDocumentFromReader(pdffile)
    # fake write to get a proper PDF
    # NOTE(review): the throwaway write presumably finalises the writer's
    # internal state after cloning -- confirm before removing it.
    writer.write(io.BytesIO())
    add_bookmarks(writer, bookmarks)

    # Serialise fully into memory first, so the output file is only opened
    # once the complete document has been produced.
    with io.BytesIO() as output:
        writer.write(output)
        data = output.getvalue()

    if args.outfile is not None:
        with open(args.outfile, 'wb') as f:
            f.write(data)
    else:
        # sys.stdout is a text stream on Python 3 and rejects bytes; the PDF
        # has to go to the underlying binary buffer.
        sys.stdout.buffer.write(data)
# Run the command-line interface only when executed as a script, not when
# the module is imported.
if __name__ == '__main__':
    _main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment