Skip to content

Instantly share code, notes, and snippets.

@kspeeckaert
Last active September 16, 2019 11:07
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kspeeckaert/b51e029af2f92e05b4e7ca1700d341ad to your computer and use it in GitHub Desktop.
Save kspeeckaert/b51e029af2f92e05b4e7ca1700d341ad to your computer and use it in GitHub Desktop.
Extract the table of contents (outline) from a PDF file and save it as an OPML file, e.g. for import into a mind-mapping tool
# Requirements
# yattag==1.12.2
# PyPDF2==1.26.0
# Tested with Python 3.7.4 on macOS
import sys
from pathlib import Path
from PyPDF2 import PdfFileReader
from yattag import Doc
from yattag import indent
def dump_outline(doc, outline):
    """Emit one ``<outline>`` element per entry of a nested PDF outline.

    The outline is a flat list in which the children of the entry at
    position ``i`` are stored as a nested list at position ``i + 1``.
    Entries followed by such a list become an opening/closing tag pair
    wrapping their children; all other entries become self-closing tags.

    Args:
        doc: Object providing ``tag(name, **attrs)`` (a context manager)
            and ``stag(name, **attrs)``; ``main`` passes a yattag ``Doc``.
        outline: List whose items are either objects with a ``title``
            attribute or nested lists of the same shape.
    """
    idx = 0
    count = len(outline)
    while idx < count:
        entry = outline[idx]

        if isinstance(entry, list):
            # A nested list that is not preceded by a parent entry: there is
            # no title to hang it on, so emit its items at the current level
            # instead of failing on ``entry.title``.
            dump_outline(doc, entry)
            idx += 1
            continue

        # Explicit bounds check rather than catching IndexError, so that an
        # IndexError raised while processing the children is not swallowed.
        children = outline[idx + 1] if idx + 1 < count else None
        if isinstance(children, list):
            with doc.tag('outline', text=entry.title):
                dump_outline(doc, children)
            # Jump over the parent and its list of child nodes
            idx += 2
        else:
            # Use a self-closing tag, there are no child nodes
            doc.stag('outline', text=entry.title)
            idx += 1
def main(pdf_filename):
    """Write the outline of a PDF file to an OPML file next to it.

    The output file has the same path as the PDF but with an ``.opml``
    suffix. The OPML ``<title>`` is the PDF's document title, falling back
    to the PDF filename without extension when the document has no title.

    Args:
        pdf_filename: Path of the PDF file to read.

    Any exception is caught and reported on stdout as ``Error: <repr>``;
    nothing is re-raised and the function returns ``None``.
    """
    try:
        input_file = Path(pdf_filename)
        # The OPML file is saved in the same location as the PDF file
        output_file = input_file.with_suffix('.opml')

        # Keep the PDF open only while it is being read, and make sure the
        # handle is closed afterwards (the reader works on the open stream).
        with open(input_file, 'rb') as pdf_stream:
            pdf_doc = PdfFileReader(pdf_stream)
            doc, tag, _text, line = Doc().ttl()
            doc.asis('<?xml version="1.0" encoding="UTF-8"?>')

            # getDocumentInfo() may return None when the PDF carries no
            # document information at all.
            info = pdf_doc.getDocumentInfo()
            title = info.title if info is not None else None
            if not title:
                # No (or empty) title: use the PDF filename without extension
                title = input_file.stem

            with tag('opml', version='1.0'):
                with tag('head'):
                    line('title', title)
                with tag('body'):
                    dump_outline(doc, pdf_doc.outlines)

        # The XML declaration promises UTF-8, so do not rely on the
        # platform's default text encoding.
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(indent(doc.getvalue()))
    except Exception as e:
        print(f'Error: {e!r}')
if __name__ == '__main__':
    # The PDF filename (incl. path) is passed as an argument on the command line.
    # Exit with a usage message instead of an IndexError traceback when it is missing.
    if len(sys.argv) < 2:
        sys.exit(f'Usage: {sys.argv[0]} <pdf-file>')
    main(sys.argv[1])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment