Skip to content

Instantly share code, notes, and snippets.

@jpeoples
Last active October 20, 2023 21:58
Show Gist options
  • Save jpeoples/99408e1898ca125d29cc65b6031b66a6 to your computer and use it in GitHub Desktop.
Save jpeoples/99408e1898ca125d29cc65b6031b66a6 to your computer and use it in GitHub Desktop.
Extract all top-level bookmarks in a PDF as separate chapter files (preserving page numbers, links, and further nested bookmarks)
import fitz # PyMuPDF
import re
import sys
import argparse
import os
def chapter_titles_and_page_ranges(pdf_document: "fitz.Document"):
    """Yield one ``(title, page_range, labels, children)`` tuple per top-level bookmark.

    Args:
        pdf_document: Object providing ``get_toc(simple=False)``,
            ``get_page_labels()`` and ``page_count``.

    Yields:
        title: Title of the level-1 bookmark.
        page_range: ``(start, stop)`` tuple of 0-based page indices; ``stop``
            is exclusive and equals the start of the next level-1 bookmark
            (or ``page_count`` for the last one).
        labels: Copies of the page-label dicts for the range, with
            ``'startpage'`` rebased to the range start. Empty when the label
            list does not contain exactly one entry per page.
        children: Nested bookmarks of this chapter as
            ``[level - 1, title, page - start]`` rows, followed by a rebased
            copy of the destination dict unless its ``'kind'`` is 4.

    Pages located before the first level-1 bookmark belong to no range.
    """
    # Full (non-simple) TOC rows carry a destination dict as their fourth item.
    toc = pdf_document.get_toc(simple=False)
    labels = pdf_document.get_page_labels()
    # Labels are sliced by page index further down, which only makes sense
    # when there is exactly one label entry per page.
    no_labels = len(labels) != pdf_document.page_count
    if no_labels and labels:
        # Only report when there actually are labels that get discarded.
        print("Cannot handle labels")

    titles = []
    pages = []
    children = []
    for level, title, page, dest in toc:
        if level == 1:
            titles.append(title)
            pg = dest.get('page')
            if pg is None:
                # ``page`` is 1-based; ranges below use 0-based indices.
                pg = page - 1
            pages.append(pg)
            # Every chapter owns its child list from the start, so the last
            # chapter is not lost when no further level-1 bookmark follows.
            children.append([])
        else:
            start = pages[-1]
            # Work on a copy so the caller's TOC data stays untouched.
            new_dest = dest.copy()
            if 'page' in new_dest:
                new_dest['page'] -= start
            # ``page`` stays 1-based after subtracting the 0-based start.
            entry = [level - 1, title, page - start]
            # NOTE(review): 4 appears to be PyMuPDF's LINK_NAMED kind - confirm.
            if new_dest['kind'] != 4:
                entry.append(new_dest)
            children[-1].append(entry)

    # Sentinel so the last chapter runs to the end of the document.
    pages.append(pdf_document.page_count)
    for index, title in enumerate(titles):
        rng = (pages[index], pages[index + 1])
        if no_labels:
            these_labels = []
        else:
            these_labels = [t.copy() for t in labels[rng[0]:rng[1]]]
            for t in these_labels:
                t['startpage'] -= rng[0]
        yield title, rng, these_labels, children[index]
def sanitize_filename(filename, replacement='_'):
    """Return *filename* made safe for use as a file name.

    Each character that Windows forbids in file names is substituted with
    *replacement*, surrounding spaces and periods are trimmed, remaining
    spaces become underscores, and an empty result falls back to
    ``'untitled'``.
    """
    # Characters Windows rejects in file names.
    forbidden = r'<>:"/\\|?*'
    forbidden_class = re.compile(f'[{re.escape(forbidden)}]')
    cleaned = forbidden_class.sub(replacement, filename)
    # Trim first, then turn the inner spaces into underscores.
    cleaned = cleaned.strip(' .').replace(" ", "_")
    # Never hand back an empty name.
    return cleaned or 'untitled'
def extract_chapters_from_top_level_bookmarks(pdf_file, output_dir, prefix):
    """Write one PDF per top-level bookmark of *pdf_file* into *output_dir*.

    Each output file receives the pages of its chapter, the page labels and
    the nested bookmarks reported by ``chapter_titles_and_page_ranges``.
    Files are named ``{prefix}_{NNN}_{first 20 chars of title}.pdf`` after
    passing through ``sanitize_filename``.

    Args:
        pdf_file: Path of the PDF to split.
        output_dir: Directory that receives the chapter files; it is not
            created here.
        prefix: Leading part of every output file name.
    """
    pdf_document = fitz.open(pdf_file)
    try:
        chapters = chapter_titles_and_page_ranges(pdf_document)
        for chap_num, (title, rng, labels, childs) in enumerate(chapters, start=1):
            first_page = rng[0]
            # ``rng`` is half-open. When two top-level bookmarks share a page
            # the range is empty and ``rng[1] - 1`` falls below ``first_page``;
            # per the PyMuPDF docs insert_pdf would then copy pages in reverse
            # order (or up to the last page for -1). Clamp to a single page.
            last_page = max(first_page, rng[1] - 1)
            name = sanitize_filename(f'{prefix}_{chap_num:03d}_{title[:20]}.pdf')
            chapter_pdf = fitz.open()
            try:
                chapter_pdf.insert_pdf(pdf_document, from_page=first_page, to_page=last_page)
                chapter_pdf.set_page_labels(labels)
                chapter_pdf.set_toc(childs)
                chapter_pdf.save(os.path.join(output_dir, name))
            finally:
                # Release the chapter document even if writing it failed.
                chapter_pdf.close()
    finally:
        # Close the original PDF document, also on errors.
        pdf_document.close()
if __name__ == "__main__":
    # Command-line entry point: split the given PDF into per-chapter files.
    parser = argparse.ArgumentParser(description="Extract chapters from a PDF based on top-level bookmarks.")
    parser.add_argument("pdf_file", help="Input PDF file")
    parser.add_argument('-o', "--output-dir", default="./output", help="Output directory for chapters (default: './output')")
    parser.add_argument('-p', "--prefix", default="Chapter", help="Prefix for output chapter filenames (default: 'Chapter')")
    args = parser.parse_args()
    # exist_ok replaces the separate existence check, which could race with
    # another process creating the directory in between.
    os.makedirs(args.output_dir, exist_ok=True)
    extract_chapters_from_top_level_bookmarks(args.pdf_file, args.output_dir, args.prefix)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment