Last active
October 20, 2023 21:58
-
-
Save jpeoples/99408e1898ca125d29cc65b6031b66a6 to your computer and use it in GitHub Desktop.
Extract all top-level bookmarks in a PDF (preserving page numbers, links, and further nested bookmarks)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import fitz # PyMuPDF | |
import re | |
import sys | |
import argparse | |
import os | |
def chapter_titles_and_page_ranges(pdf_document: "fitz.Document"):
    """Yield one ``(title, page_range, labels, children)`` tuple per top-level bookmark.

    Args:
        pdf_document: An open document exposing ``get_toc(simple=False)``,
            ``get_page_labels()`` and ``page_count``. The annotation is quoted
            so it is not evaluated when the function is defined.

    Yields:
        title: Title of the level-1 bookmark.
        page_range: ``(start, stop)`` 0-based page numbers, ``stop`` exclusive.
            A chapter runs up to the page of the next level-1 bookmark, or to
            ``page_count`` for the last one.
        labels: Copies of ``labels[start:stop]`` with ``'startpage'`` shifted
            by ``start``, or ``[]`` when the number of label entries differs
            from the page count.
        children: Nested bookmarks of the chapter as
            ``[level - 1, title, page]`` rows (plus a copy of the destination
            dict unless its ``'kind'`` is 4), with page numbers made relative
            to the chapter start.

    Raises:
        IndexError: If a nested bookmark appears before any level-1 bookmark.
    """
    # Get the outline information (full TOC rows, including destination dicts)
    toc = pdf_document.get_toc(simple=False)
    labels = pdf_document.get_page_labels()
    # NOTE(review): the slicing below treats `labels` as one entry per page;
    # anything else is reported and ignored. Confirm against what
    # get_page_labels() returns for the PDFs this is used on.
    no_labels = len(labels) != pdf_document.page_count
    if no_labels:
        print("Cannot handle labels")

    titles = []
    pages = []
    children = []
    for level, title, page, dest in toc:
        if level == 1:
            titles.append(title)
            pg = dest.get('page')
            if pg is None:
                # TOC page numbers are 1-based; fall back to them when the
                # destination carries no 0-based page of its own.
                pg = page - 1
            pages.append(pg)
            # Open this chapter's child list right away so that every chapter,
            # including the last one, has an entry in `children`.
            children.append([])
            continue

        chapter_start = pages[-1]
        new_dest = dest.copy()  # never mutate the rows returned by get_toc()
        if 'page' in new_dest:
            new_dest['page'] -= chapter_start
        entry = [level - 1, title, page - chapter_start]
        # Destinations of kind 4 are dropped and only the page number is kept
        # (original note: "xref rather than GOTO type").
        if new_dest['kind'] != 4:
            entry.append(new_dest)
        children[-1].append(entry)

    # Sentinel so the last chapter extends to the end of the document.
    pages.append(pdf_document.page_count)
    ranges = list(zip(pages[:-1], pages[1:]))

    for title, rng, childs in zip(titles, ranges, children):
        if no_labels:
            these_labels = []
        else:
            these_labels = [t.copy() for t in labels[rng[0]:rng[1]]]
        for t in these_labels:
            t['startpage'] -= rng[0]
        yield title, rng, these_labels, childs
def sanitize_filename(filename, replacement='_'):
    """Return *filename* made safe for use as a single file name.

    Args:
        filename: Proposed file name (not a path; separators are replaced).
        replacement: Text substituted for each invalid character. It is passed
            to ``re.sub`` as the replacement template.

    Returns:
        The sanitized name, or ``'untitled'`` if nothing is left.
    """
    # Characters not allowed in Windows file names, plus the ASCII control
    # characters (newline, tab, NUL, ...), which are equally invalid there.
    invalid_chars = r'[<>:"/\\|?*\x00-\x1f]'
    # Replace invalid characters with the specified replacement
    sanitized_filename = re.sub(invalid_chars, replacement, filename)
    # Remove leading and trailing whitespace and periods
    sanitized_filename = sanitized_filename.strip(' .')
    # Inner spaces always become underscores, independent of `replacement`
    sanitized_filename = sanitized_filename.replace(" ", "_")
    # Ensure the filename is not empty
    return sanitized_filename or 'untitled'
def extract_chapters_from_top_level_bookmarks(pdf_file, output_dir, prefix):
    """Write one PDF per top-level bookmark of *pdf_file* into *output_dir*.

    Each output file is named ``{prefix}_{NNN}_{title[:20]}.pdf`` (passed
    through ``sanitize_filename``), where ``NNN`` is the 1-based chapter
    number. Page labels and nested bookmarks yielded by
    ``chapter_titles_and_page_ranges`` are applied to every chapter file.

    Args:
        pdf_file: Path of the input PDF.
        output_dir: Directory that receives the chapter files; it must exist.
        prefix: Leading part of every output file name.
    """
    # Open the PDF file
    pdf_document = fitz.open(pdf_file)
    try:
        chapters = chapter_titles_and_page_ranges(pdf_document)
        for chap_num, (title, rng, labels, childs) in enumerate(chapters, start=1):
            first_page = rng[0]
            # rng is half-open. When two top-level bookmarks share a page the
            # range is empty (or inverted), and rng[1] - 1 would be smaller
            # than first_page; PyMuPDF's insert_pdf would not treat that as
            # "no pages". Keep at least the page the bookmark points to.
            last_page = max(first_page, rng[1] - 1)

            chapter_pdf = fitz.open()
            try:
                chapter_pdf.insert_pdf(pdf_document, from_page=first_page, to_page=last_page)
                chapter_pdf.set_page_labels(labels)
                chapter_pdf.set_toc(childs)
                out_name = sanitize_filename(f'{prefix}_{chap_num:03d}_{title[:20]}.pdf')
                chapter_pdf.save(os.path.join(output_dir, out_name))
            finally:
                # Release the chapter document even if building or saving failed.
                chapter_pdf.close()
    finally:
        # Close the original PDF document
        pdf_document.close()
if __name__ == "__main__":
    # Command-line entry point: split the given PDF into one file per
    # top-level bookmark.
    parser = argparse.ArgumentParser(description="Extract chapters from a PDF based on top-level bookmarks.")
    parser.add_argument("pdf_file", help="Input PDF file")
    parser.add_argument('-o', "--output-dir", default="./output", help="Output directory for chapters (default: './output')")
    parser.add_argument('-p', "--prefix", default="Chapter", help="Prefix for output chapter filenames (default: 'Chapter')")
    args = parser.parse_args()

    # exist_ok=True creates the directory when missing without the
    # check-then-create race of a separate os.path.exists() test.
    os.makedirs(args.output_dir, exist_ok=True)

    extract_chapters_from_top_level_bookmarks(args.pdf_file, args.output_dir, args.prefix)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment