Last active
June 20, 2023 20:19
-
-
Save michaelstepner/5a5fedf4ad816118178e2262bc635f19 to your computer and use it in GitHub Desktop.
Split a PDF at a specific bookmark; preserve bookmarks in each output PDF (using pdftk)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Zero-Clause BSD
===============
Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted.
THE SOFTWARE IS PROVIDED “AS IS” AND THE AUTHOR DISCLAIMS ALL
WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES
OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE
FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY
DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import argparse, re, shutil, subprocess, sys, tempfile | |
from typing import List, Optional, Tuple | |
def parse_args() -> argparse.Namespace:
    """Parse the command line arguments.

    Returns:
        The parsed namespace with ``input``, ``output1``, ``output2``,
        ``split_at_bookmark`` and ``delete_bookmark`` attributes.
    """
    # Build a single parser. Creating a second ArgumentParser would throw the
    # first one (and its description) away, so the pdftk requirement is
    # carried by the epilog and both texts appear in --help.
    parser = argparse.ArgumentParser(
        description='Split a PDF into two at a specific PDF bookmark, preserving other PDF bookmarks.',
        epilog='REQUIRES: pdftk installed and available on the command line path.',
    )
    parser.add_argument('-i', '--input', required=True, type=str, help='The input PDF file to split in two')
    parser.add_argument('-o1', '--output1', required=True, type=str, help='The first output PDF file')
    parser.add_argument('-o2', '--output2', required=True, type=str, help='The second output PDF file')
    parser.add_argument('--split-at-bookmark', required=True, type=str, help='The name of the bookmark at which to split the PDF')
    parser.add_argument('--delete-bookmark', action='store_true', help='Omit the specified bookmark from the output PDFs')
    return parser.parse_args()
def pdf_detect_bookmark(input_file: str, bookmark_name: str) -> Tuple[str, int]:
    """Locate a bookmark by title in a PDF and report the page it points to.

    Runs ``pdftk <input_file> dump_data`` and searches its output for a
    ``BookmarkTitle`` line with exactly the requested title.

    Args:
        input_file: Path of the PDF to inspect.
        bookmark_name: Title of the bookmark to look for (matched literally).

    Returns:
        A tuple of the full metadata dump produced by pdftk and the page
        number of the first bookmark with the requested title.

    Raises:
        RuntimeError: If pdftk exits with a non-zero status.
        ValueError: If no bookmark with that title exists, or if the matched
            bookmark record carries no page number.
    """
    pdf_metadata = subprocess.run(['pdftk', input_file, 'dump_data'], capture_output=True, text=True)
    if pdf_metadata.returncode != 0:
        print(pdf_metadata.stdout)
        # Output is captured, so the diagnostics have to be re-emitted by hand.
        print(pdf_metadata.stderr, file=sys.stderr)
        raise RuntimeError(f'pdftk returned non-zero exit code {pdf_metadata.returncode}')

    # Match the title line plus the two lines that follow it. The title is
    # escaped so characters such as "(", "[" or "." are taken literally.
    match = re.search(
        rf'^BookmarkTitle: {re.escape(bookmark_name)}\n.*\n.*',
        pdf_metadata.stdout,
        flags=re.MULTILINE,
    )
    if match is None:
        raise ValueError(f'Could not find PDF bookmark "{bookmark_name}" in {input_file}')

    # Skip the title line itself so its text can never be mistaken for the
    # page-number field.
    for line in match.group(0).split('\n')[1:]:
        if 'BookmarkPageNumber' in line:
            split_at_page = int(line.split(': ')[1])
            print(f'Detected PDF bookmark "{bookmark_name}" on page {split_at_page}')
            return pdf_metadata.stdout, split_at_page

    # Fail loudly instead of implicitly returning None to the caller.
    raise ValueError(f'PDF bookmark "{bookmark_name}" in {input_file} has no page number')
def pdf_metadata_split_bookmarks(input_metadata: str, split_at_page: int, delete_bookmark: Optional[str] = None) -> Tuple[str, str]:
    """Partition the bookmark records of a metadata dump around a split page.

    A record starts at a ``BookmarkBegin`` line and ends at the next
    ``BookmarkPageNumber`` line; every other line of the dump is ignored.

    Args:
        input_metadata: Metadata text, one field per line.
        split_at_page: First page that belongs to the second output.
        delete_bookmark: If given, records on ``split_at_page`` whose second
            line is ``BookmarkTitle: <delete_bookmark>`` are dropped.

    Returns:
        Two newline-joined strings: the records pointing before
        ``split_at_page`` (page numbers untouched) and the records pointing
        at or after it (page numbers rebased so the split page becomes 1).
        Bookmark levels in each half are normalised by
        ``pdf_metadata_bookmarklevel_normalize``.
    """
    page_line = re.compile('^BookmarkPageNumber: [0-9]+')
    unwanted_title = f'BookmarkTitle: {delete_bookmark}' if delete_bookmark else None

    before_split: List[str] = []
    from_split: List[str] = []
    pending: Optional[List[str]] = None  # None while outside a bookmark record

    for text in input_metadata.split('\n'):
        if pending is None:
            # Outside a record only a BookmarkBegin line is of interest.
            if text == 'BookmarkBegin':
                pending = [text]
            continue

        if not page_line.match(text):
            # Still inside the record: keep collecting its fields.
            pending.append(text)
            continue

        # The page-number line closes the record.
        page = int(text.split(': ')[1])
        record, pending = pending, None

        # Optionally drop the bookmark that marks the split itself.
        if page == split_at_page and unwanted_title is not None and record[1] == unwanted_title:
            continue

        if page < split_at_page:
            before_split.extend(record + [text])
        else:
            from_split.extend(record + [f'BookmarkPageNumber: {page - split_at_page + 1}'])

    # Ensure the shallowest bookmark level in each half is 1.
    before_split = pdf_metadata_bookmarklevel_normalize(before_split)
    from_split = pdf_metadata_bookmarklevel_normalize(from_split)
    return '\n'.join(before_split), '\n'.join(from_split)
def pdf_metadata_bookmarklevel_normalize(bookmarks: List[str]) -> List[str]:
    """Shift every ``BookmarkLevel`` so that the shallowest level becomes 1.

    Args:
        bookmarks: Bookmark metadata lines; modified in place.

    Returns:
        The same list object that was passed in, with its ``BookmarkLevel``
        lines rewritten. Lists without any level line are returned as is.
    """
    level_line = re.compile('^BookmarkLevel: [0-9]+')

    # Map the position of every level line to the level it declares.
    levels = {
        position: int(text.split(': ')[1])
        for position, text in enumerate(bookmarks)
        if level_line.match(text)
    }

    shallowest = min(levels.values(), default=None)
    if shallowest:
        for position, level in levels.items():
            bookmarks[position] = f'BookmarkLevel: {level - shallowest + 1}'
    return bookmarks
def pdf_split_at_page(input_file: str, output_file1: str, output_file2: str, split_at_page: int) -> None:
    """Split a PDF into two files at a given page using pdftk.

    Args:
        input_file: Path of the PDF to split.
        output_file1: Receives pages ``1`` to ``split_at_page - 1``.
        output_file2: Receives pages ``split_at_page`` to the end.
        split_at_page: First page of the second output; must be 2 or greater
            so that the first output is not empty.

    Raises:
        ValueError: If ``split_at_page`` is smaller than 2.
        RuntimeError: If pdftk exits with a non-zero status.
    """
    # A split on page 1 would request the page range "1-0", which is not a
    # valid range; reject it up front with a clear message.
    if split_at_page < 2:
        raise ValueError(
            f'Cannot split {input_file} at page {split_at_page}: the first output PDF would contain no pages'
        )

    jobs = (
        (output_file1, f'1-{split_at_page - 1}'),
        (output_file2, f'{split_at_page}-end'),
    )
    for output_file, page_range in jobs:
        result = subprocess.run(
            ['pdftk', input_file, 'cat', page_range, 'output', output_file],
            capture_output=True,
            text=True,
        )
        if result.returncode != 0:
            print(result.stdout)
            # Output is captured, so the diagnostics have to be re-emitted by hand.
            print(result.stderr, file=sys.stderr)
            raise RuntimeError(f'pdftk returned non-zero exit code {result.returncode}')
        print(f'Created {output_file} containing pages {page_range} of {input_file}')
def pdf_insert_bookmarks(input_file: str, bookmarks: str) -> None:
    """Insert bookmark metadata into a PDF that has no bookmarks, in place.

    The PDF's metadata is dumped with ``pdftk dump_data``, the bookmark text
    is spliced in ahead of the ``PageMediaBegin`` section (or appended when
    the dump has no such section), and ``pdftk update_info`` writes the
    result to a temporary file whose contents then replace ``input_file``.

    Args:
        input_file: Path of the PDF to modify.
        bookmarks: Bookmark records in pdftk metadata format.

    Raises:
        RuntimeError: If pdftk exits with a non-zero status.
        ValueError: If the PDF already contains bookmarks.
    """
    # Obtain metadata for the PDF to be modified
    input_metadata = subprocess.run(['pdftk', input_file, 'dump_data'], capture_output=True, text=True)
    if input_metadata.returncode != 0:
        print(input_metadata.stdout)
        # Output is captured, so the diagnostics have to be re-emitted by hand.
        print(input_metadata.stderr, file=sys.stderr)
        raise RuntimeError(f'pdftk returned non-zero exit code {input_metadata.returncode}')

    # Verify that there are no bookmarks in the input PDF
    if 'BookmarkBegin' in input_metadata.stdout:
        raise ValueError(f'Input PDF {input_file} already contains bookmarks')

    # Split the metadata into before/after bookmarks. partition() yields an
    # empty marker when the dump has no PageMediaBegin section, in which case
    # the bookmarks are simply appended to the end.
    head, marker, tail = input_metadata.stdout.partition('PageMediaBegin')
    sections = [head.rstrip('\n'), bookmarks]
    if marker:
        sections.append(marker + tail.rstrip('\n'))
    updated_metadata = '\n'.join(section for section in sections if section) + '\n'

    with tempfile.NamedTemporaryFile(mode='w', delete=True) as temp_metadata, tempfile.NamedTemporaryFile(mode='w', delete=True) as temp_pdf:
        temp_metadata.write(updated_metadata)
        # pdftk opens the file by name, so the buffer must reach disk first.
        temp_metadata.flush()
        updated_pdf = subprocess.run(['pdftk', input_file, 'update_info', temp_metadata.name, 'output', temp_pdf.name], capture_output=True, text=True)
        if updated_pdf.returncode != 0:
            print(updated_pdf.stdout)
            print(updated_pdf.stderr, file=sys.stderr)
            raise RuntimeError(f'pdftk returned non-zero exit code {updated_pdf.returncode}')
        # Copy the contents only: copy2 would also stamp the temporary file's
        # permission bits and timestamps onto the output PDF.
        shutil.copyfile(temp_pdf.name, input_file)
        print(f'Inserted PDF bookmarks into {input_file}')
def main() -> int:
    """Run the whole split: detect the bookmark, split the PDF, restore bookmarks.

    Returns:
        ``0`` once both output PDFs have been written and bookmarked.
    """
    cli = parse_args()

    # Dump the input PDF's metadata and find the page of the split bookmark.
    metadata, page = pdf_detect_bookmark(input_file=cli.input, bookmark_name=cli.split_at_bookmark)

    # Divide the bookmarks between the two outputs; the split bookmark is
    # only named for deletion when the user asked for it.
    split_kwargs = {'input_metadata': metadata, 'split_at_page': page}
    if cli.delete_bookmark:
        split_kwargs['delete_bookmark'] = cli.split_at_bookmark
    first_half, second_half = pdf_metadata_split_bookmarks(**split_kwargs)

    # Write the two PDFs, then put each one's bookmarks back.
    pdf_split_at_page(input_file=cli.input, output_file1=cli.output1, output_file2=cli.output2, split_at_page=page)
    for output_path, output_bookmarks in ((cli.output1, first_half), (cli.output2, second_half)):
        pdf_insert_bookmarks(input_file=output_path, bookmarks=output_bookmarks)
    return 0
# Script entry point: exit with the status code returned by main().
if __name__ == '__main__':
    raise SystemExit(main())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment