Skip to content

Instantly share code, notes, and snippets.

@michaelstepner
Last active June 20, 2023 20:19
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save michaelstepner/5a5fedf4ad816118178e2262bc635f19 to your computer and use it in GitHub Desktop.
Save michaelstepner/5a5fedf4ad816118178e2262bc635f19 to your computer and use it in GitHub Desktop.
Split a PDF at a specific bookmark; preserve bookmarks in each output PDF (using pdftk)
Zero-Clause BSD
===============
Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted.
THE SOFTWARE IS PROVIDED “AS IS” AND THE AUTHOR DISCLAIMS ALL
WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES
OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE
FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY
DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
#!/usr/bin/env python3
import argparse, re, shutil, subprocess, sys, tempfile
from typing import List, Optional, Tuple
def parse_args() -> argparse.Namespace:
    """Parse the command-line arguments of the PDF splitter.

    Returns:
        The parsed namespace with the attributes ``input``, ``output1``,
        ``output2``, ``split_at_bookmark`` and ``delete_bookmark``.
    """
    # Build ONE parser. Creating a second ArgumentParser for the pdftk
    # requirement would throw away the description of what the tool does,
    # so the requirement is shown as the help epilog instead.
    parser = argparse.ArgumentParser(
        description='Split a PDF into two at a specific PDF bookmark, preserving other PDF bookmarks.',
        epilog='REQUIRES: pdftk installed and available on the command line path.',
    )
    parser.add_argument('-i', '--input', required=True, type=str, help='The input PDF file to split in two')
    parser.add_argument('-o1', '--output1', required=True, type=str, help='The first output PDF file')
    parser.add_argument('-o2', '--output2', required=True, type=str, help='The second output PDF file')
    parser.add_argument('--split-at-bookmark', required=True, type=str, help='The name of the bookmark at which to split the PDF')
    parser.add_argument('--delete-bookmark', action='store_true', help='Omit the specified bookmark from the output PDFs')
    return parser.parse_args()
def pdf_detect_bookmark(input_file: str, bookmark_name: str) -> Tuple[str, int]:
    """Locate a bookmark by title in a PDF's pdftk metadata dump.

    Runs ``pdftk <input_file> dump_data`` and searches the output for the
    first ``BookmarkTitle`` line equal to *bookmark_name*. The page number is
    read from the two lines that follow the title line.

    Args:
        input_file: Path of the PDF to inspect.
        bookmark_name: Bookmark title to look for; it is matched literally.

    Returns:
        A tuple of the complete ``dump_data`` output and the page number of
        the matched bookmark.

    Raises:
        RuntimeError: If pdftk exits with a non-zero status.
        ValueError: If no bookmark with that title exists, or the matched
            bookmark is not followed by a ``BookmarkPageNumber`` line.
    """
    pdf_metadata = subprocess.run(['pdftk', input_file, 'dump_data'], capture_output=True, text=True)
    if pdf_metadata.returncode != 0:
        print(pdf_metadata.stdout)
        # Diagnostics of a failed subprocess normally arrive on stderr, so
        # surface them as well instead of dropping them.
        if pdf_metadata.stderr:
            print(pdf_metadata.stderr, file=sys.stderr)
        raise RuntimeError(f'pdftk returned non-zero exit code {pdf_metadata.returncode}')

    # Search for the bookmark specified by the user and the next two lines.
    # The title is escaped so characters such as "(", ".", "+" or "?" are
    # matched literally rather than interpreted as regex syntax.
    # NOTE(review): pdftk's dump_data is documented to write non-ASCII
    # characters as XML numeric entities, so a title containing them may not
    # match as typed -- confirm whether dump_data_utf8 is needed.
    match = re.search(
        rf'^BookmarkTitle: {re.escape(bookmark_name)}\n.*\n.*',
        pdf_metadata.stdout,
        flags=re.MULTILINE,
    )
    if match is None:
        raise ValueError(f'Could not find PDF bookmark "{bookmark_name}" in {input_file}')

    # Find the page number among the matched lines
    for line in match.group(0).split('\n'):
        if 'BookmarkPageNumber' in line:
            split_at_page = int(line.split(': ')[1])
            print(f'Detected PDF bookmark "{bookmark_name}" on page {split_at_page}')
            return pdf_metadata.stdout, split_at_page

    # Without this the function would fall through and return None, which
    # only fails later when the caller unpacks the result.
    raise ValueError(f'Could not find the page number of PDF bookmark "{bookmark_name}" in {input_file}')
def pdf_metadata_split_bookmarks(input_metadata: str, split_at_page: int, delete_bookmark: Optional[str] = None) -> Tuple[str, str]:
    """Divide the bookmark records of a pdftk metadata dump at a page.

    A record starts at a ``BookmarkBegin`` line and ends at the next
    ``BookmarkPageNumber`` line. Records on pages before *split_at_page* are
    kept unchanged for the first output; all other records go to the second
    output with their page number rebased so that *split_at_page* becomes
    page 1. Lines outside bookmark records are ignored.

    Args:
        input_metadata: Text to scan, one metadata field per line.
        split_at_page: First page that belongs to the second output.
        delete_bookmark: If given, records located on *split_at_page* whose
            second line is ``BookmarkTitle: <delete_bookmark>`` are dropped.

    Returns:
        Two newline-joined strings holding the bookmark records of the first
        and second output, each passed through
        ``pdf_metadata_bookmarklevel_normalize``.
    """
    page_number_line = re.compile('^BookmarkPageNumber: [0-9]+')
    unwanted_title = f'BookmarkTitle: {delete_bookmark}'

    first_half: List[str] = []
    second_half: List[str] = []
    pending: Optional[List[str]] = None  # lines of the record being read, if any

    for entry in input_metadata.split('\n'):
        # Outside a record: only a BookmarkBegin line is of interest
        if pending is None:
            if entry == 'BookmarkBegin':
                pending = [entry]
            continue

        # Inside a record: collect lines until its page number shows up
        if not page_number_line.match(entry):
            pending.append(entry)
            continue

        page = int(entry.split(': ')[1])
        record, pending = pending, None

        # Optionally drop the bookmark the PDF is being split at
        if delete_bookmark and page == split_at_page and record[1] == unwanted_title:
            continue

        if page < split_at_page:
            first_half.extend(record + [entry])
        else:
            second_half.extend(record + [f'BookmarkPageNumber: {page - split_at_page + 1}'])

    # Ensure minimum bookmark level is 1 in each half
    first_half = pdf_metadata_bookmarklevel_normalize(first_half)
    second_half = pdf_metadata_bookmarklevel_normalize(second_half)
    return '\n'.join(first_half), '\n'.join(second_half)
def pdf_metadata_bookmarklevel_normalize(bookmarks: List[str]) -> List[str]:
    """Shift all ``BookmarkLevel`` values so that the smallest one becomes 1.

    Args:
        bookmarks: Metadata lines; only lines starting with
            ``BookmarkLevel: <digits>`` are changed.

    Returns:
        The same list object, modified in place, with every level line
        rewritten as ``BookmarkLevel: <level - minimum + 1>``. A list without
        level lines is returned untouched.
    """
    level_line = re.compile(r'^BookmarkLevel: ([0-9]+)')

    # Remember where each level line sits and which level it carries
    levels = {}
    for index, line in enumerate(bookmarks):
        found = level_line.match(line)
        if found:
            levels[index] = int(found.group(1))

    # Test for "no level lines" explicitly: a truthiness test on the minimum
    # would also skip the shift when the minimum level is 0.
    if not levels:
        return bookmarks

    # Ensure minimum bookmark level is 1
    shift = min(levels.values()) - 1
    for index, level in levels.items():
        bookmarks[index] = f'BookmarkLevel: {level - shift}'
    return bookmarks
def pdf_split_at_page(input_file: str, output_file1: str, output_file2: str, split_at_page: int) -> None:
    """Split a PDF into two files using ``pdftk ... cat``.

    Args:
        input_file: Path of the PDF to split.
        output_file1: Receives pages ``1`` to ``split_at_page - 1``.
        output_file2: Receives pages ``split_at_page`` to the end.
        split_at_page: First page of the second output; must be at least 2.

    Raises:
        ValueError: If *split_at_page* is below 2, which would leave no pages
            for the first output.
        RuntimeError: If pdftk exits with a non-zero status.
    """
    # Reject this before invoking pdftk: the first page range would be "1-0".
    if split_at_page < 2:
        raise ValueError(
            f'Cannot split {input_file} at page {split_at_page}: the first output PDF would contain no pages'
        )

    jobs = (
        (f'1-{split_at_page - 1}', output_file1),
        (f'{split_at_page}-end', output_file2),
    )
    for page_range, output_file in jobs:
        result = subprocess.run(
            ['pdftk', input_file, 'cat', page_range, 'output', output_file],
            capture_output=True,
            text=True,
        )
        if result.returncode != 0:
            print(result.stdout)
            # Show the captured stderr too, otherwise the reason for the
            # failure is lost.
            if result.stderr:
                print(result.stderr, file=sys.stderr)
            raise RuntimeError(f'pdftk returned non-zero exit code {result.returncode}')
        print(f'Created {output_file} containing pages {page_range} of {input_file}')
def pdf_insert_bookmarks(input_file: str, bookmarks: str) -> None:
    """Add bookmark records to a bookmark-free PDF, rewriting it in place.

    The PDF's metadata is dumped with ``pdftk dump_data``, *bookmarks* is
    inserted in front of the first ``PageMediaBegin`` record (or appended
    when there is none), and the result is applied with
    ``pdftk update_info``. The updated PDF is written to a temporary file and
    then copied over *input_file*.

    Args:
        input_file: Path of the PDF to modify.
        bookmarks: Newline-separated bookmark records in pdftk metadata
            format.

    Raises:
        ValueError: If the PDF's metadata already contains ``BookmarkBegin``.
        RuntimeError: If either pdftk invocation exits with a non-zero status.
    """
    # Obtain metadata for the PDF to be modified
    input_metadata = subprocess.run(['pdftk', input_file, 'dump_data'], capture_output=True, text=True)
    if input_metadata.returncode != 0:
        print(input_metadata.stdout)
        if input_metadata.stderr:
            print(input_metadata.stderr, file=sys.stderr)
        raise RuntimeError(f'pdftk returned non-zero exit code {input_metadata.returncode}')

    # Verify that there are no bookmarks in the input PDF
    if 'BookmarkBegin' in input_metadata.stdout:
        raise ValueError(f'Input PDF {input_file} already contains bookmarks')

    # Split the metadata into before/after bookmarks. partition() yields an
    # empty marker and tail when the dump has no PageMediaBegin record, so
    # that case degrades to appending the bookmarks instead of an IndexError.
    head, marker, tail = input_metadata.stdout.partition('PageMediaBegin')
    sections = [head.rstrip('\n'), bookmarks, (marker + tail).rstrip('\n')]
    updated_metadata = '\n'.join(section for section in sections if section) + '\n'

    # Insert the bookmarks into the metadata
    with tempfile.NamedTemporaryFile(mode='w', delete=True) as temp_metadata, tempfile.NamedTemporaryFile(mode='w', delete=True) as temp_pdf:
        temp_metadata.write(updated_metadata)
        # pdftk opens the file by name, so the buffered text has to reach the
        # disk before the subprocess starts.
        temp_metadata.flush()
        updated_pdf = subprocess.run(
            ['pdftk', input_file, 'update_info', temp_metadata.name, 'output', temp_pdf.name],
            capture_output=True,
            text=True,
        )
        if updated_pdf.returncode != 0:
            print(updated_pdf.stdout)
            if updated_pdf.stderr:
                print(updated_pdf.stderr, file=sys.stderr)
            raise RuntimeError(f'pdftk returned non-zero exit code {updated_pdf.returncode}')
        # Copy while the temporary PDF still exists (it is deleted on close)
        shutil.copy2(temp_pdf.name, input_file)
        print(f'Inserted PDF bookmarks into {input_file}')
def main() -> int:
    """Run the split: detect the bookmark, split the PDF, restore bookmarks.

    Returns:
        ``0`` once both output PDFs have been written and bookmarked.
    """
    cli = parse_args()

    # Metadata of the input PDF and the page of the requested bookmark
    metadata, page = pdf_detect_bookmark(input_file=cli.input, bookmark_name=cli.split_at_bookmark)

    # Bookmarks for each output; the split bookmark is only named for
    # deletion when the user asked for that
    split_kwargs = {'input_metadata': metadata, 'split_at_page': page}
    if cli.delete_bookmark:
        split_kwargs['delete_bookmark'] = cli.split_at_bookmark
    first_bookmarks, second_bookmarks = pdf_metadata_split_bookmarks(**split_kwargs)

    # Write the two PDFs
    pdf_split_at_page(
        input_file=cli.input,
        output_file1=cli.output1,
        output_file2=cli.output2,
        split_at_page=page,
    )

    # Put the matching bookmarks into each output, first file first
    for target, target_bookmarks in ((cli.output1, first_bookmarks), (cli.output2, second_bookmarks)):
        pdf_insert_bookmarks(input_file=target, bookmarks=target_bookmarks)

    return 0
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == '__main__':
    raise SystemExit(main())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment