michaelstepner/LICENSE

## LICENSE
Zero-Clause BSD
===============

Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted.

THE SOFTWARE IS PROVIDED “AS IS” AND THE AUTHOR DISCLAIMS ALL
WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES
OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE
FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY
DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

## split_pdf.py
#!/usr/bin/env python3
import argparse, re, shutil, subprocess, sys, tempfile
from typing import List, Optional, Tuple

def parse_args() -> argparse.Namespace:
    """Parse the command-line arguments of the PDF splitter.

    The arguments are taken from ``sys.argv`` by ``argparse``; a missing
    required option makes ``argparse`` print a usage message and exit.

    Returns:
        The parsed namespace with the attributes ``input``, ``output1``,
        ``output2``, ``split_at_bookmark`` and ``delete_bookmark``.
    """
    # Build the parser exactly once. The pdftk requirement lives in the epilog
    # so that it is shown together with the description instead of replacing it.
    parser = argparse.ArgumentParser(
        description='Split a PDF into two at a specific PDF bookmark, preserving other PDF bookmarks.',
        epilog='REQUIRES: pdftk installed and available on the command line path.',
    )
    parser.add_argument('-i', '--input', required=True, type=str, help='The input PDF file to split in two')
    parser.add_argument('-o1', '--output1', required=True, type=str, help='The first output PDF file')
    parser.add_argument('-o2', '--output2', required=True, type=str, help='The second output PDF file')
    parser.add_argument('--split-at-bookmark', required=True, type=str, help='The name of the bookmark at which to split the PDF')
    parser.add_argument('--delete-bookmark', action='store_true', help='Omit the specified bookmark from the output PDFs')
    return parser.parse_args()

def pdf_detect_bookmark(input_file: str, bookmark_name: str) -> Tuple[str, int]:
    """Find a bookmark in a PDF and report the page it points to.

    Runs ``pdftk <input_file> dump_data`` and searches the dumped metadata for
    a ``BookmarkTitle`` line whose value is exactly ``bookmark_name``. The page
    number is read from the two lines that follow the title line.

    Args:
        input_file: Path of the PDF to inspect.
        bookmark_name: Title of the bookmark to look for (matched literally).

    Returns:
        A tuple ``(metadata, page)``: the complete text printed by
        ``pdftk dump_data`` and the page number of the bookmark.

    Raises:
        RuntimeError: If pdftk exits with a non-zero status.
        ValueError: If no bookmark with that title exists, or if no
            ``BookmarkPageNumber`` line follows the matching title.
    """
    pdf_metadata = subprocess.run(['pdftk', input_file, 'dump_data'], capture_output=True, text=True)
    if pdf_metadata.returncode != 0:
        print(pdf_metadata.stdout)
        # capture_output=True also captures stderr; surface it so the reason
        # for the failure is not lost.
        if pdf_metadata.stderr:
            print(pdf_metadata.stderr, file=sys.stderr)
        raise RuntimeError(f'pdftk returned non-zero exit code {pdf_metadata.returncode}')

    # Search for the bookmark specified by the user and the next two lines.
    # The title is escaped because it is literal text, not a regex: characters
    # such as "(", "+" or "?" in a bookmark name must match themselves.
    match = re.search(
        rf'^BookmarkTitle: {re.escape(bookmark_name)}\n.*\n.*',
        pdf_metadata.stdout,
        flags=re.MULTILINE,
    )
    if not match:
        raise ValueError(f'Could not find PDF bookmark "{bookmark_name}" in {input_file}')

    # Skip the title line itself and look for the page number in what follows
    for line in match.group(0).split('\n')[1:]:
        if 'BookmarkPageNumber' in line:
            split_at_page = int(line.split(': ')[1])
            print(f'Detected PDF bookmark "{bookmark_name}" on page {split_at_page}')
            return pdf_metadata.stdout, split_at_page

    # Previously this case fell through and returned None, which only failed
    # later when the caller unpacked the result.
    raise ValueError(f'Could not determine the page number of PDF bookmark "{bookmark_name}" in {input_file}')

def pdf_metadata_split_bookmarks(input_metadata: str, split_at_page: int, delete_bookmark: Optional[str] = None) -> Tuple[str, str]:
    """Divide the bookmark records of a metadata dump between two output PDFs.

    A bookmark record starts at a ``BookmarkBegin`` line and is complete once
    its ``BookmarkPageNumber`` line is seen. Records pointing before
    ``split_at_page`` go to the first output unchanged; all others go to the
    second output with their page number re-based so that ``split_at_page``
    becomes page 1. Lines outside bookmark records are ignored.

    Args:
        input_metadata: Newline-separated metadata text.
        split_at_page: First page that belongs to the second output.
        delete_bookmark: When given, a record located exactly on
            ``split_at_page`` whose second line is
            ``BookmarkTitle: <delete_bookmark>`` is dropped.

    Returns:
        Two newline-joined strings of bookmark records, for the first and the
        second output, each passed through
        ``pdf_metadata_bookmarklevel_normalize``.
    """
    before_split: List[str] = []
    from_split: List[str] = []
    unwanted_title = f'BookmarkTitle: {delete_bookmark}'

    # Lines of the record being collected; None while outside any record
    pending: Optional[List[str]] = None

    for record_line in input_metadata.split('\n'):
        if pending is None:
            # Outside a record only a BookmarkBegin line is of interest
            if re.match('^BookmarkBegin$', record_line):
                pending = [record_line]
            continue

        if not re.match('^BookmarkPageNumber: [0-9]+', record_line):
            pending.append(record_line)
            continue

        # The page number line closes the record
        page = int(record_line.split(': ')[1])

        if delete_bookmark and page == split_at_page and pending[1] == unwanted_title:
            # This is the bookmark the caller asked to omit
            pending = None
            continue

        if page < split_at_page:
            before_split.extend(pending + [record_line])
        else:
            rebased = f'BookmarkPageNumber: {page - split_at_page + 1}'
            from_split.extend(pending + [rebased])
        pending = None

    # Make sure each half starts its bookmark hierarchy at level 1
    before_split = pdf_metadata_bookmarklevel_normalize(before_split)
    from_split = pdf_metadata_bookmarklevel_normalize(from_split)

    first_half = '\n'.join(before_split)
    second_half = '\n'.join(from_split)
    return first_half, second_half

def pdf_metadata_bookmarklevel_normalize(bookmarks: List[str]) -> List[str]:
    """Shift all bookmark levels so that the smallest level becomes 1.

    Every ``BookmarkLevel: <n>`` line is rewritten relative to the smallest
    level found. The list is modified in place and also returned. Nothing is
    changed when the list has no level lines or when the smallest level is 0.

    Args:
        bookmarks: Lines of bookmark records.

    Returns:
        The same list object that was passed in.
    """
    level_line = re.compile('^BookmarkLevel: [0-9]+')

    # Map the position of every level line to the level it declares
    levels = {
        position: int(entry.split(': ')[1])
        for position, entry in enumerate(bookmarks)
        if level_line.match(entry)
    }
    shallowest = min(levels.values(), default=None)

    # Rewrite each level relative to the shallowest one
    if shallowest:
        for position, level in levels.items():
            bookmarks[position] = f'BookmarkLevel: {level - shallowest + 1}'

    return bookmarks

def pdf_split_at_page(input_file: str, output_file1: str, output_file2: str, split_at_page: int) -> None:
    """Split a PDF into two files at a page boundary using pdftk.

    Pages ``1`` to ``split_at_page - 1`` are written to ``output_file1`` and
    pages ``split_at_page`` to the end are written to ``output_file2``, each
    with a ``pdftk <input_file> cat <range> output <file>`` call.

    Args:
        input_file: Path of the PDF to split.
        output_file1: Path of the PDF receiving the pages before the split.
        output_file2: Path of the PDF receiving the pages from the split on.
        split_at_page: First page of the second output.

    Raises:
        ValueError: If ``split_at_page`` is smaller than 2, because the first
            output would then contain no pages.
        RuntimeError: If pdftk exits with a non-zero status.
    """
    # Reject the degenerate split up front: it would ask pdftk for the page
    # range "1-0" and could only fail after pdftk had already been started.
    if split_at_page < 2:
        raise ValueError(f'Cannot split {input_file} at page {split_at_page}: the first output PDF would contain no pages')

    page_ranges = (
        (output_file1, f'1-{split_at_page - 1}'),
        (output_file2, f'{split_at_page}-end'),
    )
    for output_file, page_range in page_ranges:
        result = subprocess.run(['pdftk', input_file, 'cat', page_range, 'output', output_file], capture_output=True, text=True)
        if result.returncode != 0:
            print(result.stdout)
            # capture_output=True also captures stderr; surface it so the
            # reason for the failure is not lost.
            if result.stderr:
                print(result.stderr, file=sys.stderr)
            raise RuntimeError(f'pdftk returned non-zero exit code {result.returncode}')
        print(f'Created {output_file} containing pages {page_range} of {input_file}')

def pdf_insert_bookmarks(input_file: str, bookmarks: str) -> None:
    """Insert bookmark records into a PDF that does not have any yet.

    The current metadata is read with ``pdftk dump_data``, the bookmark
    records are placed in front of the first ``PageMediaBegin`` record, and
    the result is applied with ``pdftk update_info``. The updated PDF is
    written to a temporary file and then copied over ``input_file``.

    Args:
        input_file: Path of the PDF to modify in place.
        bookmarks: Newline-separated bookmark records to insert.

    Raises:
        RuntimeError: If pdftk exits with a non-zero status.
        ValueError: If the PDF already contains bookmarks.
    """
    # Obtain metadata for the PDF to be modified
    input_metadata = subprocess.run(['pdftk', input_file, 'dump_data'], capture_output=True, text=True)
    if input_metadata.returncode != 0:
        print(input_metadata.stdout)
        # capture_output=True also captures stderr; surface it so the reason
        # for the failure is not lost.
        if input_metadata.stderr:
            print(input_metadata.stderr, file=sys.stderr)
        raise RuntimeError(f'pdftk returned non-zero exit code {input_metadata.returncode}')

    # Verify that there are no bookmarks in the input PDF
    if 'BookmarkBegin' in input_metadata.stdout:
        raise ValueError(f'Input PDF {input_file} already contains bookmarks')

    # Split the metadata into the part before and the part from the first
    # PageMediaBegin record on. partition() leaves `marker` empty when the dump
    # has no such record; indexing the result of split() raised IndexError then.
    head, marker, tail = input_metadata.stdout.partition('PageMediaBegin')
    sections = [head.rstrip('\n')]
    if bookmarks:
        sections.append(bookmarks)
    if marker:
        sections.append(marker + tail.rstrip('\n'))
    # NOTE(review): without a PageMediaBegin record the bookmarks end up at the
    # end of the info file; presumably pdftk accepts them there - TODO confirm.

    with tempfile.NamedTemporaryFile(mode='w', delete=True) as temp_metadata, tempfile.NamedTemporaryFile(mode='w', delete=True) as temp_pdf:
        temp_metadata.write('\n'.join(sections) + '\n')
        # pdftk opens the file by name, so the buffered text must be on disk
        temp_metadata.flush()

        updated_pdf = subprocess.run(['pdftk', input_file, 'update_info', temp_metadata.name, 'output', temp_pdf.name], capture_output=True, text=True)
        if updated_pdf.returncode != 0:
            print(updated_pdf.stdout)
            if updated_pdf.stderr:
                print(updated_pdf.stderr, file=sys.stderr)
            raise RuntimeError(f'pdftk returned non-zero exit code {updated_pdf.returncode}')

        # Replace the original only after pdftk has succeeded
        shutil.copy2(temp_pdf.name, input_file)
        print(f'Inserted PDF bookmarks into {input_file}')

def main() -> int:
    """Run the complete split: locate the bookmark, split pages and bookmarks.

    Returns:
        ``0`` once both output PDFs have been created and their bookmarks
        inserted; failures surface as exceptions from the called functions.
    """
    options = parse_args()

    # Dump the metadata of the input PDF and find the page to split at
    metadata, page = pdf_detect_bookmark(input_file=options.input, bookmark_name=options.split_at_bookmark)

    # Hand the bookmark title over only when it should be left out
    title_to_drop = options.split_at_bookmark if options.delete_bookmark else None
    first_bookmarks, second_bookmarks = pdf_metadata_split_bookmarks(
        input_metadata=metadata,
        split_at_page=page,
        delete_bookmark=title_to_drop,
    )

    # Write the two page ranges to their own files
    pdf_split_at_page(input_file=options.input, output_file1=options.output1, output_file2=options.output2, split_at_page=page)

    # Give each output file its share of the bookmarks
    for target, records in ((options.output1, first_bookmarks), (options.output2, second_bookmarks)):
        pdf_insert_bookmarks(input_file=target, bookmarks=records)

    return 0

# Run the workflow only when this file is executed as a script, and use the
# return value of main() as the process exit status.
if __name__ == '__main__':
    sys.exit(main())
	Zero-Clause BSD
	===============

	Permission to use, copy, modify, and/or distribute this software for
	any purpose with or without fee is hereby granted.

	THE SOFTWARE IS PROVIDED “AS IS” AND THE AUTHOR DISCLAIMS ALL
	WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES
	OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE
	FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY
	DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
	AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
	OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
	#!/usr/bin/env python3
	import argparse, re, shutil, subprocess, sys, tempfile
	from typing import List, Optional, Tuple

	def parse_args() -> argparse.Namespace:
	    """Parse the command-line arguments of the PDF splitter.

	    The arguments are taken from ``sys.argv`` by ``argparse``; a missing
	    required option makes ``argparse`` print a usage message and exit.

	    Returns:
	        The parsed namespace with the attributes ``input``, ``output1``,
	        ``output2``, ``split_at_bookmark`` and ``delete_bookmark``.
	    """
	    # Build the parser exactly once. The pdftk requirement lives in the epilog
	    # so that it is shown together with the description instead of replacing it.
	    parser = argparse.ArgumentParser(
	        description='Split a PDF into two at a specific PDF bookmark, preserving other PDF bookmarks.',
	        epilog='REQUIRES: pdftk installed and available on the command line path.',
	    )
	    parser.add_argument('-i', '--input', required=True, type=str, help='The input PDF file to split in two')
	    parser.add_argument('-o1', '--output1', required=True, type=str, help='The first output PDF file')
	    parser.add_argument('-o2', '--output2', required=True, type=str, help='The second output PDF file')
	    parser.add_argument('--split-at-bookmark', required=True, type=str, help='The name of the bookmark at which to split the PDF')
	    parser.add_argument('--delete-bookmark', action='store_true', help='Omit the specified bookmark from the output PDFs')
	    return parser.parse_args()

	def pdf_detect_bookmark(input_file: str, bookmark_name: str) -> Tuple[str, int]:
	    """Find a bookmark in a PDF and report the page it points to.

	    Runs ``pdftk <input_file> dump_data`` and searches the dumped metadata for
	    a ``BookmarkTitle`` line whose value is exactly ``bookmark_name``. The page
	    number is read from the two lines that follow the title line.

	    Args:
	        input_file: Path of the PDF to inspect.
	        bookmark_name: Title of the bookmark to look for (matched literally).

	    Returns:
	        A tuple ``(metadata, page)``: the complete text printed by
	        ``pdftk dump_data`` and the page number of the bookmark.

	    Raises:
	        RuntimeError: If pdftk exits with a non-zero status.
	        ValueError: If no bookmark with that title exists, or if no
	            ``BookmarkPageNumber`` line follows the matching title.
	    """
	    pdf_metadata = subprocess.run(['pdftk', input_file, 'dump_data'], capture_output=True, text=True)
	    if pdf_metadata.returncode != 0:
	        print(pdf_metadata.stdout)
	        # capture_output=True also captures stderr; surface it so the reason
	        # for the failure is not lost.
	        if pdf_metadata.stderr:
	            print(pdf_metadata.stderr, file=sys.stderr)
	        raise RuntimeError(f'pdftk returned non-zero exit code {pdf_metadata.returncode}')

	    # Search for the bookmark specified by the user and the next two lines.
	    # Each following line is matched with ".*" (a bare "." would only accept
	    # one-character lines). The title is escaped because it is literal text,
	    # not a regex.
	    match = re.search(
	        rf'^BookmarkTitle: {re.escape(bookmark_name)}\n.*\n.*',
	        pdf_metadata.stdout,
	        flags=re.MULTILINE,
	    )
	    if not match:
	        raise ValueError(f'Could not find PDF bookmark "{bookmark_name}" in {input_file}')

	    # Skip the title line itself and look for the page number in what follows
	    for line in match.group(0).split('\n')[1:]:
	        if 'BookmarkPageNumber' in line:
	            split_at_page = int(line.split(': ')[1])
	            print(f'Detected PDF bookmark "{bookmark_name}" on page {split_at_page}')
	            return pdf_metadata.stdout, split_at_page

	    # Without this the function would fall through and return None
	    raise ValueError(f'Could not determine the page number of PDF bookmark "{bookmark_name}" in {input_file}')

	def pdf_metadata_split_bookmarks(input_metadata: str, split_at_page: int, delete_bookmark: Optional[str] = None) -> Tuple[str, str]:
	    """Divide the bookmark records of a metadata dump between two output PDFs.

	    A bookmark record starts at a ``BookmarkBegin`` line and is complete once
	    its ``BookmarkPageNumber`` line is seen. Records pointing before
	    ``split_at_page`` go to the first output unchanged; all others go to the
	    second output with their page number re-based so that ``split_at_page``
	    becomes page 1. Lines outside bookmark records are ignored.

	    Args:
	        input_metadata: Newline-separated metadata text.
	        split_at_page: First page that belongs to the second output.
	        delete_bookmark: When given, a record located exactly on
	            ``split_at_page`` whose second line is
	            ``BookmarkTitle: <delete_bookmark>`` is dropped.

	    Returns:
	        Two newline-joined strings of bookmark records, for the first and the
	        second output, each passed through
	        ``pdf_metadata_bookmarklevel_normalize``.
	    """
	    metadata_list = input_metadata.split('\n')
	    bookmarks1 = []
	    bookmarks2 = []

	    bookmark_block = []
	    in_bookmark_block = False

	    for line in metadata_list:
	        if in_bookmark_block:
	            if re.match('^BookmarkPageNumber: [0-9]+', line):

	                # Obtain page number
	                page_number = int(line.split(': ')[1])

	                # Optionally delete the specified bookmark
	                if page_number == split_at_page and delete_bookmark:
	                    if bookmark_block[1] == f'BookmarkTitle: {delete_bookmark}':
	                        bookmark_block = []
	                        in_bookmark_block = False
	                        continue

	                # Split the bookmarks in two
	                if page_number < split_at_page:
	                    bookmark_block.append(line)
	                    bookmarks1.extend(bookmark_block)
	                else:
	                    # Re-base the page number so the split page becomes page 1
	                    bookmark_block.append(f'BookmarkPageNumber: {page_number - split_at_page + 1}')
	                    bookmarks2.extend(bookmark_block)

	                # Reset the bookmark block
	                bookmark_block = []
	                in_bookmark_block = False
	            else:
	                bookmark_block.append(line)
	        else:
	            if re.match('^BookmarkBegin$', line):
	                bookmark_block.append(line)
	                in_bookmark_block = True

	    # Ensure minimum bookmark level is 1
	    bookmarks1 = pdf_metadata_bookmarklevel_normalize(bookmarks1)
	    bookmarks2 = pdf_metadata_bookmarklevel_normalize(bookmarks2)

	    return '\n'.join(bookmarks1), '\n'.join(bookmarks2)

	def pdf_metadata_bookmarklevel_normalize(bookmarks: List[str]) -> List[str]:
	    """Shift all bookmark levels so that the smallest level becomes 1.

	    Every ``BookmarkLevel: <n>`` line is rewritten relative to the smallest
	    level found. The list is modified in place and also returned. Nothing is
	    changed when the list has no level lines or when the smallest level is 0.

	    Args:
	        bookmarks: Lines of bookmark records.

	    Returns:
	        The same list object that was passed in.
	    """
	    # Calculate minimum bookmark level
	    min_bookmark_level = None
	    for line in bookmarks:
	        if re.match('^BookmarkLevel: [0-9]+', line):
	            bookmark_level = int(line.split(': ')[1])
	            if min_bookmark_level is None:
	                min_bookmark_level = bookmark_level
	            elif bookmark_level < min_bookmark_level:
	                min_bookmark_level = bookmark_level

	    # Ensure minimum bookmark level is 1
	    if min_bookmark_level:
	        for i in range(len(bookmarks)):
	            if re.match('^BookmarkLevel: [0-9]+', bookmarks[i]):
	                bookmark_level = int(bookmarks[i].split(': ')[1])
	                bookmarks[i] = f'BookmarkLevel: {bookmark_level - min_bookmark_level + 1}'

	    return bookmarks

	def pdf_split_at_page(input_file: str, output_file1: str, output_file2: str, split_at_page: int) -> None:
	    """Split a PDF into two files at a page boundary using pdftk.

	    Pages ``1`` to ``split_at_page - 1`` are written to ``output_file1`` and
	    pages ``split_at_page`` to the end are written to ``output_file2``, each
	    with a ``pdftk <input_file> cat <range> output <file>`` call.

	    Args:
	        input_file: Path of the PDF to split.
	        output_file1: Path of the PDF receiving the pages before the split.
	        output_file2: Path of the PDF receiving the pages from the split on.
	        split_at_page: First page of the second output.

	    Raises:
	        ValueError: If ``split_at_page`` is smaller than 2, because the first
	            output would then contain no pages.
	        RuntimeError: If pdftk exits with a non-zero status.
	    """
	    # Reject the degenerate split up front: it would ask pdftk for the page
	    # range "1-0" and could only fail after pdftk had already been started.
	    if split_at_page < 2:
	        raise ValueError(f'Cannot split {input_file} at page {split_at_page}: the first output PDF would contain no pages')

	    page_ranges = (
	        (output_file1, f'1-{split_at_page - 1}'),
	        (output_file2, f'{split_at_page}-end'),
	    )
	    for output_file, page_range in page_ranges:
	        result = subprocess.run(['pdftk', input_file, 'cat', page_range, 'output', output_file], capture_output=True, text=True)
	        if result.returncode != 0:
	            print(result.stdout)
	            # capture_output=True also captures stderr; surface it so the
	            # reason for the failure is not lost.
	            if result.stderr:
	                print(result.stderr, file=sys.stderr)
	            raise RuntimeError(f'pdftk returned non-zero exit code {result.returncode}')
	        print(f'Created {output_file} containing pages {page_range} of {input_file}')

	def pdf_insert_bookmarks(input_file: str, bookmarks: str) -> None:
	    """Insert bookmark records into a PDF that does not have any yet.

	    The current metadata is read with ``pdftk dump_data``, the bookmark
	    records are placed in front of the first ``PageMediaBegin`` record, and
	    the result is applied with ``pdftk update_info``. The updated PDF is
	    written to a temporary file and then copied over ``input_file``.

	    Args:
	        input_file: Path of the PDF to modify in place.
	        bookmarks: Newline-separated bookmark records to insert.

	    Raises:
	        RuntimeError: If pdftk exits with a non-zero status.
	        ValueError: If the PDF already contains bookmarks.
	    """
	    # Obtain metadata for the PDF to be modified
	    input_metadata = subprocess.run(['pdftk', input_file, 'dump_data'], capture_output=True, text=True)
	    if input_metadata.returncode != 0:
	        print(input_metadata.stdout)
	        # capture_output=True also captures stderr; surface it so the reason
	        # for the failure is not lost.
	        if input_metadata.stderr:
	            print(input_metadata.stderr, file=sys.stderr)
	        raise RuntimeError(f'pdftk returned non-zero exit code {input_metadata.returncode}')

	    # Verify that there are no bookmarks in the input PDF
	    if 'BookmarkBegin' in input_metadata.stdout:
	        raise ValueError(f'Input PDF {input_file} already contains bookmarks')

	    # Split the metadata into the part before and the part from the first
	    # PageMediaBegin record on. partition() leaves `marker` empty when the dump
	    # has no such record, so that case no longer ends in an IndexError.
	    head, marker, tail = input_metadata.stdout.partition('PageMediaBegin')
	    sections = [head.rstrip('\n')]
	    if bookmarks:
	        sections.append(bookmarks)
	    if marker:
	        sections.append(marker + tail.rstrip('\n'))
	    # NOTE(review): without a PageMediaBegin record the bookmarks end up at the
	    # end of the info file; presumably pdftk accepts them there - TODO confirm.

	    with tempfile.NamedTemporaryFile(mode='w', delete=True) as temp_metadata, tempfile.NamedTemporaryFile(mode='w', delete=True) as temp_pdf:
	        temp_metadata.write('\n'.join(sections) + '\n')
	        # pdftk opens the file by name, so the buffered text must be on disk
	        temp_metadata.flush()

	        updated_pdf = subprocess.run(['pdftk', input_file, 'update_info', temp_metadata.name, 'output', temp_pdf.name], capture_output=True, text=True)
	        if updated_pdf.returncode != 0:
	            print(updated_pdf.stdout)
	            if updated_pdf.stderr:
	                print(updated_pdf.stderr, file=sys.stderr)
	            raise RuntimeError(f'pdftk returned non-zero exit code {updated_pdf.returncode}')

	        # Replace the original only after pdftk has succeeded
	        shutil.copy2(temp_pdf.name, input_file)
	        print(f'Inserted PDF bookmarks into {input_file}')

	def main() -> int:
	    """Run the complete split: locate the bookmark, split pages and bookmarks.

	    Returns:
	        ``0`` once both output PDFs have been created and their bookmarks
	        inserted; failures surface as exceptions from the called functions.
	    """
	    # Parse command line arguments
	    args = parse_args()

	    # Obtain metadata for the input PDF
	    input_pdf_metadata, split_at_page = pdf_detect_bookmark(input_file=args.input, bookmark_name=args.split_at_bookmark)

	    # Separate the bookmarks for the first and second output PDFs
	    if args.delete_bookmark:
	        bookmarks1, bookmarks2 = pdf_metadata_split_bookmarks(input_metadata=input_pdf_metadata, split_at_page=split_at_page, delete_bookmark=args.split_at_bookmark)
	    else:
	        bookmarks1, bookmarks2 = pdf_metadata_split_bookmarks(input_metadata=input_pdf_metadata, split_at_page=split_at_page)

	    # Split the PDF into two files
	    pdf_split_at_page(input_file=args.input, output_file1=args.output1, output_file2=args.output2, split_at_page=split_at_page)

	    # Insert the bookmarks into the output PDFs
	    pdf_insert_bookmarks(input_file=args.output1, bookmarks=bookmarks1)
	    pdf_insert_bookmarks(input_file=args.output2, bookmarks=bookmarks2)

	    return 0

	# Run the workflow only when this file is executed as a script, and use the
	# return value of main() as the process exit status.
	if __name__ == '__main__':
	    sys.exit(main())