# kylepls/pdf_merge.py

## pdf_merge.py
import os
import re
import sys

import PyPDF2

# Oldest interpreter version this script accepts; the entry-point guard
# compares it against sys.version_info and aborts on anything older.
MIN_PYTHON = (3, 0)


"""
Usage: python pdf_merge.py [inputs] <output>
Example: python pdf_merge.py *.pdf merged.pdf
"""


def natural_sort_key(s: str):
    """
    Build a sort key that orders strings naturally, so "file2" sorts before "file10".

    The string is split on runs of ASCII digits. Digit runs are converted to
    ``int`` and every other fragment is lower-cased, which makes the ordering
    numeric where it matters and case-insensitive elsewhere.

    :param s: the string (typically a file name) to build a key for
    :return: a list alternating ``str`` and ``int`` items, always starting with
        a (possibly empty) ``str``, so two keys compare position by position
        without ever mixing types
    """
    # Splitting on a capturing group keeps the digit runs in the result, at the
    # odd indices. Deciding by position instead of str.isdigit() matters:
    # isdigit() is also true for characters such as "²" that int() rejects, and
    # for non-ASCII digits that would put an int where other keys hold a str.
    fragments = re.split(r'([0-9]+)', s)
    return [
        int(fragment) if index % 2 else fragment.lower()
        for index, fragment in enumerate(fragments)
    ]


def get_pdf_title(pdf_file_path: str) -> str:
    """
    Return a human-readable title for a PDF file.

    The title stored in the document's metadata is preferred. When the PDF has
    no metadata, or the stored title is missing or empty, the title is derived
    from the file name: a trailing ".pdf" is dropped and underscores become
    spaces.

    :param pdf_file_path: path of the PDF to inspect
    :return: the metadata title, or a title derived from the file name
    """
    # Read the metadata while the file is still open; the with-block then
    # closes the handle, which used to be left open.
    with open(pdf_file_path, 'rb') as pdf_file:
        document_info = PyPDF2.PdfFileReader(pdf_file).getDocumentInfo()
        # getDocumentInfo() yields None for a PDF without an info dictionary.
        pdf_title = document_info.title if document_info is not None else None

    if not pdf_title:
        file_name = os.path.basename(pdf_file_path)
        # Strip the extension only at the end, not a ".pdf" inside the name.
        if file_name.endswith(".pdf"):
            file_name = file_name[:-len(".pdf")]
        pdf_title = file_name.replace("_", " ")
    return pdf_title


def merge_pdfs(pdfs: [str], target_file: str) -> None:
    """
    Merge the given PDFs into a single file with one bookmark per input.

    Inputs are appended in natural order of their paths (see
    ``natural_sort_key``) and each one is bookmarked with the title returned
    by ``get_pdf_title``. An existing file at ``target_file`` is removed before
    the merge starts. Errors from opening files or from PyPDF2 are not caught
    here and propagate to the caller.

    :param pdfs: paths of the PDF files to merge; the list is not modified
    :param target_file: path the merged PDF is written to
    """
    # Sort a copy so the caller's list keeps its original order.
    ordered = sorted(pdfs, key=natural_sort_key)

    if os.path.exists(target_file):
        os.remove(target_file)

    merger = PyPDF2.PdfFileMerger()
    handles = []
    try:
        for pdf in ordered:
            title = get_pdf_title(pdf)
            # The inputs must stay open until the merged file has been
            # written, so keep the handles and close them all at the end.
            handle = open(pdf, 'rb')
            handles.append(handle)
            merger.append(handle, bookmark=title)

        with open(target_file, 'wb') as out:
            merger.write(out)
    finally:
        # Release everything on failure as well as on success.
        merger.close()
        for handle in handles:
            handle.close()


if __name__ == '__main__':
    # Command-line entry point: validate the arguments, then merge.
    # Failures go through sys.exit(message), which writes the message to
    # stderr and exits with status 1. The interactive-only exit() helper is
    # avoided because it only exists when the site module has been loaded.
    if sys.version_info < MIN_PYTHON:
        sys.exit('Python %s.%s or later is required.' % MIN_PYTHON)

    # At least one source plus the target must be given.
    if len(sys.argv) < 3:
        sys.exit('Usage: %s [sources] <target>' % sys.argv[0])

    # Every argument but the last is an input; the last one is the output.
    sources = sys.argv[1:-1]
    non_pdfs = [pdf for pdf in sources if not pdf.endswith('.pdf')]
    if non_pdfs:
        sys.exit('One or more input files are not a PDF: %s' % (non_pdfs,))

    not_exists = [pdf for pdf in sources if not os.path.exists(pdf)]
    if not_exists:
        sys.exit('One or more input files do not exist: %s' % (not_exists,))

    target_file = sys.argv[-1]
    if not target_file.endswith('.pdf'):
        sys.exit('Target file must have a .pdf extension')

    merge_pdfs(sources, target_file)
    print('Wrote to', target_file)
    sys.exit(0)
	import os
	import re
	import sys

	import PyPDF2

	# Oldest interpreter version this script accepts; the entry-point guard
	# compares it against sys.version_info and aborts on anything older.
	MIN_PYTHON = (3, 0)


	"""
	Usage: python pdf_merge.py [inputs] <output>
	Example: python pdf_merge.py *.pdf merged.pdf
	"""


	def natural_sort_key(s: str):
		"""
		Build a sort key that orders strings naturally, so "file2" sorts before "file10".

		The string is split on runs of ASCII digits. Digit runs are converted to
		``int`` and every other fragment is lower-cased, which makes the ordering
		numeric where it matters and case-insensitive elsewhere.

		:param s: the string (typically a file name) to build a key for
		:return: a list alternating ``str`` and ``int`` items, always starting with
			a (possibly empty) ``str``, so two keys compare position by position
			without ever mixing types
		"""
		# Splitting on a capturing group keeps the digit runs in the result, at the
		# odd indices. Deciding by position instead of str.isdigit() matters:
		# isdigit() is also true for characters such as "²" that int() rejects, and
		# for non-ASCII digits that would put an int where other keys hold a str.
		fragments = re.split(r'([0-9]+)', s)
		return [
			int(fragment) if index % 2 else fragment.lower()
			for index, fragment in enumerate(fragments)
		]


	def get_pdf_title(pdf_file_path: str) -> str:
		"""
		Return a human-readable title for a PDF file.

		The title stored in the document's metadata is preferred. When the PDF has
		no metadata, or the stored title is missing or empty, the title is derived
		from the file name: a trailing ".pdf" is dropped and underscores become
		spaces.

		:param pdf_file_path: path of the PDF to inspect
		:return: the metadata title, or a title derived from the file name
		"""
		# Read the metadata while the file is still open; the with-block then
		# closes the handle, which used to be left open.
		with open(pdf_file_path, 'rb') as pdf_file:
			document_info = PyPDF2.PdfFileReader(pdf_file).getDocumentInfo()
			# getDocumentInfo() yields None for a PDF without an info dictionary.
			pdf_title = document_info.title if document_info is not None else None

		if not pdf_title:
			file_name = os.path.basename(pdf_file_path)
			# Strip the extension only at the end, not a ".pdf" inside the name.
			if file_name.endswith(".pdf"):
				file_name = file_name[:-len(".pdf")]
			pdf_title = file_name.replace("_", " ")
		return pdf_title


	def merge_pdfs(pdfs: [str], target_file: str) -> None:
		"""
		Merge the given PDFs into a single file with one bookmark per input.

		Inputs are appended in natural order of their paths (see
		``natural_sort_key``) and each one is bookmarked with the title returned
		by ``get_pdf_title``. An existing file at ``target_file`` is removed before
		the merge starts. Errors from opening files or from PyPDF2 are not caught
		here and propagate to the caller.

		:param pdfs: paths of the PDF files to merge; the list is not modified
		:param target_file: path the merged PDF is written to
		"""
		# Sort a copy so the caller's list keeps its original order.
		ordered = sorted(pdfs, key=natural_sort_key)

		if os.path.exists(target_file):
			os.remove(target_file)

		merger = PyPDF2.PdfFileMerger()
		handles = []
		try:
			for pdf in ordered:
				title = get_pdf_title(pdf)
				# The inputs must stay open until the merged file has been
				# written, so keep the handles and close them all at the end.
				handle = open(pdf, 'rb')
				handles.append(handle)
				merger.append(handle, bookmark=title)

			with open(target_file, 'wb') as out:
				merger.write(out)
		finally:
			# Release everything on failure as well as on success.
			merger.close()
			for handle in handles:
				handle.close()


	if __name__ == '__main__':
		# Command-line entry point: validate the arguments, then merge.
		# Failures go through sys.exit(message), which writes the message to
		# stderr and exits with status 1. The interactive-only exit() helper is
		# avoided because it only exists when the site module has been loaded.
		if sys.version_info < MIN_PYTHON:
			sys.exit('Python %s.%s or later is required.' % MIN_PYTHON)

		# At least one source plus the target must be given.
		if len(sys.argv) < 3:
			sys.exit('Usage: %s [sources] <target>' % sys.argv[0])

		# Every argument but the last is an input; the last one is the output.
		sources = sys.argv[1:-1]
		non_pdfs = [pdf for pdf in sources if not pdf.endswith('.pdf')]
		if non_pdfs:
			sys.exit('One or more input files are not a PDF: %s' % (non_pdfs,))

		not_exists = [pdf for pdf in sources if not os.path.exists(pdf)]
		if not_exists:
			sys.exit('One or more input files do not exist: %s' % (not_exists,))

		target_file = sys.argv[-1]
		if not target_file.endswith('.pdf'):
			sys.exit('Target file must have a .pdf extension')

		merge_pdfs(sources, target_file)
		print('Wrote to', target_file)
		sys.exit(0)