Skip to content

Instantly share code, notes, and snippets.

@kylepls
Created November 15, 2020 23:47
Show Gist options
  • Save kylepls/b80ecc9107dc025b63a69e03dbf03dbb to your computer and use it in GitHub Desktop.
Save kylepls/b80ecc9107dc025b63a69e03dbf03dbb to your computer and use it in GitHub Desktop.
Merges several input PDFs into a single large PDF, with a bookmark for each individual PDF.
import os
import re
import sys
import PyPDF2
# Oldest interpreter version accepted; compared against sys.version_info
# when the file is run as a script.
MIN_PYTHON = (3, 0)
# Command-line usage notes. This is a bare string literal (it follows the
# imports, so it is not the module docstring) kept as in-file documentation.
"""
Usage: python pdf_merge.py [inputs] <output>
Example: python pdf_merge.py *.pdf merged.pdf
"""
# Splits a string around runs of ASCII digits; the capturing group keeps the
# digit runs themselves in the split result.
_DIGIT_RUNS = re.compile(r'([0-9]+)')


def natural_sort_key(s: str):
    """
    Convert a string into a sort key that respects natural ordering.
    I.e.: 1 comes before 10

    The string is split around runs of ASCII digits. Each digit run becomes an
    int and the text between runs is lower-cased, so ordering is numeric for
    the numbers and case-insensitive for the text.

    :param s: the string to build a key for
    :return: a list alternating lower-cased text (even indices) and ints (odd indices)
    """
    # With exactly one capturing group, split() always yields
    # text, digits, text, ... so the position alone identifies a digit run.
    # Testing str.isdigit() instead also matches non-ASCII digits (for example
    # superscript two), which either crash int() or put an int where other
    # keys hold text, making the keys incomparable.
    return [
        int(part) if index % 2 else part.lower()
        for index, part in enumerate(_DIGIT_RUNS.split(s))
    ]
def get_pdf_title(pdf_file_path: str) -> str:
    """
    Return a display title for a PDF file.

    The title stored in the PDF's document information is used when there is
    one. Otherwise the title is derived from the file name: a trailing ".pdf"
    extension is dropped and underscores become spaces.

    :param pdf_file_path: path of the PDF file to inspect
    :return: the title to show for this file
    """
    # The reader pulls data from the stream on demand, so the metadata has to
    # be read while the file is still open; the `with` closes it afterwards.
    with open(pdf_file_path, 'rb') as pdf_stream:
        document_info = PyPDF2.PdfFileReader(pdf_stream).getDocumentInfo()
        # getDocumentInfo() returns None for a PDF without an info dictionary.
        pdf_title = document_info.title if document_info is not None else None

    # A missing title and an empty one are equally useless as a bookmark name.
    if not pdf_title:
        file_name = os.path.basename(pdf_file_path)
        stem, extension = os.path.splitext(file_name)
        # Strip only a real trailing extension, not ".pdf" in the middle of a name.
        if extension.lower() == '.pdf':
            file_name = stem
        pdf_title = file_name.replace("_", " ")
    return pdf_title
def merge_pdfs(pdfs: [str], target_file: str) -> None:
    """
    Merge the given PDFs into one file, adding a bookmark per input PDF.

    Inputs are merged in natural sort order of their paths; the caller's list
    is left untouched. Each bookmark is named by get_pdf_title().

    :param pdfs: paths of the PDF files to merge
    :param target_file: path of the merged PDF to write (overwritten if present)
    :raises ValueError: if target_file is also one of the inputs
    """
    # Writing the output over a file that is still being read would corrupt it.
    target_path = os.path.abspath(target_file)
    if any(os.path.abspath(pdf) == target_path for pdf in pdfs):
        raise ValueError('Target file must not be one of the input files: %s' % target_file)

    # sorted() rather than list.sort() so the caller's list is not reordered.
    ordered_pdfs = sorted(pdfs, key=natural_sort_key)
    titles = [get_pdf_title(pdf) for pdf in ordered_pdfs]

    merger = PyPDF2.PdfFileMerger()
    input_streams = []
    try:
        for pdf, title in zip(ordered_pdfs, titles):
            stream = open(pdf, 'rb')
            # Tracked so every input is closed even if a later step fails.
            input_streams.append(stream)
            merger.append(stream, bookmark=title)
        # Mode 'wb' truncates an existing target, so it is not deleted up
        # front; a failure while reading inputs leaves the old file intact.
        with open(target_file, 'wb') as out:
            merger.write(out)
    finally:
        merger.close()
        for stream in input_streams:
            stream.close()
if __name__ == '__main__':
    # Command-line entry point: validate the arguments, then merge.
    # sys.exit() is used throughout because the bare exit() helper is added by
    # the site module for interactive use and is not always available.
    # Passing a string to sys.exit() prints it to stderr and exits with status 1.
    if sys.version_info < MIN_PYTHON:
        sys.exit('Python %s.%s or later is required.' % MIN_PYTHON)

    # At least one source and the target are required.
    if len(sys.argv) < 3:
        sys.exit('Usage: %s [sources] <target>' % sys.argv[0])

    # Everything but the last argument is an input; the last is the output.
    sources = sys.argv[1:-1]
    target_file = sys.argv[-1]

    non_pdfs = [pdf for pdf in sources if not pdf.endswith('.pdf')]
    if non_pdfs:
        sys.exit('One or more input files are not a PDF: %s' % non_pdfs)

    not_exists = [pdf for pdf in sources if not os.path.exists(pdf)]
    if not_exists:
        sys.exit('One or more input files do not exist: %s' % not_exists)

    if not target_file.endswith('.pdf'):
        sys.exit('Target file must have a .pdf extension')

    merge_pdfs(sources, target_file)
    print('Wrote to', target_file)
    sys.exit(0)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment