Created
November 15, 2020 23:47
-
-
Save kylepls/b80ecc9107dc025b63a69e03dbf03dbb to your computer and use it in GitHub Desktop.
Merges several input PDFs into a single large PDF, with a bookmark for each individual input PDF.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import re | |
import sys | |
import PyPDF2 | |
MIN_PYTHON = (3, 0) | |
""" | |
Usage: python pdf_merge.py [inputs] <output> | |
Example: python pdf_merge.py *.pdf merged.pdf | |
""" | |
def natural_sort_key(s: str) -> list:
    """
    Convert a string into a sortable entry that respects natural ordering.

    I.e.: "file2" comes before "file10".

    The string is split on runs of ASCII digits; digit runs become ints and
    everything else is lower-cased so the comparison is case-insensitive.

    :param s: the string (here: a file path) to build a key for
    :return: a list alternating lower-cased text and ints, always starting
        with a (possibly empty) text element
    """
    # Splitting on a capturing group always yields text at even indices and
    # the captured digit runs at odd indices. Deciding by position instead of
    # str.isdigit() keeps non-ASCII digit characters (e.g. a superscript two)
    # as text: int() cannot parse some of them, and converting the others
    # would put an int where other keys have a str, breaking comparison.
    parts = re.split(r'([0-9]+)', s)
    return [int(part) if index % 2 else part.lower()
            for index, part in enumerate(parts)]
def get_pdf_title(pdf_file_path: str) -> str:
    """
    Return the title to use as the bookmark for a PDF.

    The title stored in the PDF's document information is preferred. When the
    PDF has none, the title is derived from the file name: underscores become
    spaces and ".pdf" is removed.

    :param pdf_file_path: path of the PDF file to inspect
    :return: the document title, or a title derived from the file name
    """
    # Read the metadata while the file is still open and close it afterwards,
    # rather than leaking one open handle per input.
    with open(pdf_file_path, 'rb') as pdf_file:
        pdf_info = PyPDF2.PdfFileReader(pdf_file).getDocumentInfo()
        # getDocumentInfo() returns None for a PDF without an info dictionary.
        pdf_title = pdf_info.title if pdf_info is not None else None
    if pdf_title is None:
        pdf_title = os.path.basename(pdf_file_path) \
            .replace("_", " ") \
            .replace(".pdf", "")
    return pdf_title
def merge_pdfs(pdfs: [str], target_file: str) -> None:
    """
    Merge the given PDFs into ``target_file``, one bookmark per input PDF.

    Inputs are merged in natural sort order of their paths. The caller's list
    is left untouched. An existing ``target_file`` is overwritten, but only
    once every input has been read and appended, so a failing input does not
    destroy the previous output.

    :param pdfs: paths of the PDFs to merge
    :param target_file: path of the merged PDF to write
    :raises ValueError: if ``target_file`` is also one of the inputs
    """
    ordered = sorted(pdfs, key=natural_sort_key)

    # Writing the output would truncate an input that is still being read.
    target_path = os.path.abspath(target_file)
    if any(os.path.abspath(pdf) == target_path for pdf in ordered):
        raise ValueError('Target file is also an input file: %s' % target_file)

    titles = [get_pdf_title(pdf) for pdf in ordered]

    merger = PyPDF2.PdfFileMerger()
    handles = []
    try:
        for pdf, title in zip(ordered, titles):
            # The inputs must stay open until the merged file is written.
            handle = open(pdf, 'rb')
            handles.append(handle)
            merger.append(handle, bookmark=title)
        # Mode 'wb' truncates an existing target, so no prior delete is needed.
        with open(target_file, 'wb') as out:
            merger.write(out)
    finally:
        merger.close()
        for handle in handles:
            handle.close()
if __name__ == '__main__':
    # Command-line entry point: every argument but the last is an input PDF,
    # the last argument is the output PDF. Problems are reported on stderr
    # with exit status 1. sys.exit is used instead of the exit() helper,
    # which the site module provides for interactive use and which is not
    # guaranteed to exist.
    if sys.version_info < MIN_PYTHON:
        sys.exit('Python %s.%s or later is required.' % MIN_PYTHON)

    if len(sys.argv) < 3:
        print('Usage: %s [sources] <target>' % sys.argv[0], file=sys.stderr)
        sys.exit(1)

    sources = sys.argv[1:-1]
    target_file = sys.argv[-1]

    non_pdfs = [pdf for pdf in sources if not pdf.endswith('.pdf')]
    if non_pdfs:
        print('One or more input files are not a PDF:', non_pdfs, file=sys.stderr)
        sys.exit(1)

    not_exists = [pdf for pdf in sources if not os.path.exists(pdf)]
    if not_exists:
        print('One or more input files do not exist:', not_exists, file=sys.stderr)
        sys.exit(1)

    if not target_file.endswith('.pdf'):
        print('Target file must have a .pdf extension', file=sys.stderr)
        sys.exit(1)

    # A glob such as *.pdf picks up the output of a previous run; merging a
    # file into itself would clobber that input.
    target_path = os.path.abspath(target_file)
    if any(os.path.abspath(pdf) == target_path for pdf in sources):
        print('Target file must not be one of the input files:', target_file,
              file=sys.stderr)
        sys.exit(1)

    merge_pdfs(sources, target_file)
    print('Wrote to', target_file)
    sys.exit(0)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment