Skip to content

Instantly share code, notes, and snippets.

@sgrontflix
Last active August 18, 2022 16:22
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sgrontflix/e31a32721533fb07e821aba9440d813a to your computer and use it in GitHub Desktop.
Save sgrontflix/e31a32721533fb07e821aba9440d813a to your computer and use it in GitHub Desktop.
Simple script that allows you to merge PDF files. You can also specify which pages to consider when merging.
import argparse
import re
from pathlib import Path
from PyPDF2 import PdfMerger
from PyPDF2.errors import PyPdfError
def parse_pages(pages_list, files):
    """Parse per-file page selections into ranges usable by the merger.

    Each entry of ``pages_list`` is paired positionally with an entry of
    ``files``. An entry is either ``all`` or a parenthesised, space-separated
    list of tokens, where a token is a single page index ``x``, a half-open
    range ``x,y`` (``y`` excluded) or ``all``; e.g. ``(0 2,5)``.

    Args:
        pages_list: Selection strings, or a falsy value to select every page
            of every file.
        files: File names the selections refer to (used for pairing and in
            error messages).

    Returns:
        A list with one list per file containing ``(start, stop)`` tuples
        and/or ``None`` (``None`` means "all pages"), e.g.
        ``[[(0, 1), (2, 5)], [None]]``. Returns ``None`` after printing a
        message when a selection is malformed or a range has
        ``start >= stop``.
    """
    # no ranges => select all pages
    # "None" means "all pages" in pypdf2
    if not pages_list:
        # build independent inner lists so callers can mutate one safely
        return [[None] for _ in files]

    # a token is a page index, a "start,stop" pair or the word "all"
    token = r'(?:-?\d+(?:,-?\d+)?|all)'
    selection = re.compile(rf'\((?:{token} )*{token}\)|all')

    # will contain lists of ranges
    # where each list is associated with a file
    # e.g. [ [(0, 1), (2, 4)], [(1, 3)] ]
    parsed_pages = []
    for file, pages in zip(files, pages_list):
        # fullmatch anchors BOTH ends: trailing junk such as "(1)x" or
        # "(1)(2)" must be rejected here instead of crashing or being
        # misread further down
        if not selection.fullmatch(pages):
            print(f'Syntax error: "{pages}".')
            return None

        # will contain parsed ranges for the current file
        # e.g. [ (0, 1), (2, 4) ]
        file_pages = []
        for p in pages.strip('()').split(' '):
            if p == 'all':
                file_pages.append(None)
                continue

            # we can either have a single page number (x) or a range (x,y)
            start, comma, stop = p.partition(',')
            if not comma:
                page = int(start)
                file_pages.append((page, page + 1))
                continue

            page_range = (int(start), int(stop))
            if page_range[0] >= page_range[1]:
                print(f'Start page is greater than or equal to final page in "{p}" for file "{file}".')
                return None
            file_pages.append(page_range)

        parsed_pages.append(file_pages)

    return parsed_pages
def parse_arguments():
    """Parse the command line and normalise it for merging.

    Returns:
        A ``(files, pages_list, outfile)`` tuple where

        * ``files`` is the list of input paths as given,
        * ``pages_list`` is the result of ``parse_pages`` padded to one entry
          per file (``None`` if a selection was rejected),
        * ``outfile`` is the output name with a ``.pdf`` suffix, or ``None``
          when it refers to one of the input files.
    """
    parser = argparse.ArgumentParser(
        description='Simple PDF merger',
        usage='pdf_merge.py [-h] [-p [PAGES_LIST ...]] '
              '[-o OUTFILE] file1 [other_files ...]',
        # keep the hand-placed line breaks of the help texts below
        formatter_class=argparse.RawTextHelpFormatter,
    )
    parser.add_argument('files', nargs='+', help='List of files to merge\n'
                                                 'You can specify a single file '
                                                 'if you only want to extract certain pages')
    parser.add_argument('-p', '--pages-list', nargs='*', help='Pages to consider when merging:\n'
                                                              '"(0,2)" => first two pages\n'
                                                              '"(0 2,5)" => first page + pages from third to fifth\n'
                                                              '"(-1)" => last page\n'
                                                              '"all" => the whole document')
    parser.add_argument('-o', '--outfile', default='merged.pdf', help='Name of output file')

    args = parser.parse_args()
    files, raw_pages, outfile = args.files, args.pages_list, args.outfile

    if raw_pages and len(raw_pages) > len(files):
        print(f'Number of ranges is greater than number of files. '
              f'Last {len(raw_pages)-len(files)} range(s) will be ignored.')
        pages_list = parse_pages(raw_pages[:len(files)], files)
    elif raw_pages and len(raw_pages) < len(files):
        print(f'Number of ranges is less than number of files. '
              f'Last {len(files)-len(raw_pages)} file(s) will be treated in full.')
        pages_list = parse_pages(raw_pages, files)
        # parse_pages returns None on error; only pad a successful result
        if pages_list:
            pages_list.extend([None] for _ in range(len(files) - len(pages_list)))
    else:
        pages_list = parse_pages(raw_pages, files)

    # accept any casing of the extension so "OUT.PDF" is not turned into
    # "OUT.PDF.pdf"
    if not outfile.lower().endswith('.pdf'):
        outfile = outfile + '.pdf'

    # compare resolved paths, not raw strings: "./a.pdf" and "a.pdf" are the
    # same file and writing to it would clobber an input being merged
    if any(_is_same_path(outfile, file) for file in files):
        print(f'Output file name ({outfile}) is the same as one of the specified files.')
        outfile = None

    return files, pages_list, outfile


def _is_same_path(first, second):
    """Return True when both path strings point at the same location."""
    if first == second:
        return True
    try:
        return Path(first).resolve() == Path(second).resolve()
    except (OSError, RuntimeError, ValueError):
        # unresolvable path (e.g. symlink loop, invalid characters):
        # the plain string comparison above is the best available answer
        return False
def main():
    """Merge the files named on the command line into a single PDF.

    Obtains the inputs from ``parse_arguments``, appends every requested page
    selection of every input file to a ``PdfMerger`` and writes the result to
    the output file. Each handled failure is reported with ``print`` and ends
    the function early without writing anything.
    """
    files, pages_list, outfile = parse_arguments()

    # parse_arguments hands back None for either value on invalid input
    if not pages_list or not outfile:
        return

    with PdfMerger() as merger:
        for file, pages in zip(files, pages_list):
            if not file or not Path(file).is_file():
                print(f'Invalid path: "{file}".')
                return

            # one append per selection; p is a (start, stop) tuple or None
            # (None = whole document)
            for p in pages:
                try:
                    merger.append(file, pages=p)
                except PyPdfError:
                    print(f'Invalid file: "{file}".')
                    return
                except IndexError:
                    print(f'Page range "{p}" out of bounds for file "{file}".')
                    return
                except OSError:
                    # the path exists but could not be opened/read
                    # (e.g. missing read permission)
                    print(f'Couldn\'t read file: "{file}".')
                    return

        try:
            merger.write(outfile)
            print(f'Files successfully merged into "{outfile}".')
        except PyPdfError:
            print('Couldn\'t merge files.')
        except PermissionError:
            # must precede OSError: PermissionError is a subclass of it
            print('Couldn\'t write to output file: permission denied.')
        except OSError:
            print(f'Invalid output file name: "{outfile}".')
# Run the merger only when this file is executed as a script, so that
# importing the module does not trigger argument parsing or file I/O.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment