Skip to content

Instantly share code, notes, and snippets.

@jin-zhe
Created March 10, 2020 13:03
Show Gist options
  • Save jin-zhe/2efc348f58002f54e1ed90ab5323e56a to your computer and use it in GitHub Desktop.
Save jin-zhe/2efc348f58002f54e1ed90ab5323e56a to your computer and use it in GitHub Desktop.
Python script to split a directory of pdfs into smaller pdfs
"""
Intended usage scenario:
You have a directory of pdfs, each comprising sequential image scans of
human-annotated documents (e.g. written questionnaires/forms/exams) where every
document shares the same number of pages. Each pdf may contain a different
number of such scanned documents. You want to split all these pdfs up into
smaller pdfs at fixed page index intervals such that each smaller pdf
corresponds to a single scanned document. In addition, you want to place them
under a specific output directory while ensuring no filename collisions.
"""
__author__ = "Jin Zhe"
from datetime import datetime
from tqdm import tqdm
import argparse
import PyPDF2
import sys
import os
def generate_unique_filename():
    """Build a pdf filename from the current timestamp.

    The stem is ``datetime.now()`` rendered as year, month, day, hour,
    minute, second and microsecond digits with no separators.

    Returns:
        str: The timestamp stem followed by the ``.pdf`` extension.
    """
    now = datetime.now()
    return "{}.pdf".format(now.strftime("%Y%m%d%H%M%S%f"))
def _chunk_page_indices(num_pages, page_stride, drop_last):
    """Group page indices 0..num_pages-1 into consecutive runs of page_stride.

    Args:
        num_pages: Total number of pages in the source pdf.
        page_stride: Number of pages per run; must be >= 1.
        drop_last: If true, discard a trailing run shorter than page_stride.

    Returns:
        list: One ``range`` of page indices per output pdf.
    """
    page_indices = range(num_pages)
    splits = [page_indices[i:i + page_stride] for i in range(0, num_pages, page_stride)]
    # `splits` is empty for a pdf with zero pages, so check it before indexing
    if drop_last and splits and len(splits[-1]) < page_stride:
        del splits[-1]
    return splits


def _open_new_output(output_dir):
    """Create a brand-new output pdf in output_dir and return it opened for binary writing."""
    while True:
        output_path = os.path.join(output_dir, generate_unique_filename())
        try:
            # Mode 'x' creates the file exclusively, so an existing file is
            # never overwritten and there is no gap between check and open.
            return open(output_path, 'xb')
        except FileExistsError:
            # The name is derived only from the current timestamp, so two
            # names generated within the same clock tick are identical.
            # Ask for a fresh name instead of aborting the whole run.
            continue


def _split_pdf(pdf_path, output_dir, page_stride, drop_last):
    """Write every page_stride-page run of the pdf at pdf_path as a new pdf in output_dir."""
    # NOTE(review): PdfFileReader/PdfFileWriter and the camelCase methods are
    # the legacy PyPDF2 names; later PyPDF2 releases reportedly rename them
    # (PdfReader, PdfWriter, add_page, ...) - confirm against the installed version.
    # `with` closes the source pdf even if reading or writing a split fails.
    with open(pdf_path, 'rb') as input_f:
        pdf_in = PyPDF2.PdfFileReader(input_f)
        splits = _chunk_page_indices(pdf_in.getNumPages(), page_stride, drop_last)
        print('Splitting {} into {} pdfs'.format(pdf_path, len(splits)))
        # For each split in current pdf
        for split in tqdm(splits):
            # Add each page to new pdf
            pdf_out = PyPDF2.PdfFileWriter()
            for page_index in split:
                pdf_out.addPage(pdf_in.getPage(page_index))
            # Write new pdf to file
            with _open_new_output(output_dir) as output_f:
                pdf_out.write(output_f)
        print()


def main():
    """Split every pdf in the input directory into fixed-size pdfs in the output directory.

    Command-line arguments:
        --input_dir / -in: Directory containing one or more pdfs (required).
        --output_dir / -out: Directory that receives the split pdfs
            (required); it is created if it does not exist yet.
        --page_stride / -s: Number of pages in each split pdf (default 1).
        --drop_last / -d: Ignore the last split of a pdf if it has fewer
            pages than the page stride.

    Exits through ``parser.error`` (SystemExit with status 2) when the page
    stride is not a positive integer.
    """
    parser = argparse.ArgumentParser(description="Splits every pdf from the input directory into the output directory.")
    parser.add_argument('--input_dir', '-in', type=str, required=True, help='The input directory containing one or more pdfs.')
    parser.add_argument('--output_dir', '-out', type=str, required=True, help='The output directory to store the split pdfs.')
    # Not `required`: combined with `default=1` that made the default unreachable.
    parser.add_argument('--page_stride', '-s', type=int, default=1, help='The number of pages in each split pdf.')
    parser.add_argument('--drop_last', '-d', action='store_true', default=False, help='Ignore last split if its page length is less than page stride')
    args = parser.parse_args()

    # A stride of 0 would make range() raise and a negative one produces no
    # splits at all, so reject both up front with a clear usage error.
    if args.page_stride < 1:
        parser.error('--page_stride must be a positive integer')

    input_dir = os.path.abspath(args.input_dir)
    output_dir = os.path.abspath(args.output_dir)

    # Match the extension case-insensitively so files named *.PDF are not
    # skipped; sort so the processing order does not depend on os.listdir.
    pdf_filenames = sorted(f for f in os.listdir(input_dir) if f.lower().endswith('.pdf'))

    os.makedirs(output_dir, exist_ok=True)

    # For each pdf in input directory to split
    for pdf_filename in pdf_filenames:
        _split_pdf(os.path.join(input_dir, pdf_filename), output_dir, args.page_stride, args.drop_last)
# Run the splitter only when this file is executed as a script, not on import
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment