jin-zhe/split_pdfs.py

## split_pdfs.py
"""
  Intended usage scenario:
  You have a directory of pdfs, each consisting of sequential image scans of
  human-annotated documents (e.g. written questionnaires/forms/exams) where
  every document shares the same number of pages. Each pdf may contain a
  different number of such scanned documents. You want to split all these pdfs
  up into smaller pdfs at fixed page index intervals such that each smaller pdf
  corresponds to a single scanned document. In addition, you want to place
  them under a specific output directory while ensuring no filename
  collisions.
"""
__author__ = "Jin Zhe"

from datetime import datetime
from tqdm import tqdm
import argparse
import PyPDF2
import sys
import os

def generate_unique_filename():
  """Return a timestamp-based pdf filename that is unique within this process.

  The name is the current local time formatted as ``%Y%m%d%H%M%S%f`` followed
  by ``.pdf``. A bare timestamp repeats whenever two calls land on the same
  clock tick (or the clock steps backwards), so the previously issued
  timestamp is remembered on the function object and the new one is nudged to
  one microsecond past it when the clock has not advanced.

  Returns:
    str: A filename of the form ``<timestamp>.pdf``.
  """
  # Local import: the file only brings `datetime` itself into scope.
  from datetime import timedelta

  timestamp = datetime.now()
  previous = getattr(generate_unique_filename, "_previous_timestamp", None)
  if previous is not None and timestamp <= previous:
    # Clock did not move forward since the last call: force a later value.
    timestamp = previous + timedelta(microseconds=1)
  generate_unique_filename._previous_timestamp = timestamp
  return "{}.pdf".format(timestamp.strftime("%Y%m%d%H%M%S%f"))

def _split_page_indices(num_pages, page_stride, drop_last):
  """Group page indices 0..num_pages-1 into consecutive runs of page_stride.

  Args:
    num_pages: Total number of pages in the source pdf.
    page_stride: Number of pages per run; must be at least 1.
    drop_last: When true, discard a trailing run shorter than page_stride.

  Returns:
    list: One ``range`` of page indices per output pdf, in page order.
  """
  splits = [range(start, min(start + page_stride, num_pages))
            for start in range(0, num_pages, page_stride)]
  # Check `splits` first so a pdf with zero pages does not index an empty list.
  if drop_last and splits and len(splits[-1]) < page_stride:
    splits.pop()
  return splits


def main():
  """Split every pdf in the input directory into fixed-size pdfs.

  Command-line arguments (read from ``sys.argv``):
    --input_dir / -in:   directory containing the pdfs to split (required).
    --output_dir / -out: directory receiving the split pdfs (required);
                         created if it does not exist yet.
    --page_stride / -s:  number of pages per split pdf (default 1).
    --drop_last / -d:    skip the final split of a pdf when it has fewer
                         pages than the stride.

  Raises:
    SystemExit: via argparse, when arguments are missing or the stride is
      smaller than 1.
    IOError: when a generated output filename already exists.
  """
  parser = argparse.ArgumentParser(description="Splits every pdf from the input directory into the output directory.")
  parser.add_argument('--input_dir', '-in', type=str, required=True, help='The input directory containing one or more pdfs.')
  parser.add_argument('--output_dir', '-out', type=str, required=True, help='The output directory to store the split pdfs.')
  # Not `required`: a required option can never fall back to its default.
  parser.add_argument('--page_stride', '-s', type=int, default=1, help='The number of pages in each split pdf.')
  parser.add_argument('--drop_last', '-d', action='store_true', default=False, help='Ignore last split if its page length is less than page stride')
  args = parser.parse_args()
  input_dir = os.path.abspath(args.input_dir)
  output_dir = os.path.abspath(args.output_dir)
  page_stride = args.page_stride
  drop_last = args.drop_last

  # Reject a non-positive stride up front instead of failing mid-run.
  if page_stride < 1:
    parser.error('--page_stride must be a positive integer, got {}'.format(page_stride))

  if not os.path.isdir(output_dir):
    os.makedirs(output_dir)

  # Case-insensitive match so '.PDF' scans are picked up; sorted so the
  # processing order does not depend on the order os.listdir returns.
  pdf_filenames = sorted(f for f in os.listdir(input_dir) if f.lower().endswith('.pdf'))

  # For each pdf in input directory to split
  for pdf_path in (os.path.join(input_dir, x) for x in pdf_filenames):
    # `with` keeps the source open while its pages are copied and closes it
    # even when a split fails.
    with open(pdf_path, 'rb') as input_f:
      pdf_in = PyPDF2.PdfFileReader(input_f)
      splits = _split_page_indices(pdf_in.getNumPages(), page_stride, drop_last)

      print('Splitting {} into {} pdfs'.format(pdf_path, len(splits)))

      # For each split in current pdf
      for split in tqdm(splits):
        # Determine filename for new pdf
        output_path = os.path.join(output_dir, generate_unique_filename())
        if os.path.exists(output_path):
          raise IOError("File {} already exists!".format(output_path))

        # Gather the pages before creating the file, so a failure while
        # reading pages does not leave an empty pdf behind.
        pdf_out = PyPDF2.PdfFileWriter()
        for page_index in split:
          pdf_out.addPage(pdf_in.getPage(page_index))

        # Write new pdf to file
        with open(output_path, 'wb') as output_f:
          pdf_out.write(output_f)
      print()

# Run the splitter only when this file is executed as a script.
if __name__ == "__main__":
  main()
	"""
	Intended usage scenario:
	You have a directory of pdfs, each consisting of sequential image scans of
	human-annotated documents (e.g. written questionnaires/forms/exams) where
	every document shares the same number of pages. Each pdf may contain a
	different number of such scanned documents. You want to split all these pdfs
	up into smaller pdfs at fixed page index intervals such that each smaller pdf
	corresponds to a single scanned document. In addition, you want to place
	them under a specific output directory while ensuring no filename
	collisions.
	"""
	__author__ = "Jin Zhe"

	from datetime import datetime
	from tqdm import tqdm
	import argparse
	import PyPDF2
	import sys
	import os

	# Generates unique filename for each pdf split
	def generate_unique_filename():
		"""Return a timestamp-based pdf filename that is unique within this process.

		The name is the current local time formatted as ``%Y%m%d%H%M%S%f``
		followed by ``.pdf``. A bare timestamp repeats whenever two calls land on
		the same clock tick (or the clock steps backwards), so the previously
		issued timestamp is remembered on the function object and the new one is
		nudged to one microsecond past it when the clock has not advanced.

		Returns:
			str: A filename of the form ``<timestamp>.pdf``.
		"""
		# Local import: the file only brings `datetime` itself into scope.
		from datetime import timedelta

		timestamp = datetime.now()
		previous = getattr(generate_unique_filename, "_previous_timestamp", None)
		if previous is not None and timestamp <= previous:
			# Clock did not move forward since the last call: force a later value.
			timestamp = previous + timedelta(microseconds=1)
		generate_unique_filename._previous_timestamp = timestamp
		return "{}.pdf".format(timestamp.strftime("%Y%m%d%H%M%S%f"))

	def _split_page_indices(num_pages, page_stride, drop_last):
		"""Group page indices 0..num_pages-1 into consecutive runs of page_stride.

		Args:
			num_pages: Total number of pages in the source pdf.
			page_stride: Number of pages per run; must be at least 1.
			drop_last: When true, discard a trailing run shorter than page_stride.

		Returns:
			list: One ``range`` of page indices per output pdf, in page order.
		"""
		splits = [range(start, min(start + page_stride, num_pages))
			for start in range(0, num_pages, page_stride)]
		# Check `splits` first so a pdf with zero pages does not index an empty list.
		if drop_last and splits and len(splits[-1]) < page_stride:
			splits.pop()
		return splits


	def main():
		"""Split every pdf in the input directory into fixed-size pdfs.

		Command-line arguments (read from ``sys.argv``):
			--input_dir / -in:   directory containing the pdfs to split (required).
			--output_dir / -out: directory receiving the split pdfs (required);
			                     created if it does not exist yet.
			--page_stride / -s:  number of pages per split pdf (default 1).
			--drop_last / -d:    skip the final split of a pdf when it has fewer
			                     pages than the stride.

		Raises:
			SystemExit: via argparse, when arguments are missing or the stride is
				smaller than 1.
			IOError: when a generated output filename already exists.
		"""
		parser = argparse.ArgumentParser(description="Splits every pdf from the input directory into the output directory.")
		parser.add_argument('--input_dir', '-in', type=str, required=True, help='The input directory containing one or more pdfs.')
		parser.add_argument('--output_dir', '-out', type=str, required=True, help='The output directory to store the split pdfs.')
		# Not `required`: a required option can never fall back to its default.
		parser.add_argument('--page_stride', '-s', type=int, default=1, help='The number of pages in each split pdf.')
		parser.add_argument('--drop_last', '-d', action='store_true', default=False, help='Ignore last split if its page length is less than page stride')
		args = parser.parse_args()
		input_dir = os.path.abspath(args.input_dir)
		output_dir = os.path.abspath(args.output_dir)
		page_stride = args.page_stride
		drop_last = args.drop_last

		# Reject a non-positive stride up front instead of failing mid-run.
		if page_stride < 1:
			parser.error('--page_stride must be a positive integer, got {}'.format(page_stride))

		if not os.path.isdir(output_dir):
			os.makedirs(output_dir)

		# Case-insensitive match so '.PDF' scans are picked up; sorted so the
		# processing order does not depend on the order os.listdir returns.
		pdf_filenames = sorted(f for f in os.listdir(input_dir) if f.lower().endswith('.pdf'))

		# For each pdf in input directory to split
		for pdf_path in (os.path.join(input_dir, x) for x in pdf_filenames):
			# `with` keeps the source open while its pages are copied and closes
			# it even when a split fails.
			with open(pdf_path, 'rb') as input_f:
				pdf_in = PyPDF2.PdfFileReader(input_f)
				splits = _split_page_indices(pdf_in.getNumPages(), page_stride, drop_last)

				print('Splitting {} into {} pdfs'.format(pdf_path, len(splits)))

				# For each split in current pdf
				for split in tqdm(splits):
					# Determine filename for new pdf
					output_path = os.path.join(output_dir, generate_unique_filename())
					if os.path.exists(output_path):
						raise IOError("File {} already exists!".format(output_path))

					# Gather the pages before creating the file, so a failure
					# while reading pages does not leave an empty pdf behind.
					pdf_out = PyPDF2.PdfFileWriter()
					for page_index in split:
						pdf_out.addPage(pdf_in.getPage(page_index))

					# Write new pdf to file
					with open(output_path, 'wb') as output_f:
						pdf_out.write(output_f)
				print()

	# Run the splitter only when this file is executed as a script.
	if __name__ == "__main__":
		main()