Skip to content

Instantly share code, notes, and snippets.

@alvinwan
Created April 8, 2017 07:16
Show Gist options
  • Save alvinwan/0181ccfea15a321b4cba7bea23669285 to your computer and use it in GitHub Desktop.
Save alvinwan/0181ccfea15a321b4cba7bea23669285 to your computer and use it in GitHub Desktop.
Splitting PDFs programmatically using a list of set ranges
"""Demonstration of provided split utility.
Divides a PDF into several chunks and removes selected pages from each chunk.
"""
from splits import splits
# Path of the PDF that main() hands to splits() as its input.
INPUT_FILE = "./merged.pdf"
# Directory that main() hands to splits() as the output location.
OUTPUT_DIRECTORY = "./out"
# Comma-separated "start-end" page ranges, passed to splits() as its sections
# argument; each range becomes one output chunk.
PAGE_SPLITS = "1-10,11-20,21-30,31-42,43-52,53-64,65-74,75-86,87-98,99-110,111-122,123-134,135-146,147-158,159-168,169-180,181-192,193-204,205-216,217-228,229-240,241-250,251-262,263-274,275-284,285-296,297-308,309-320,321-332,333-344,345-356,357-368,369-380,381-392,393-404,405-416"
def filter_pages_6_7(j: int, start: int, end: int) -> bool:
    """Decide whether page ``j`` should be left out of its section.

    Only sections longer than 10 pages are filtered: in those, the 6th and
    7th pages of the section (counting from 1) are dropped. Sections of 10
    pages or fewer keep every page.

    :param j: index of the page in the input PDF; splits() passes a
        0-based index here
    :param start: 1-based first page of the section (inclusive)
    :param end: 1-based last page of the section (inclusive)
    :return: True if the page should be skipped, False to keep it
    """
    # j is 0-based while start is 1-based, hence +2 (not +1) to obtain the
    # page's 1-based position inside the section.
    page_index = j - start + 2
    num_pages = end - start + 1
    return num_pages > 10 and page_index in (6, 7)
def main():
    """Run the demo: split INPUT_FILE per PAGE_SPLITS into OUTPUT_DIRECTORY."""
    splits(
        INPUT_FILE,
        PAGE_SPLITS,
        OUTPUT_DIRECTORY,
        filter_pages_6_7,
    )


if __name__ == "__main__":
    main()
"""Splits a PDF based on provided comma-separated page counts.

Below, the comma-separated splits may look like the following:

    1-10, 11-20, 21-30

The ranges may overlap if desired.

Usage:
    splits.py <input> <splits> [options]

Options:
    -o DIR --out=DIR  Output directory [default: ./]
"""
from PyPDF2 import PdfFileWriter, PdfFileReader
import docopt
from typing import Callable
import os.path
def splits(input_filename: str, sections: str, out_directory: str,
           filter_pages: Callable[[int, int, int], bool]) -> None:
    """Split the input PDF into one output file per comma-separated section.

    Each section is written as ``start-end``, a 1-based inclusive page range.
    Section number ``i`` (counting from 0) is saved as
    ``output-<i>.pdf`` inside ``out_directory``.

    :param input_filename: path of the PDF to read
    :param sections: comma-separated page ranges, e.g. ``"1-10,11-20"``;
        whitespace around each range is ignored
    :param out_directory: directory for the output files; created if missing
    :param filter_pages: called as ``filter_pages(j, start, end)`` with the
        0-based page index ``j`` and the section's 1-based bounds; the page
        is skipped when it returns a truthy value
    :raises ValueError: if a section is not two integers joined by ``-``
    """
    # Keep the input open for the whole run: the reader is built on this
    # stream and pages taken from it may still be read when a section is
    # written out.
    with open(input_filename, "rb") as input_stream:
        input1 = PdfFileReader(input_stream)
        # The directory does not depend on the section, so create it once.
        os.makedirs(out_directory, exist_ok=True)
        for i, section in enumerate(sections.split(',')):
            output = PdfFileWriter()
            print(' * [INFO] Starting packet %d' % i)
            start, end = map(int, section.strip().split('-'))
            # Ranges are 1-based and inclusive; page lookups are 0-based.
            for j in range(start - 1, end):
                if filter_pages(j, start, end):
                    continue
                output.addPage(input1.getPage(j))
            output_filename = os.path.join(out_directory, "output-%d.pdf" % i)
            # Close each output promptly so its contents are flushed to disk.
            with open(output_filename, "wb") as output_stream:
                output.write(output_stream)
def main():
    """Command-line entry point: parse the arguments and split the PDF.

    No pages are filtered out when the module is run from the command line.
    """
    arguments = docopt.docopt(__doc__, version='1.0')
    splits(
        arguments['<input>'],
        # The key must match the positional name in the usage pattern.
        arguments['<splits>'],
        # Fall back to the current directory if docopt reports no value.
        arguments['--out'] or './',
        # splits() requires a page filter; the CLI keeps every page.
        lambda j, start, end: False,
    )


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment