Created
April 8, 2017 07:16
-
-
Save alvinwan/0181ccfea15a321b4cba7bea23669285 to your computer and use it in GitHub Desktop.
Splitting PDFs programmatically using a list of set ranges
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Demonstration of provided split utility. | |
Divides a PDF into several chunks and eliminates certain pages from each pdf. | |
""" | |
from splits import splits | |
# Source PDF that gets carved up.
INPUT_FILE = "./merged.pdf"

# Directory that receives one output PDF per range below.
OUTPUT_DIRECTORY = "./out"

# Comma-separated "start-end" page ranges, one per output PDF. Written as
# adjacent string literals, which Python joins into a single string.
PAGE_SPLITS = (
    "1-10,11-20,21-30,31-42,43-52,53-64,"
    "65-74,75-86,87-98,99-110,111-122,123-134,"
    "135-146,147-158,159-168,169-180,181-192,193-204,"
    "205-216,217-228,229-240,241-250,251-262,263-274,"
    "275-284,285-296,297-308,309-320,321-332,333-344,"
    "345-356,357-368,369-380,381-392,393-404,405-416"
)
def filter_pages_6_7(j: int, start: int, end: int) -> bool:
    """Tell whether page ``j`` should be left out of its section.

    A page is dropped when it is the 6th or 7th page of a section that
    holds more than 10 pages; sections of 10 pages or fewer keep all pages.

    Args:
        j: Zero-based index of the page in the input PDF.
        start: One-based number of the section's first page.
        end: One-based number of the section's last page (inclusive).

    Returns:
        True if the page must be skipped, False otherwise.
    """
    # end - start + 1 pages in the section; "more than 10" means a gap >= 10.
    if end - start < 10:
        return False
    # Convert the zero-based index into a one-based position in the section.
    position = j - start + 2
    return position == 6 or position == 7
def main():
    """Run the demo: split INPUT_FILE by PAGE_SPLITS into OUTPUT_DIRECTORY.

    Pages are screened through ``filter_pages_6_7`` before being written.
    """
    splits(
        input_filename=INPUT_FILE,
        sections=PAGE_SPLITS,
        out_directory=OUTPUT_DIRECTORY,
        filter_pages=filter_pages_6_7,
    )


if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Splits a PDF based on provided comma-separated page counts. | |
Below, the comma-separated splits may look like the following: | |
1-10, 11-20, 21-30 | |
The ranges may overlap if desired.
Usage: | |
split.py <input> <splits> [options] | |
Options: | |
-o --out Output directory [default: ./] | |
""" | |
from PyPDF2 import PdfFileWriter, PdfFileReader | |
import docopt | |
from typing import Callable | |
import os.path | |
def _keep_every_page(j: int, start: int, end: int) -> bool:
    """Default page filter: never asks for a page to be dropped."""
    return False


def splits(input_filename: str, sections: str, out_directory: str,
           filter_pages: Callable[[int, int, int], bool] = _keep_every_page
           ) -> None:
    """Split a PDF into one output file per comma-separated page range.

    Each section has the form ``start-end`` with one-based, inclusive page
    numbers. Section number ``i`` (counting from zero) is written to
    ``output-<i>.pdf`` inside ``out_directory``, which is created if missing.

    Args:
        input_filename: Path of the PDF to read.
        sections: Comma-separated ``start-end`` ranges, e.g. ``"1-10,11-20"``.
        out_directory: Directory that receives the output PDFs.
        filter_pages: Called as ``filter_pages(j, start, end)`` where ``j`` is
            the zero-based page index in the input; a true result drops the
            page from the output. Defaults to keeping every page.

    Raises:
        ValueError: If a section is not two integers separated by ``-``.
    """
    # The input stream stays open for the whole loop because pages are read
    # from the reader while each output is being assembled and written.
    with open(input_filename, "rb") as input_stream:
        reader = PdfFileReader(input_stream)
        # Creating the directory once is enough; it does not change per section.
        os.makedirs(out_directory, exist_ok=True)
        for i, section in enumerate(sections.split(',')):
            writer = PdfFileWriter()
            print(' * [INFO] Starting packet %d' % i)
            start, end = map(int, section.strip().split('-'))
            # Page numbers are one-based; reader indices are zero-based.
            for j in range(start - 1, end):
                if filter_pages(j, start, end):
                    continue
                writer.addPage(reader.getPage(j))
            output_filename = os.path.join(out_directory, "output-%d.pdf" % i)
            # Close each output explicitly so its contents are flushed to disk
            # before the next section starts.
            with open(output_filename, "wb") as output_stream:
                writer.write(output_stream)
def main():
    """Command-line entry point: parse arguments and split the PDF.

    Reads ``<input>`` and ``<splits>`` as declared in the module usage string
    and writes every page of every range (no page filtering).
    """
    arguments = docopt.docopt(__doc__, version='1.0')
    # The usage string names the positional argument <splits>; the previous
    # lookup of '<sections>' raised KeyError.
    sections = arguments['<splits>']
    # NOTE(review): depending on how the Options line is laid out, docopt may
    # report --out as a boolean flag or as None rather than a path; fall back
    # to the current directory in that case - confirm the intended option form.
    out_directory = arguments['--out']
    if not isinstance(out_directory, str):
        out_directory = './'
    # The function is named `splits` (not `split`) and expects a page filter;
    # pass one that keeps every page.
    splits(arguments['<input>'], sections, out_directory,
           lambda j, start, end: False)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment