Skip to content

Instantly share code, notes, and snippets.

@roderickm
Created April 14, 2017 16:44
Show Gist options
  • Save roderickm/4504d4e4a58770a850c428d1909747b7 to your computer and use it in GitHub Desktop.
Save roderickm/4504d4e4a58770a850c428d1909747b7 to your computer and use it in GitHub Desktop.
Combine PDFs into a single file, allowing a single page to be picked from the input files.
#! /usr/bin/python
#
# join
# Joining pages from a collection of PDF files into a single PDF file.
#
# join [--pickpage <pagenum>] [--output <file>] [--shuffle] [--verbose] <input.pdf> ...
#
# Parameter:
#
# --pickpage <pagenum>
# Pick only page number <pagenum> from each source file.
# If this option is not specified then all of the pages from a PDF file are appended
# to the output PDF file before the next input PDF file is processed.
#
# --shuffle
# Take a page from each PDF input file in turn before taking another from each file.
# If this option is not specified then all of the pages from a PDF file are appended
# to the output PDF file before the next input PDF file is processed.
#
# --verbose
# Write progress information about what this tool is doing to standard output.
#
import sys
import os
import getopt
import tempfile
import shutil
from CoreFoundation import *
from Quartz.CoreGraphics import *
# Module-wide flag: when True, the functions in this file print progress
# messages. main() switches it on when -v/--verbose is given.
verbose = False
def createPDFDocumentWithPath(path):
    """Open the PDF file at `path` and return the resulting CGPDFDocument.

    Parameters:
        path -- file system path of the PDF to open (byte string or text).

    Returns whatever CGPDFDocumentCreateWithURL returns for the file URL.
    NOTE(review): Core Graphics presumably yields None when the file cannot
    be opened as a PDF -- confirm against the API documentation.
    """
    if verbose:
        print("Creating PDF document from file %s" % (path))
    # CFURLCreateFromFileSystemRepresentation takes a byte buffer plus its
    # length in bytes.  Encode text paths first so that the length passed is
    # a byte count, not a character count (they differ for non-ASCII names).
    fsPath = path
    if not isinstance(fsPath, bytes):
        fsPath = fsPath.encode(sys.getfilesystemencoding())
    return CGPDFDocumentCreateWithURL(
        CFURLCreateFromFileSystemRepresentation(
            kCFAllocatorDefault, fsPath, len(fsPath), False))
def writePageFromDoc(writeContext, doc, pageNum):
    """Draw page `pageNum` of `doc` as a new page of `writeContext`.

    Parameters:
        writeContext -- PDF graphics context that receives the page.
        doc          -- source CGPDFDocument.
        pageNum      -- number of the page to copy.

    Nothing is written (and nothing is reported) when CGPDFDocumentGetPage
    yields no page for `pageNum`.
    """
    global verbose
    sourcePage = CGPDFDocumentGetPage(doc, pageNum)
    if not sourcePage:
        return
    pageBox = CGPDFPageGetBoxRect(sourcePage, kCGPDFMediaBox)
    # An empty media box is replaced by None when the output page is begun.
    CGContextBeginPage(writeContext, None if CGRectIsEmpty(pageBox) else pageBox)
    CGContextDrawPDFPage(writeContext, sourcePage)
    CGContextEndPage(writeContext)
    if verbose:
        print("Copied page %d from %s" % (pageNum, doc))
def shufflePages(writeContext, docs, maxPages):
    """Interleave the input documents page by page.

    For every page number from 1 through `maxPages`, that page is copied
    from each document in `docs` (in order) before moving on to the next
    page number.  Copying is delegated to writePageFromDoc.
    """
    pageNum = 1
    while pageNum <= maxPages:
        for pdf in docs:
            writePageFromDoc(writeContext, pdf, pageNum)
        pageNum += 1
def append(writeContext, docs, maxPages, pickpage=None):
    """Copy pages from each document in turn, one document after another.

    Parameters:
        writeContext -- PDF graphics context that receives the pages.
        docs         -- source documents, processed in order.
        maxPages     -- highest page number to attempt for each document.
        pickpage     -- when truthy, copy only this page number from each
                        document; otherwise copy pages 1 through maxPages.
    """
    # The set of page numbers is the same for every document, so build it once.
    if pickpage:
        pageNumbers = [pickpage]
    else:
        pageNumbers = range(1, maxPages + 1)
    for pdf in docs:
        for pageNum in pageNumbers:
            writePageFromDoc(writeContext, pdf, pageNum)
def main(argv):
    """Parse the command line and write the combined PDF.

    Parameters:
        argv -- command-line arguments without the program name.

    Options:
        -o/--output <file>      destination PDF; without it nothing is written.
        -p/--pickpage <pagenum> copy only this page (1 or greater) from each
                                input file.
        -s/--shuffle            interleave the inputs page by page; takes
                                precedence over --pickpage.
        -v/--verbose            print progress messages.

    The remaining arguments are the input PDF files.

    Exits with status 2 (via sys.exit) on an unrecognised option or an
    invalid --pickpage value.
    """
    global verbose

    # Parse the command line options.
    try:
        options, args = getopt.getopt(
            argv, "o:p:sv", ["output=", "pickpage=", "shuffle", "verbose"])
    except getopt.GetoptError:
        usage()
        sys.exit(2)

    # Turn verbose mode on before handling the other options so that their
    # progress messages do not depend on where -v appears on the command line.
    if any(option in ("-v", "--verbose") for option, _ in options):
        print("Verbose mode enabled.")
        verbose = True

    # The PDF context we will draw into to create a new PDF.
    writeContext = None
    # Initialised up front: it is read below even when --pickpage is absent.
    pickpage = None
    shuffle = False

    for option, arg in options:
        if option in ("-o", "--output"):
            if verbose:
                print("Setting %s as the destination." % (arg))
            # The CFURL call takes a byte buffer and its length in bytes.
            outputPath = arg
            if not isinstance(outputPath, bytes):
                outputPath = outputPath.encode(sys.getfilesystemencoding())
            writeContext = CGPDFContextCreateWithURL(
                CFURLCreateFromFileSystemRepresentation(
                    kCFAllocatorDefault, outputPath, len(outputPath), False),
                None, None)
        elif option in ("-p", "--pickpage"):
            try:
                pickpage = int(arg)
            except ValueError as e:
                print(str(e))
                sys.exit(2)
            # Page numbers start at 1; 0 would otherwise be silently treated
            # as "no page picked" by the truthiness checks downstream.
            if pickpage < 1:
                print("Page number must be 1 or greater: %s" % (arg))
                sys.exit(2)
            if verbose:
                print("Picking page number %d from each input file." % (pickpage))
        elif option in ("-s", "--shuffle"):
            if verbose:
                print("Shuffle pages to the output file.")
            shuffle = True
        elif option in ("-v", "--verbose"):
            # Already handled before this loop.
            pass
        else:
            print("Unknown argument: %s" % (option))

    if writeContext:
        # Create PDF documents for all of the input files.  A real list is
        # needed because the documents are traversed more than once.
        docs = [createPDFDocumentWithPath(path) for path in args]

        # Find the maximum number of pages in any input document.
        maxPages = 0
        for doc in docs:
            maxPages = max(maxPages, CGPDFDocumentGetNumberOfPages(doc))

        if shuffle:
            shufflePages(writeContext, docs, maxPages)
        else:
            # pickpage is None when --pickpage was not given, which makes
            # append() copy every page.
            append(writeContext, docs, maxPages, pickpage)

        CGPDFContextClose(writeContext)
        del writeContext
def usage():
    """Print the one-line command synopsis."""
    synopsis = "Usage: join [--pickpage <pagenum>] [--output <file>] [--shuffle] [--verbose]"
    print(synopsis)
# Script entry point: when the file is executed directly (not imported),
# pass the command-line arguments, minus the program name, to main().
if __name__ == "__main__":
    main(sys.argv[1:])
@roderickm
Copy link
Author

This is a modification of the join.py script included with OS X.
Find the original at /System/Library/Automator/Combine\ PDF\ Pages.action/Contents/Resources/join.py.

This script allows me to assemble expense reports easily, grabbing the first page from many statements and assembling them into a single file using the --pickpage 1 option.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment