Skip to content

Instantly share code, notes, and snippets.

@mhl
Created January 13, 2010 10:36
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mhl/276099 to your computer and use it in GitHub Desktop.
Save mhl/276099 to your computer and use it in GitHub Desktop.
Split a PDF into pages with graphics and those without
#!/usr/bin/python2.5
# This script attempts to take a PDF file and split it into two PDF
# files: one containing every page that has graphics and the other
# containing the remaining pages. You can select pages by whether
# they contain /Subtype /Image (bitmaps in my case) or /XObject,
# which seems to catch both bitmaps and included PDF files (which
# are most diagrams exported to PDF in my case).
import glob
import os
import re
import shutil
import subprocess
import sys
import tempfile
from optparse import OptionParser
# Save the original working directory so it can be restored after the
# script has finished working inside the temporary directory:
owd = os.getcwd()

# Command-line interface: two page-selection switches plus a verbosity
# flag, followed by exactly three positional filenames.
parser = OptionParser("Usage: %prog [options] <INPUT-PDF> <OUTPUT-WITH-IMAGES-PDF> <OUTPUT-WITH-TEXT-PDF>")
parser.add_option('-i', '--images', dest='images', action="store_true",
                  default=False, help='include pages with images')
parser.add_option('-x', '--xobjects', dest='xobjects', action="store_true",
                  default=False, help='include pages with XObjects')
parser.add_option('-v', '--verbose', dest='verbose', action="store_true",
                  default=False, help='print progress messages')
(options, args) = parser.parse_args()

# At least one selection criterion is required, otherwise no page could
# ever be classified as an image page.
if not (options.images or options.xobjects):
    print("You must specify one or more of --images or --xobjects")
    sys.exit(1)

if len(args) != 3:
    parser.print_help()
    sys.exit(1)

# Resolve the paths now, while still in the original working directory;
# the script later changes into the temporary directory.
input_filename = os.path.realpath(args[0])
output_filename_images = os.path.realpath(args[1])
output_filename_text = os.path.realpath(args[2])

if options.verbose:
    print("Creating temporary directory...")

# tempfile.mkdtemp() signals failure by raising rather than by returning
# a false value, so the failure message has to live in an exception
# handler to be reachable.
try:
    temporary_directory = tempfile.mkdtemp()
except EnvironmentError:
    print("Creating a temporary directory failed")
    sys.exit(1)
# A method to get the page number from a filename, or return -1 if malformed:
def extract_page_number_from_filename(s):
    """Return the first run of decimal digits in s as an int.

    Returns -1 if s contains no digits at all.
    """
    # Raw string so that \d reaches the regex engine as written.
    m = re.search(r"(\d+)", s)
    if m:
        # Explicit base 10: the zero-padded page numbers must not be
        # interpreted in any other base.
        return int(m.group(1), 10)
    return -1
# Main work: burst the input into single pages, classify each page, then
# ask pdftk to concatenate each class of pages into its own output file.
# The temporary directory is removed whatever happens.
try:
    # (It doesn't seem possible to control where pdftk dumps the
    # doc_data.txt file, so change to the temporary directory.)
    os.chdir(temporary_directory)

    if options.verbose:
        print("Splitting the PDF (%s) into pages..." % (input_filename,))
    result = subprocess.call(["pdftk", input_filename, "burst", "output", "page_%09d.pdf"])
    if result != 0:
        print("Splitting the PDF (%s) into pages with pdftk failed" % (input_filename,))
        sys.exit(3)

    # The page data is read from a pipe, so encode the markers once up
    # front to make the substring tests work whether the pipe yields
    # str or bytes.
    image_marker = "/Subtype /Image".encode("ascii")
    xobject_marker = "/XObject".encode("ascii")

    text_pages = []
    image_pages = []

    # Process the single-page files in page order.
    page_filenames = sorted(glob.glob("page_*.pdf"),
                            key=extract_page_number_from_filename)

    for page_filename in page_filenames:
        page_number = extract_page_number_from_filename(page_filename)

        # Now extract the uncompressed version of that single page:
        if options.verbose:
            print("Examining page %d (%s)" % (page_number, page_filename))
        uncompress = subprocess.Popen(
            ["pdftk", page_filename, "output", "-", "uncompress"],
            stdout=subprocess.PIPE)
        uncompressed = uncompress.communicate()[0]

        # A failed uncompress yields no usable data; stop rather than
        # silently classifying the page as text.
        if uncompress.returncode != 0:
            print("Uncompressing page %d (%s) with pdftk failed" % (page_number, page_filename))
            sys.exit(6)

        has_image = options.images and image_marker in uncompressed
        has_xobject = options.xobjects and xobject_marker in uncompressed
        if has_image or has_xobject:
            image_pages.append(page_number)
        else:
            text_pages.append(page_number)

    # So now call pdftk twice, once to extract the image pages, and
    # once to extract the text pages:
    outputs = (
        ("images", image_pages, output_filename_images, 4),
        ("text", text_pages, output_filename_text, 5),
    )
    for description, pages, output_filename, exit_status in outputs:
        # With no page ranges "pdftk ... cat" would copy the whole
        # document, so skip an empty category instead of running it.
        if not pages:
            print("No pages with %s were found; not creating %s" % (description, output_filename))
            continue

        command = (["pdftk", input_filename, "cat"]
                   + [str(page) for page in pages]
                   + ["output", output_filename])

        if options.verbose:
            print("Concatenating pages with %s to: %s" % (description, output_filename))
        result = subprocess.call(command)
        if result != 0:
            print("Extracting the pages with %s failed; the command was:" % (description,))
            print(" " + " ".join(command))
            sys.exit(exit_status)
finally:
    # Leave the temporary directory before deleting it.
    os.chdir(owd)
    if options.verbose:
        print("Removing the temporary directory: %s" % (temporary_directory,))
    # Best-effort cleanup: a failure to remove the directory must not
    # mask the script's real exit status.
    shutil.rmtree(temporary_directory, ignore_errors=True)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment