# mhl/pdf-pages-with-graphics.py

## pdf-pages-with-graphics.py
#!/usr/bin/python2.5

# This script attempts to take a PDF file and split it into two PDF
# files, one of which has all the images and the other which has
# everything else.  You can select pages by whether they contain
# /Subtype /Image (bitmaps in my case) or /XObject which seems to
# catch bitmaps and included PDF files (which are most diagrams
# exported to PDF in my case).

import os
import sys
import tempfile
from optparse import OptionParser
import subprocess
import glob
import re

# Remember the original working directory so that it can be restored
# after the work in the temporary directory is finished:
owd = os.getcwd()

parser = OptionParser("Usage: %prog [options] <INPUT-PDF> <OUTPUT-WITH-IMAGES-PDF> <OUTPUT-WITH-TEXT-PDF>")
parser.add_option('-i', '--images', dest='images', action="store_true",
                  default=False, help='include pages with images')
parser.add_option('-x', '--xobjects', dest='xobjects', action="store_true",
                  default=False, help='include pages with XObjects')
# (The help text here used to be a copy of the --xobjects one.)
parser.add_option('-v', '--verbose', dest='verbose', action="store_true",
                  default=False, help='print progress messages')

(options, args) = parser.parse_args()

# At least one page-selection criterion has to be given:
if not (options.images or options.xobjects):
    print("You must specify one or more of --images or --xobjects")
    sys.exit(1)

# Exactly three positional arguments: the input and the two outputs.
if len(args) != 3:
    parser.print_help()
    sys.exit(1)

# Resolve the paths against the current directory now; the script
# changes directory before it uses them.
input_filename = os.path.realpath(args[0])
output_filename_images = os.path.realpath(args[1])
output_filename_text = os.path.realpath(args[2])

if options.verbose:
    print("Creating temporary directory...")
temporary_directory = tempfile.mkdtemp()
if not temporary_directory:
    print("Creating a temporary directory failed")
    sys.exit(1)

def extract_page_number_from_filename(s):
    """Return the page number found in the filename `s`.

    The first run of decimal digits anywhere in `s` is parsed as a
    base-10 integer, so "page_000000012.pdf" gives 12.  If `s`
    contains no digits at all (a malformed name), -1 is returned.
    """
    # A raw string keeps "\d" from being read as a (invalid) string
    # escape sequence before the regex engine sees it.
    m = re.search(r"(\d+)", s)
    if m:
        return int(m.group(1), 10)
    return -1

try:
    # (It doesn't seem possible to control where pdftk dumps the
    # doc_data.txt file, so change to the temporary directory.)
    os.chdir(temporary_directory)

    if options.verbose:
        print("Splitting the PDF (%s) into pages..." % (input_filename,))
    result = subprocess.call(["pdftk", input_filename, "burst", "output", "page_%09d.pdf"])
    if result != 0:
        print("Splitting the PDF (%s) into pages with pdftk failed" % (input_filename,))
        sys.exit(3)

    text_pages = []
    image_pages = []

    # Go through the single-page files in page-number order:
    page_filenames = sorted(glob.glob("page_*.pdf"),
                            key=extract_page_number_from_filename)
    for page_filename in page_filenames:
        page_number = extract_page_number_from_filename(page_filename)
        if options.verbose:
            print("Examining page %d (%s)" % (page_number, page_filename))
        # Now extract the uncompressed version of that single page, so
        # that the markers below can be searched for directly:
        uncompressed = subprocess.Popen(
            ["pdftk", page_filename, "output", "-", "uncompress"],
            stdout=subprocess.PIPE).communicate()[0]
        if options.images and re.search('/Subtype /Image', uncompressed):
            image_pages.append(page_number)
        elif options.xobjects and re.search('/XObject', uncompressed):
            image_pages.append(page_number)
        else:
            text_pages.append(page_number)

    # So now call pdftk up to twice, once to extract the image pages,
    # and once to extract the text pages:

    command_start = ["pdftk", input_filename, "cat"]
    command_end = ["output"]

    extract_images_command = (command_start + [str(p) for p in image_pages] +
                              command_end + [output_filename_images])
    extract_text_command = (command_start + [str(p) for p in text_pages] +
                            command_end + [output_filename_text])

    # pdftk's "cat" takes every page of the input when it is given no
    # page ranges (see the pdftk manual), so running it for an empty
    # selection would copy the whole document into that output.  An
    # empty selection is therefore reported and skipped instead.
    if image_pages:
        if options.verbose:
            print("Concatenating pages with images to: %s" % (output_filename_images,))
        result = subprocess.call(extract_images_command)
        if result != 0:
            print("Extracting the pages with images failed; the command was:")
            print("  " + " ".join(extract_images_command))
            sys.exit(4)
    else:
        print("No pages with images were found; not creating %s" % (output_filename_images,))

    if text_pages:
        if options.verbose:
            print("Concatenating pages with text to: %s" % (output_filename_text,))
        result = subprocess.call(extract_text_command)
        if result != 0:
            print("Extracting the pages with text failed; the command was:")
            print("  " + " ".join(extract_text_command))
            sys.exit(5)
    else:
        print("No pages with text were found; not creating %s" % (output_filename_text,))

finally:
    # Runs on success, on failure and on sys.exit() alike: leave the
    # temporary directory before deleting it.
    os.chdir(owd)
    if options.verbose:
        print("Removing the temporary directory: %s" % (temporary_directory,))
    subprocess.call(["rm", "-rf", temporary_directory])
	#!/usr/bin/python2.5

	# This script attempts to take a PDF file and split it into two PDF
	# files, one of which has all the images and the other which has
	# everything else. You can select pages by whether they contain
	# /Subtype /Image (bitmaps in my case) or /XObject which seems to
	# catch bitmaps and included PDF files (which are most diagrams
	# exported to PDF in my case).

	import os
	import sys
	import tempfile
	from optparse import OptionParser
	import subprocess
	import glob
	import re

	# Remember the original working directory so that it can be restored
	# after the work in the temporary directory is finished:
	owd = os.getcwd()

	parser = OptionParser("Usage: %prog [options] <INPUT-PDF> <OUTPUT-WITH-IMAGES-PDF> <OUTPUT-WITH-TEXT-PDF>")
	parser.add_option('-i', '--images', dest='images', action="store_true",
		default=False, help='include pages with images')
	parser.add_option('-x', '--xobjects', dest='xobjects', action="store_true",
		default=False, help='include pages with XObjects')
	# (The help text here used to be a copy of the --xobjects one.)
	parser.add_option('-v', '--verbose', dest='verbose', action="store_true",
		default=False, help='print progress messages')

	(options, args) = parser.parse_args()

	# At least one page-selection criterion has to be given:
	if not (options.images or options.xobjects):
		print("You must specify one or more of --images or --xobjects")
		sys.exit(1)

	# Exactly three positional arguments: the input and the two outputs.
	if len(args) != 3:
		parser.print_help()
		sys.exit(1)

	# Resolve the paths against the current directory now; the script
	# changes directory before it uses them.
	input_filename = os.path.realpath(args[0])
	output_filename_images = os.path.realpath(args[1])
	output_filename_text = os.path.realpath(args[2])

	if options.verbose:
		print("Creating temporary directory...")
	temporary_directory = tempfile.mkdtemp()
	if not temporary_directory:
		print("Creating a temporary directory failed")
		sys.exit(1)

	def extract_page_number_from_filename(s):
		"""Return the page number found in the filename `s`.

		The first run of decimal digits anywhere in `s` is parsed as a
		base-10 integer, so "page_000000012.pdf" gives 12.  If `s`
		contains no digits at all (a malformed name), -1 is returned.
		"""
		# A raw string keeps "\d" from being read as a (invalid) string
		# escape sequence before the regex engine sees it.
		m = re.search(r"(\d+)", s)
		if m:
			return int(m.group(1), 10)
		return -1

	try:
		# (It doesn't seem possible to control where pdftk dumps the
		# doc_data.txt file, so change to the temporary directory.)
		os.chdir(temporary_directory)

		if options.verbose:
			print("Splitting the PDF (%s) into pages..." % (input_filename,))
		result = subprocess.call(["pdftk", input_filename, "burst", "output", "page_%09d.pdf"])
		if result != 0:
			print("Splitting the PDF (%s) into pages with pdftk failed" % (input_filename,))
			sys.exit(3)

		text_pages = []
		image_pages = []

		# Go through the single-page files in page-number order:
		page_filenames = sorted(glob.glob("page_*.pdf"),
			key=extract_page_number_from_filename)
		for page_filename in page_filenames:
			page_number = extract_page_number_from_filename(page_filename)
			if options.verbose:
				print("Examining page %d (%s)" % (page_number, page_filename))
			# Now extract the uncompressed version of that single page, so
			# that the markers below can be searched for directly:
			uncompressed = subprocess.Popen(
				["pdftk", page_filename, "output", "-", "uncompress"],
				stdout=subprocess.PIPE).communicate()[0]
			if options.images and re.search('/Subtype /Image', uncompressed):
				image_pages.append(page_number)
			elif options.xobjects and re.search('/XObject', uncompressed):
				image_pages.append(page_number)
			else:
				text_pages.append(page_number)

		# So now call pdftk up to twice, once to extract the image pages,
		# and once to extract the text pages:

		command_start = ["pdftk", input_filename, "cat"]
		command_end = ["output"]

		extract_images_command = (command_start + [str(p) for p in image_pages] +
			command_end + [output_filename_images])
		extract_text_command = (command_start + [str(p) for p in text_pages] +
			command_end + [output_filename_text])

		# pdftk's "cat" takes every page of the input when it is given no
		# page ranges (see the pdftk manual), so running it for an empty
		# selection would copy the whole document into that output.  An
		# empty selection is therefore reported and skipped instead.
		if image_pages:
			if options.verbose:
				print("Concatenating pages with images to: %s" % (output_filename_images,))
			result = subprocess.call(extract_images_command)
			if result != 0:
				print("Extracting the pages with images failed; the command was:")
				print(" " + " ".join(extract_images_command))
				sys.exit(4)
		else:
			print("No pages with images were found; not creating %s" % (output_filename_images,))

		if text_pages:
			if options.verbose:
				print("Concatenating pages with text to: %s" % (output_filename_text,))
			result = subprocess.call(extract_text_command)
			if result != 0:
				print("Extracting the pages with text failed; the command was:")
				print(" " + " ".join(extract_text_command))
				sys.exit(5)
		else:
			print("No pages with text were found; not creating %s" % (output_filename_text,))

	finally:
		# Runs on success, on failure and on sys.exit() alike: leave the
		# temporary directory before deleting it.
		os.chdir(owd)
		if options.verbose:
			print("Removing the temporary directory: %s" % (temporary_directory,))
		subprocess.call(["rm", "-rf", temporary_directory])