dacc/gist:4187694

## gistfile1.py
#!/usr/bin/python
#
# Trims a PDF made from a Powerpoint presentation, removing slides that share all
# non-whitespace pixels with the one in front of them. i.e. Only retains "key"
# slides, dropping transitional ones whose content is redundant.
#

from sys import argv
from numpy import array, fromstring
from itertools import islice
from CoreGraphics import *
from Quartz import *

# Device RGB color space handed to the bitmap context each page is rendered into.
COLORSPACE = CGColorSpaceCreateDeviceRGB()
# Bytes stored per rendered pixel; determines the bitmap row stride and the
# total size of the pixel buffer that is read back.
BYTES_PER_PIXEL = 4
# The four byte values of a pixel that counts as whitespace (every byte 255).
WHITESPACE_PIXEL = array((255, 255, 255, 255))

def file_url_for_path(path):
    '''
    Return a Core Foundation URL object for the file at path, with path
    interpreted as a POSIX-style file system path.
    '''
    url = CFURLCreateWithFileSystemPath(None, path, kCFURLPOSIXPathStyle, 0)
    return url

def get_page(doc, number):
    '''
    Return the page object that CGPDFDocumentGetPage reports for the given
    page number of doc.
    '''
    page = CGPDFDocumentGetPage(doc, number)
    return page

def get_crop_box(page):
    '''
    Return the rectangle of the page's crop box (kCGPDFCropBox).
    '''
    crop_rect = CGPDFPageGetBoxRect(page, kCGPDFCropBox)
    return crop_rect

def get_pages(doc):
    '''
    Yield a (number, page) pair for every page of doc, in document order.

    Page numbers are 1-based, so they run from 1 up to and including the
    page count reported for the document.
    '''
    count = CGPDFDocumentGetNumberOfPages(doc)
    # range() excludes its stop value, so the stop must be count + 1;
    # stopping at count would silently skip the final page.
    for number in range(1, count + 1):
        yield number, get_page(doc, number)

def get_binary_bitmap(page):
    '''
    Return a flat binary array with one entry per pixel of the rendered page,
    where zero represents a whitespace pixel and one represents a
    non-whitespace pixel.

    The page is drawn into an off-screen bitmap sized to its crop box (with
    the dimensions truncated to whole numbers), and every 4-byte pixel is
    compared against WHITESPACE_PIXEL.
    '''
    bounds = get_crop_box(page)

    width = int(bounds.size.width)
    height = int(bounds.size.height)

    # 8 bits per component; rows are exactly BYTES_PER_PIXEL * width bytes
    # long, so the buffer holds width * height pixels back to back.
    context = CGBitmapContextCreate(None, width, height, 8, BYTES_PER_PIXEL * width,
        COLORSPACE, kCGImageAlphaNoneSkipLast)
    CGContextDrawPDFPage(context, page)

    total_bytes = width * height * BYTES_PER_PIXEL
    byte_tuple = CGBitmapContextGetData(context).as_tuple(total_bytes)
    # NOTE(review): joining with '' assumes as_tuple() yields one-character
    # strings, and binary-mode fromstring is deprecated in recent numpy
    # releases in favour of frombuffer -- confirm before porting.
    byte_string = ''.join(byte_tuple)

    pixels = fromstring(byte_string, dtype='uint8').reshape(
        (height * width, BYTES_PER_PIXEL))

    # A pixel is non-whitespace when any of its bytes differs from the
    # whitespace value. One vectorized comparison over the whole array gives
    # the same 0/1 integers as testing each pixel in a Python loop, without
    # the per-pixel interpreter overhead.
    binary_bitmap = (pixels != WHITESPACE_PIXEL).any(axis=1).astype(int)

    return binary_bitmap

def find_key_slides(doc):
    '''
    Yield the page numbers of "key" slides: slides that do not share all of
    their non-whitespace pixels with the slide that follows them. This allows
    transitional slides that incrementally introduce the elements of the
    "key" slide to be dropped.

    The final slide is always yielded, because no later slide exists that
    could contain its content. A slide is also kept when the next slide's
    bitmap has a different size, since the two cannot be compared pixel by
    pixel.
    '''
    last_map, last_number = None, 0
    for current_number, page in get_pages(doc):
        current_map = get_binary_bitmap(page)
        if last_map is not None:
            # The previous slide is redundant only if every pixel it marks
            # is also marked on the current slide.
            superseded = (
                current_map.shape == last_map.shape
                and ((current_map & last_map) == last_map).all()
            )
            if not superseded:
                yield last_number
        last_map, last_number = current_map, current_number

    # Nothing follows the last slide, so it can never be superseded; without
    # this the end of the presentation would be missing from the output.
    if last_map is not None:
        yield last_number

def draw_page(context, page):
    '''
    Draw page into the PDF context as one output page: a new page is begun,
    the source page is drawn into it, and the page is ended.
    '''
    # None is passed as the second argument (the page-info parameter in the
    # Quartz API), so no per-page settings are supplied here.
    CGPDFContextBeginPage(context, None)
    CGContextDrawPDFPage(context, page)
    CGPDFContextEndPage(context)

def make_trimmed(path):
    '''
    Make a new PDF based on the one given by path, where only "key" slides as
    defined by find_key_slides are copied over.

    The result is written next to the input, with '.pdf' replaced by
    '-trimmed.pdf'. The crop box of the first page is used as the page box of
    the output document.

    Raises ValueError if path does not end in '.pdf', and IOError if the
    input cannot be opened as a PDF or the output cannot be created.
    '''
    # Explicit check rather than assert: asserts are stripped under
    # "python -O", which would let a bad path produce a garbled output name.
    if not path.endswith('.pdf'):
        raise ValueError('expected a path ending in .pdf, got %r' % (path,))
    new_path = path[0:-4] + '-trimmed.pdf'

    original = CGPDFDocumentCreateWithURL(file_url_for_path(path))
    if original is None:
        raise IOError('could not open %r as a PDF document' % (path,))
    # find_key_slides is a generator; pages are analysed lazily while the
    # output is being written below.
    key_slides = find_key_slides(original)

    bounds = get_crop_box(get_page(original, 1))
    context = CGPDFContextCreateWithURL(file_url_for_path(new_path), bounds, None)
    if context is None:
        raise IOError('could not create output PDF %r' % (new_path,))

    try:
        for number in key_slides:
            draw_page(context, get_page(original, number))
    finally:
        # Close explicitly so pending data is written and the output file is
        # completed, instead of relying on the context being garbage collected.
        CGPDFContextClose(context)

# Script entry point: expects exactly one argument, the PDF to trim.
if __name__ == '__main__':
    if len(argv) != 2:
        print('usage: %s filename.pdf' % argv[0])
    else:
        make_trimmed(argv[1])
	#!/usr/bin/python
	#
	# Trims a PDF made from a Powerpoint presentation, removing slides that share all
	# non-whitespace pixels with the one in front of them. i.e. Only retains "key"
	# slides, dropping transitional ones whose content is redundant.
	#

	from sys import argv
	from numpy import array, fromstring
	from itertools import islice
	from CoreGraphics import *
	from Quartz import *

	# Device RGB color space handed to the bitmap context each page is rendered into.
	COLORSPACE = CGColorSpaceCreateDeviceRGB()
	# Bytes stored per rendered pixel; determines the bitmap row stride and the
	# total size of the pixel buffer that is read back.
	BYTES_PER_PIXEL = 4
	# The four byte values of a pixel that counts as whitespace (every byte 255).
	WHITESPACE_PIXEL = array((255, 255, 255, 255))

	def file_url_for_path(path):
	    '''
	    Return a Core Foundation URL object for the file at path, with path
	    interpreted as a POSIX-style file system path.
	    '''
	    url = CFURLCreateWithFileSystemPath(None, path, kCFURLPOSIXPathStyle, 0)
	    return url

	def get_page(doc, number):
	    '''
	    Return the page object that CGPDFDocumentGetPage reports for the given
	    page number of doc.
	    '''
	    page = CGPDFDocumentGetPage(doc, number)
	    return page

	def get_crop_box(page):
	    '''
	    Return the rectangle of the page's crop box (kCGPDFCropBox).
	    '''
	    crop_rect = CGPDFPageGetBoxRect(page, kCGPDFCropBox)
	    return crop_rect

	def get_pages(doc):
	    '''
	    Yield a (number, page) pair for every page of doc, in document order.

	    Page numbers are 1-based, so they run from 1 up to and including the
	    page count reported for the document.
	    '''
	    count = CGPDFDocumentGetNumberOfPages(doc)
	    # range() excludes its stop value, so the stop must be count + 1;
	    # stopping at count would silently skip the final page.
	    for number in range(1, count + 1):
	        yield number, get_page(doc, number)

	def get_binary_bitmap(page):
	    '''
	    Return a flat binary array with one entry per pixel of the rendered page,
	    where zero represents a whitespace pixel and one represents a
	    non-whitespace pixel.

	    The page is drawn into an off-screen bitmap sized to its crop box (with
	    the dimensions truncated to whole numbers), and every 4-byte pixel is
	    compared against WHITESPACE_PIXEL.
	    '''
	    bounds = get_crop_box(page)

	    width = int(bounds.size.width)
	    height = int(bounds.size.height)

	    # 8 bits per component; rows are exactly BYTES_PER_PIXEL * width bytes
	    # long, so the buffer holds width * height pixels back to back.
	    context = CGBitmapContextCreate(None, width, height, 8, BYTES_PER_PIXEL * width,
	        COLORSPACE, kCGImageAlphaNoneSkipLast)
	    CGContextDrawPDFPage(context, page)

	    total_bytes = width * height * BYTES_PER_PIXEL
	    byte_tuple = CGBitmapContextGetData(context).as_tuple(total_bytes)
	    # NOTE(review): joining with '' assumes as_tuple() yields one-character
	    # strings, and binary-mode fromstring is deprecated in recent numpy
	    # releases in favour of frombuffer -- confirm before porting.
	    byte_string = ''.join(byte_tuple)

	    pixels = fromstring(byte_string, dtype='uint8').reshape(
	        (height * width, BYTES_PER_PIXEL))

	    # A pixel is non-whitespace when any of its bytes differs from the
	    # whitespace value. One vectorized comparison over the whole array gives
	    # the same 0/1 integers as testing each pixel in a Python loop, without
	    # the per-pixel interpreter overhead.
	    binary_bitmap = (pixels != WHITESPACE_PIXEL).any(axis=1).astype(int)

	    return binary_bitmap

	def find_key_slides(doc):
	    '''
	    Yield the page numbers of "key" slides: slides that do not share all of
	    their non-whitespace pixels with the slide that follows them. This allows
	    transitional slides that incrementally introduce the elements of the
	    "key" slide to be dropped.

	    The final slide is always yielded, because no later slide exists that
	    could contain its content. A slide is also kept when the next slide's
	    bitmap has a different size, since the two cannot be compared pixel by
	    pixel.
	    '''
	    last_map, last_number = None, 0
	    for current_number, page in get_pages(doc):
	        current_map = get_binary_bitmap(page)
	        if last_map is not None:
	            # The previous slide is redundant only if every pixel it marks
	            # is also marked on the current slide.
	            superseded = (
	                current_map.shape == last_map.shape
	                and ((current_map & last_map) == last_map).all()
	            )
	            if not superseded:
	                yield last_number
	        last_map, last_number = current_map, current_number

	    # Nothing follows the last slide, so it can never be superseded; without
	    # this the end of the presentation would be missing from the output.
	    if last_map is not None:
	        yield last_number

	def draw_page(context, page):
	    '''
	    Draw page into the PDF context as one output page: a new page is begun,
	    the source page is drawn into it, and the page is ended.
	    '''
	    # None is passed as the second argument (the page-info parameter in the
	    # Quartz API), so no per-page settings are supplied here.
	    CGPDFContextBeginPage(context, None)
	    CGContextDrawPDFPage(context, page)
	    CGPDFContextEndPage(context)

	def make_trimmed(path):
	    '''
	    Make a new PDF based on the one given by path, where only "key" slides as
	    defined by find_key_slides are copied over.

	    The result is written next to the input, with '.pdf' replaced by
	    '-trimmed.pdf'. The crop box of the first page is used as the page box of
	    the output document.

	    Raises ValueError if path does not end in '.pdf', and IOError if the
	    input cannot be opened as a PDF or the output cannot be created.
	    '''
	    # Explicit check rather than assert: asserts are stripped under
	    # "python -O", which would let a bad path produce a garbled output name.
	    if not path.endswith('.pdf'):
	        raise ValueError('expected a path ending in .pdf, got %r' % (path,))
	    new_path = path[0:-4] + '-trimmed.pdf'

	    original = CGPDFDocumentCreateWithURL(file_url_for_path(path))
	    if original is None:
	        raise IOError('could not open %r as a PDF document' % (path,))
	    # find_key_slides is a generator; pages are analysed lazily while the
	    # output is being written below.
	    key_slides = find_key_slides(original)

	    bounds = get_crop_box(get_page(original, 1))
	    context = CGPDFContextCreateWithURL(file_url_for_path(new_path), bounds, None)
	    if context is None:
	        raise IOError('could not create output PDF %r' % (new_path,))

	    try:
	        for number in key_slides:
	            draw_page(context, get_page(original, number))
	    finally:
	        # Close explicitly so pending data is written and the output file is
	        # completed, instead of relying on the context being garbage collected.
	        CGPDFContextClose(context)

	# Script entry point: expects exactly one argument, the PDF to trim.
	if __name__ == '__main__':
	    if len(argv) != 2:
	        print('usage: %s filename.pdf' % argv[0])
	    else:
	        make_trimmed(argv[1])