Skip to content

Instantly share code, notes, and snippets.

@dacc
Created December 2, 2012 08:15
Show Gist options
  • Save dacc/4187694 to your computer and use it in GitHub Desktop.
Save dacc/4187694 to your computer and use it in GitHub Desktop.
#!/usr/bin/python
#
# Trims a PDF made from a PowerPoint presentation, removing slides whose
# non-whitespace pixels all reappear in the slide that follows them, i.e. it only
# retains "key" slides, dropping transitional ones whose content is redundant.
#
from sys import argv
from numpy import array, fromstring
from itertools import islice
from CoreGraphics import *
from Quartz import *
# Number of bytes used to store one pixel in the bitmaps rendered below.
BYTES_PER_PIXEL = 4
# Byte values that a pixel must have, in every byte, to count as whitespace.
WHITESPACE_PIXEL = array([255] * BYTES_PER_PIXEL)
# Device RGB color space shared by every bitmap context this script creates.
COLORSPACE = CGColorSpaceCreateDeviceRGB()
def file_url_for_path(path):
    '''
    Build a CFURL for the POSIX-style filesystem path given by path.
    '''
    url = CFURLCreateWithFileSystemPath(None, path, kCFURLPOSIXPathStyle, 0)
    return url
def get_page(doc, number):
    '''
    Look up the page with the given page number in doc.
    '''
    page = CGPDFDocumentGetPage(doc, number)
    return page
def get_crop_box(page):
    '''
    Return the crop box rectangle of page.
    '''
    crop_box = CGPDFPageGetBoxRect(page, kCGPDFCropBox)
    return crop_box
def get_pages(doc):
    '''
    Yield a (number, page) pair for every page in doc, in page order.

    Page numbers are 1-based, so the valid numbers run from 1 through the
    document's page count inclusive.
    '''
    count = CGPDFDocumentGetNumberOfPages(doc)
    # range() excludes its stop value; stop at count + 1 so that the last page
    # of the document is visited too.
    for number in range(1, count + 1):
        yield number, get_page(doc, number)
def get_binary_bitmap(page):
    '''
    Return a flat binary array with one entry per pixel of the rendered page,
    where zero represents a whitespace pixel and one represents a
    non-whitespace pixel.

    The page is drawn into a bitmap with 8 bits per component and
    BYTES_PER_PIXEL bytes per pixel, sized to the integer part of the page's
    crop box.
    '''
    bounds = get_crop_box(page)
    width = int(bounds.size.width)
    height = int(bounds.size.height)
    context = CGBitmapContextCreate(None, width, height, 8, BYTES_PER_PIXEL * width,
                                    COLORSPACE, kCGImageAlphaNoneSkipLast)
    CGContextDrawPDFPage(context, page)
    total_bytes = width * height * BYTES_PER_PIXEL
    byte_tuple = CGBitmapContextGetData(context).as_tuple(total_bytes)
    byte_string = ''.join(byte_tuple)
    # One row per pixel, one column per byte of that pixel.
    # NOTE(review): the join above and fromstring assume the bitmap data arrives
    # as a tuple of one-character strings -- confirm on the targeted Python and
    # NumPy versions.
    pixels = fromstring(byte_string, dtype='uint8').reshape(
        (height * width, BYTES_PER_PIXEL))
    # A pixel is non-whitespace as soon as any one of its bytes differs from
    # WHITESPACE_PIXEL. Comparing the whole array at once replaces a Python-level
    # loop over every pixel with a single vectorized operation.
    return (pixels != WHITESPACE_PIXEL).any(axis=1).astype(int)
def find_key_slides(doc):
    '''
    Yield the page numbers of slides that don't share all of their non-whitespace
    pixels with the slide that follows them. This allows transitional slides that
    incrementally introduce the elements of the "key" slide to be dropped.

    The last slide visited has no following slide that could make it redundant,
    so it is always yielded. Nothing is yielded for a document with no pages.
    '''
    last_map, last_number = None, 0
    for current_number, page in get_pages(doc):
        current_map = get_binary_bitmap(page)
        if last_map is not None:
            # Bitmaps of different sizes cannot be compared pixel by pixel, so
            # treat a change of page size as the start of new content.
            if current_map.shape != last_map.shape:
                yield last_number
            # The previous slide is redundant only when every one of its
            # non-whitespace pixels is also non-whitespace in the current slide.
            elif not (current_map & last_map == last_map).all():
                yield last_number
        last_map, last_number = current_map, current_number
    # Emit the final slide, which the loop above never gets to compare against
    # a successor.
    if last_map is not None:
        yield last_number
def draw_page(context, page):
    '''
    Append page to the PDF context as one new page.
    '''
    # The draw call must sit between the begin/end pair that delimits the page.
    CGPDFContextBeginPage(context, None)
    CGContextDrawPDFPage(context, page)
    CGPDFContextEndPage(context)
def make_trimmed(path):
    '''
    Make a new PDF based on the one given by path, where only "key" slides as
    defined by find_key_slides are copied over.

    The result is written next to the original, with the '.pdf' suffix replaced
    by '-trimmed.pdf'.

    Raises ValueError if path does not end in '.pdf', and IOError if the
    document at path could not be opened.
    '''
    # Validate with a real exception: an assert would be stripped under -O.
    if not path.endswith('.pdf'):
        raise ValueError('expected a path ending in .pdf, got %r' % (path,))
    new_path = path[0:-4] + '-trimmed.pdf'
    original = CGPDFDocumentCreateWithURL(file_url_for_path(path))
    if original is None:
        raise IOError('could not open PDF document %r' % (path,))
    key_slides = find_key_slides(original)
    # The first page's crop box is used as the media box of the output document.
    bounds = get_crop_box(get_page(original, 1))
    context = CGPDFContextCreateWithURL(file_url_for_path(new_path), bounds, None)
    try:
        for number in key_slides:
            draw_page(context, get_page(original, number))
    finally:
        # Close explicitly so the output file is completed here rather than
        # whenever the context object happens to be released.
        CGPDFContextClose(context)
# Script entry point: expects exactly one argument, the PDF to trim.
if __name__ == '__main__':
    if len(argv) != 2:
        print('usage: %s filename.pdf' % argv[0])
    else:
        make_trimmed(argv[1])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment