Created
December 2, 2012 08:15
-
-
Save dacc/4187694 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python
#
# Trims a PDF made from a PowerPoint presentation, removing slides that share all
# non-whitespace pixels with the one in front of them, i.e. it only retains "key"
# slides, dropping transitional ones whose content is redundant.
#
from sys import argv | |
from numpy import array, fromstring | |
from itertools import islice | |
from CoreGraphics import * | |
from Quartz import * | |
# Device RGB colour space handed to every bitmap context this script creates.
COLORSPACE = CGColorSpaceCreateDeviceRGB()
# Number of bytes each pixel occupies in the rendered bitmaps.
BYTES_PER_PIXEL = 4
# Byte values of a pixel that counts as whitespace (every byte at 255).
WHITESPACE_PIXEL = array([255] * BYTES_PER_PIXEL)
def file_url_for_path(path):
    '''
    Build a Core Foundation URL object for the POSIX-style file system path given.
    '''
    is_directory = 0  # the final argument of the call: path is a file, not a directory
    url = CFURLCreateWithFileSystemPath(None, path, kCFURLPOSIXPathStyle, is_directory)
    return url
def get_page(doc, number):
    '''
    Look up the page with the given page number in the PDF document doc.
    '''
    page = CGPDFDocumentGetPage(doc, number)
    return page
def get_crop_box(page):
    '''
    Return the rectangle of the crop box of the PDF page given.
    '''
    crop_rect = CGPDFPageGetBoxRect(page, kCGPDFCropBox)
    return crop_rect
def get_pages(doc):
    '''
    Yield a (page number, page) pair for every page of doc, in document order.

    PDF page numbers start at one, so the valid numbers run from 1 up to and
    including the page count reported for the document.
    '''
    count = CGPDFDocumentGetNumberOfPages(doc)
    # range() excludes its upper bound, so go one past the count; otherwise the
    # final page of the document would silently be skipped.
    for number in range(1, count + 1):
        yield number, get_page(doc, number)
def get_binary_bitmap(page):
    '''
    Return a binary array representing all pixels in the page, where zero represents
    a whitespace pixel and one represents a non-whitespace pixel.

    The page is drawn into an off-screen bitmap context whose size is taken from the
    page's crop box (truncated to whole pixels). The result is a flat integer array
    with one entry per pixel.
    '''
    bounds = get_crop_box(page)
    width = int(bounds.size.width)
    height = int(bounds.size.height)
    context = CGBitmapContextCreate(None, width, height, 8, BYTES_PER_PIXEL * width,
                                    COLORSPACE, kCGImageAlphaNoneSkipLast)
    CGContextDrawPDFPage(context, page)

    # Copy the raw bitmap bytes out of the context and view them as one row of
    # BYTES_PER_PIXEL uint8 values per pixel.
    total_bytes = width * height * BYTES_PER_PIXEL
    byte_tuple = CGBitmapContextGetData(context).as_tuple(total_bytes)
    byte_string = ''.join(byte_tuple)
    pixels = fromstring(byte_string, dtype='uint8').reshape(
        (height * width, BYTES_PER_PIXEL))

    # A pixel is non-whitespace as soon as any of its bytes differs from
    # WHITESPACE_PIXEL. Doing the comparison on the whole array at once avoids a
    # Python-level loop over every pixel while producing the same 0/1 values.
    return (pixels != WHITESPACE_PIXEL).any(axis=1).astype(int)
def find_key_slides(doc):
    '''
    Yield the page numbers of slides that don't share all of their non-whitespace
    pixels with the slide that follows them. This allows transitional slides that
    incrementally introduce the elements of the "key" slide to be dropped.

    The last slide seen is always yielded: no later slide exists that could
    contain its content, so it can never be redundant.
    '''
    last_map, last_number = None, 0
    for current_number, page in get_pages(doc):
        current_map = get_binary_bitmap(page)
        # The previous slide is a key slide when the current one is missing at
        # least one of the previous slide's non-whitespace pixels.
        if last_map is not None and not (current_map & last_map == last_map).all():
            yield last_number
        last_map, last_number = current_map, current_number
    # Nothing follows the final slide, so it is a key slide by definition. The
    # guard keeps a document without pages from yielding the placeholder number.
    if last_map is not None:
        yield last_number
def draw_page(context, page):
    '''
    Append page to the PDF context as a new page of its own.
    '''
    # The drawing call has to sit between the begin-page and end-page calls.
    CGPDFContextBeginPage(context, None)
    CGContextDrawPDFPage(context, page)
    CGPDFContextEndPage(context)
def make_trimmed(path):
    '''
    Make a new PDF based on the one given by path, where only "key" slides as
    defined by find_key_slides are copied over.

    The result is written next to the original, with '-trimmed' inserted before
    the '.pdf' extension. The crop box of the first page is used as the media box
    of the new document.

    Raises IOError when the source document cannot be opened or the output
    document cannot be created.
    '''
    assert path.endswith('.pdf')
    new_path = path[:-len('.pdf')] + '-trimmed.pdf'

    original = CGPDFDocumentCreateWithURL(file_url_for_path(path))
    if original is None:
        raise IOError('could not open PDF document: %s' % path)

    bounds = get_crop_box(get_page(original, 1))
    context = CGPDFContextCreateWithURL(file_url_for_path(new_path), bounds, None)
    if context is None:
        raise IOError('could not create PDF document: %s' % new_path)

    try:
        for number in find_key_slides(original):
            draw_page(context, get_page(original, number))
    finally:
        # Closing the context flushes pending data and completes the PDF file;
        # without it the output is only finalized if and when the context object
        # happens to be released.
        CGPDFContextClose(context)
if __name__ == '__main__':
    # Exactly one argument, the PDF to trim, is expected after the script name.
    if len(argv) != 2:
        print('usage: %s filename.pdf' % argv[0])
    else:
        make_trimmed(argv[1])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment