Created
October 9, 2010 00:37
-
-
Save bartek/617756 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
""" | |
PDF To Image Converter Class.

The base of this script was derived from the pdfpeek package by David Brenneman:
http://pypi.python.org/pypi/collective.pdfpeek

I simply wanted something in a single file. The way files are saved could use
some adjustment, but this was mostly an experiment to see how previewing PDF
files works.

Requires pyPdf and PIL to do the essential reading, plus the Ghostscript `gs`
executable to render the pages.

Usage:

    pdfimage = ConvertToImage()
    pdf = open('test.pdf', 'rb').read()
    images = pdfimage.generate_images(pdf)

`images` will be a list of tuples, one per page generated. Each tuple holds the
PIL image object for the page and the location of the saved file as a string.
""" | |
import os | |
import subprocess | |
import StringIO | |
import logging | |
import pyPdf | |
from PIL import Image | |
# Module-level logger used by ConvertToImage to report conversion problems.
logger = logging.getLogger('pdfConvertor')
class ConvertToImage(object):
    """Render the pages of a PDF to JPEG files by piping it through Ghostscript.

    Each page is rendered by a separate `gs` process that reads the PDF from
    stdin and writes the JPEG to stdout; the result is then re-saved with PIL
    as ``<page_number>.jpg`` inside `base_path`.
    """

    def __init__(self,
                 base_path='.',
                 quality='99',
                 graphicsAlphaBits='4',
                 textAlphaBits='4'):
        """
        `base_path` is the directory in which the images will be saved. Current by default
        `quality` should generally be pretty high, or you will get a very artifacted image.
        `graphicsAlphaBits` and `textAlphaBits` control the oversampling. Leave this as 4 unless
        you know what you're doing.
        """
        self.base_path = base_path
        self.quality = quality
        self.graphicsAlphaBits = graphicsAlphaBits
        self.textAlphaBits = textAlphaBits

    def _options_to_list(self, page_number):
        """Return the Ghostscript argv that renders only `page_number` as JPEG.

        First and last page are both set to `page_number` so exactly one page
        is produced. The trailing "-" makes Ghostscript read the document from
        stdin, and the output file is Ghostscript's `%stdout` pseudo-file.
        """
        return [
            "gs",
            "-q",
            "-sDEVICE=jpeg",
            "-dJPEGQ=%s" % self.quality,
            "-dGraphicsAlphaBits=%s" % self.graphicsAlphaBits,
            "-dTextAlphaBits=%s" % self.textAlphaBits,
            "-dFirstPage=%s" % page_number,
            "-dLastPage=%s" % page_number,
            # Literal Ghostscript syntax, not a Python format placeholder.
            "-sOutputFile=%stdout",
            "-",
        ]

    def transform(self, pdf_data_string, page_number):
        """Run Ghostscript on the PDF data and return the JPEG bytes for one page.

        `pdf_data_string` is the raw PDF content; `page_number` is 1-based.
        A non-zero Ghostscript exit status is logged as a warning, and whatever
        was written to stdout is returned regardless.
        """
        gs_cmd = self._options_to_list(page_number)
        gs_process = subprocess.Popen(gs_cmd,
                                      stdout=subprocess.PIPE,
                                      stdin=subprocess.PIPE)
        # Let communicate() feed stdin while draining stdout. Writing the whole
        # document to stdin first can deadlock once both pipe buffers fill up.
        # communicate() also closes stdin and waits for the process to exit.
        image = gs_process.communicate(pdf_data_string)[0]
        return_code = gs_process.returncode
        # Negative codes mean the process was killed by a signal; report those too.
        if return_code != 0:
            logger.warning("Ghostscript process didn't quite work! Error Code: %s", return_code)
        return image

    def generate_images(self, pdf_data_string):
        """Convert every page of the PDF to a JPEG saved under `base_path`.

        Returns a list of ``(image, output_file)`` tuples in page order, where
        `image` is the PIL image opened from the Ghostscript output and
        `output_file` is the path it was saved to. An empty list is returned
        when the data cannot be opened as a PDF or cannot be decrypted with an
        empty password; the reason is logged as a warning.
        """
        images = []
        pdf_data_string = str(pdf_data_string)

        try:
            pdf = pyPdf.PdfFileReader(StringIO.StringIO(pdf_data_string))
        except Exception:
            # Most likely not a PDF at all; nothing can be rendered.
            logger.warning("Error opening PDF file.", exc_info=True)
            return images

        if pdf.isEncrypted:
            # Try the empty password; bail out rather than continue with a
            # document that could not be decrypted.
            try:
                if pdf.decrypt("") == 0:
                    logger.warning("This PDF is password protected.")
                    return images
            except NotImplementedError:
                logger.warning("Document uses an unsupported encryption method.")
                return images

        # Page numbers handed to Ghostscript (and used in file names) are 1-based.
        for page_number in range(1, pdf.getNumPages() + 1):
            raw_image = self.transform(pdf_data_string, page_number)
            output_file = os.path.join(self.base_path, "%s.jpg" % page_number)
            # Use PIL to generate a jpeg from the raw data
            image_thumb = Image.open(StringIO.StringIO(raw_image))
            image_thumb.save(output_file, "JPEG")
            images.append((image_thumb, output_file))
        return images
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment