Skip to content

Instantly share code, notes, and snippets.

@winhamwr
Last active August 29, 2015 14:23
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save winhamwr/9dbf16a5a73759e35c43 to your computer and use it in GitHub Desktop.
Save winhamwr/9dbf16a5a73759e35c43 to your computer and use it in GitHub Desktop.
pydocx image handler for resizing images to their displayed size
import base64
import logging
import cgi
import logging
import os
import posixpath
import re
import subprocess
import time
from tempfile import NamedTemporaryFile
from urlparse import unquote, urlparse
from StringIO import StringIO
from PIL import Image
from pydocx.export import PyDocXHTMLExporter
from pstat.misc.get_image_from_src import get_image_from_src, is_encoded_image
from pstat.misc.replace_extension import replace_extension
# Lowercase file extensions (without the dot) of images that are dropped from
# the output instead of being processed.
IMAGE_EXTENSIONS_TO_SKIP = ['emf', 'wmf', 'svg']
# PIL format names that are re-saved as GIF rather than in their own format.
IMAGE_FORMATS_TO_GIF_COMPRESS = ['BMP', 'TIFF']
# Module logger; the name is hard-coded rather than derived from __name__.
logger = logging.getLogger('pstat.misc.image')
class PstatDocx2Html(PyDocXHTMLExporter):
    """
    HTML exporter that post-processes every image before it is rendered.

    Images are wrapped in a ``PstatImage`` so they can be resized to the
    dimensions they are displayed at and have their filename brought in line
    with the format they were saved in.
    """

    def image(self, image_data, filename, x, y, uri_is_external,
              *args, **kwargs):
        """
        Render one image, shrinking it to its displayed size first.

        For an external image ``image_data`` is a link, so the real data and
        a usable filename are looked up before anything else happens.

        Returns an empty string (dropping the image) when the filename has
        one of the skipped extensions or when either dimension is missing.
        Otherwise returns whatever the parent exporter's ``image`` returns
        for the processed data, filename and dimensions.
        """
        if uri_is_external:
            resolved = get_image_data_and_filename(image_data, filename)
            image_data, filename = resolved

        prepared = PstatImage(image_data, filename, x, y)
        if (prepared.has_skipable_extension()
                or not prepared.has_height_and_width()):
            return ''

        # Order matters: open the image, resize/re-encode it, then rename.
        prepared.prime_image()
        prepared.resize_image()
        prepared.update_filename()

        parent = super(PstatDocx2Html, self)
        return parent.image(
            prepared.image_data,
            prepared.filename,
            prepared.x,
            prepared.y,
            uri_is_external,
            *args,
            **kwargs
        )
class PstatImage(object):
    """
    An image taken from a document together with its displayed size.

    Attributes:
        image_data: the image content; replaced by the re-encoded content
            once ``resize_image`` has saved the image successfully.
        filename: the image filename (may be empty or ``None``).
        x, y: displayed width and height as integers; 0 when unknown.
        image_format: format name the current ``image_data`` is stored in,
            or ``None`` until ``resize_image`` has inspected the image.
        image: the opened PIL image, or ``None`` if it has not been (or
            could not be) opened.
    """

    def __init__(self, image_data, filename, x, y):
        self.image_data = image_data
        self.filename = filename
        self.x = self._get_dimension(x)
        self.y = self._get_dimension(y)
        self.image_format = None
        self.image = None

    def has_skipable_extension(self):
        """
        Return True when the filename's extension is one we never process.

        An empty or missing filename is never considered skipable.
        """
        if not self.filename:
            return False
        extension = self.filename.lower().rsplit('.')[-1]
        return extension in IMAGE_EXTENSIONS_TO_SKIP

    def has_height_and_width(self):
        """
        Return True only when both dimensions are positive.

        Negative sizes are treated like missing ones because they cannot be
        used as a resize target.
        """
        return self.x > 0 and self.y > 0

    def _get_dimension(self, dim):
        """
        Convert a dimension such as ``'120px'`` to an integer.

        Numbers are accepted as-is and fractional values are truncated.
        Anything empty or unparsable yields 0 (a warning is logged for the
        unparsable case).
        """
        if not dim:
            return 0
        if isinstance(dim, (int, float)):
            return int(dim)
        try:
            # Go through float() so that values like '12.5px' are truncated
            # instead of being rejected.
            return int(float(dim.strip().strip('px')))
        except (AttributeError, TypeError, ValueError, OverflowError):
            logger.warning('Unable to convert size: "%s"', dim)
            return 0

    def prime_image(self):
        """
        Open ``image_data`` with PIL and keep the result in ``self.image``.

        Base64 data URIs are decoded first. If PIL cannot open the data a
        warning is logged and ``self.image`` stays ``None``.
        """
        image_data = self.image_data
        match = is_encoded_image(image_data)
        if match:
            image_data = base64.b64decode(match.group('image_data'))
        try:
            self.image = Image.open(StringIO(image_data))
        except (IOError, SystemError):
            # PIL can't open it, leave the image_data as is.
            logger.warning('Not able to open image')

    def resize_image(self):
        """
        Shrink the opened image to its displayed size and re-encode it.

        Does nothing for base64 encoded images or when no image was opened.
        The image is only ever scaled down (by pixel area), never up. BMP and
        TIFF images are saved as GIF. ``image_data`` and ``image_format`` are
        only switched to the re-encoded result when saving succeeds; on
        failure the original data is kept and ``image_format`` stays at the
        source format, so the filename is never given an extension that does
        not match the data.
        """
        # Let's not resize a base64 encoded image.
        if is_encoded_image(self.image_data):
            return
        if not self.image:
            return

        source_format = self.image.format
        self.image_format = source_format

        displayed_size = (self.x, self.y)
        displayed_area = self.x * self.y
        actual_x, actual_y = self.image.size
        actual_area = actual_x * actual_y

        # We don't ever want to resize an image and have it be larger than
        # the original, so compare the pixel counts (area) before resizing.
        # Without usable dimensions there is nothing to resize to.
        should_shrink = (
            self.has_height_and_width()
            and displayed_area < actual_area
            and displayed_size != self.image.size
        )
        if should_shrink:
            try:
                self.image = self.image.resize(
                    displayed_size,
                    Image.ANTIALIAS,
                )
            except (IOError, SystemError):
                # Image can't be resized, such is life.
                logger.warning('Unable to resize')

        target_format = source_format
        if source_format in IMAGE_FORMATS_TO_GIF_COMPRESS:
            # Convert to gif.
            target_format = 'GIF'

        output = StringIO()
        try:
            self.image.save(output, target_format)
        except (IOError, SystemError):
            # PIL can't save this image; keep the original data and format.
            logger.warning('Unable to save image')
            return
        self.image_data = output.getvalue()
        self.image_format = target_format

    def update_filename(self):
        """
        Give the filename the extension matching ``image_format``.

        Nothing happens when either the format or the filename is unknown.
        """
        if not self.image_format:
            return
        if not self.filename:
            return
        self.filename = replace_extension(
            self.filename,
            self.image_format.lower(),
        )
def get_image_data_and_filename(image_data, filename):
    """
    Resolve an external image into its real data and a usable filename.

    For external images ``image_data`` is really a link to the image and
    ``filename`` is likely garbage, so the filename is rebuilt from the last
    path segment of the link (unless the link is a base64 data URI, in which
    case the returned filename is ``None``).

    Returns ``(real_image_data, sanitized_filename)``, or the arguments
    unchanged when no image data could be obtained.
    """
    url_path = urlparse(image_data).path
    real_filename = posixpath.basename(url_path)

    sanitized_filename = None
    if not is_encoded_image(image_data):
        sanitized_filename = sanitize_filename(real_filename)

    real_image_data = get_image_from_src(image_data)
    if real_image_data is not None:
        return real_image_data, sanitized_filename
    return image_data, filename
def sanitize_filename(filename):
    r"""
    Strip a programmatically added timestamp prefix and URL-unquote the name.

    When we create attachments from pydocx we usually add a timestamp followed
    by a dash (-) to make the image unique for round-tripping. In an effort to
    prevent a bunch of timestamps preceding the image name (in the event a
    document is round-tripped several times), strip off the timestamp
    and dash. When images come from docx they are always `image\d+`. We only
    want to strip off the timestamp and dash if they were programmatically
    added, so the whole name has to have that shape; a name that merely
    starts like one is left alone.

    >>> sanitize_filename('1409764011-image1.gif')
    'image1.gif'
    >>> sanitize_filename('409764011-image1.gif')
    '409764011-image1.gif'
    >>> sanitize_filename('1409764011-image.gif')
    '1409764011-image.gif'
    >>> sanitize_filename('image%20%232014.gif')
    'image #2014.gif'
    >>> sanitize_filename('1409764011-image1.gif-other.png')
    '1409764011-image1.gif-other.png'
    """
    # (timestamp)-image(image_number).(file_extension), matched against the
    # entire name. The part after the timestamp is captured so it can be
    # returned directly instead of splitting on a dash that may appear later.
    regex = re.compile(r'\d{10}-(image\d+\.\w{3,4})\Z')
    match = regex.match(filename)
    if match:
        filename = match.group(1)
    return unquote(filename)
# coding: utf-8
import re
import requests
from requests.exceptions import InvalidSchema
# Recognises base64 image data URIs; the image type and the encoded payload
# are exposed as the named groups ``extension`` and ``image_data``.
data_uri_regex = re.compile(
    r'data:image/(?P<extension>\w+)'
    r';base64,(?P<image_data>.+)'
)


def is_encoded_image(image_data):
    """
    Check whether ``image_data`` starts with a base64 image data URI.

    Returns the regex match object (truthy, with ``extension`` and
    ``image_data`` groups) when it does, otherwise ``None``.
    """
    found = data_uri_regex.match(image_data)
    return found
def get_image_from_src(src, timeout=30):
    '''
    Take a src attribute from an image tag and return the content image data
    associated with that image. At the minimum we should handle https:// and
    base64 encoded images.

    ``timeout`` is the number of seconds to wait on the remote server before
    giving up (passed straight to ``requests.get``).

    Returns the downloaded bytes when ``src`` could be fetched successfully.
    In every other case (empty src, base64 data URI, unsupported or missing
    scheme, network failure, timeout, HTTP error status) ``src`` itself is
    returned unchanged so the caller can fall back to it.
    '''
    if not src:
        return src
    # A base64 encoded image already carries its own data; there is nothing
    # to download.
    if is_encoded_image(src):
        return src
    # NOTE(security): src comes from the document being converted, so this
    # fetches an arbitrary, possibly internal, URL on the server's behalf.
    # Consider restricting schemes/hosts if documents are untrusted.
    try:
        response = requests.get(src, timeout=timeout)
        # Don't hand back an error page as if it were image data.
        response.raise_for_status()
    except requests.RequestException:
        # Covers unsupported/missing schemes, connection problems, timeouts
        # and HTTP error statuses. Not really sure what is going on here,
        # punt for now.
        return src
    return response.content
# coding: utf-8
import os
def replace_extension(file_path, new_ext):
    """
    Return ``file_path`` with its extension replaced by ``new_ext``.

    ``new_ext`` may be given with or without the leading separator. Only the
    extension of the final path component is replaced; when that component
    has no extension, ``new_ext`` is simply appended.

    >>> replace_extension('one/two/three.four.doc', '.html')
    'one/two/three.four.html'
    >>> replace_extension('one/two/three.four.DOC', '.html')
    'one/two/three.four.html'
    >>> replace_extension('one/two/three.four.DOC', 'html')
    'one/two/three.four.html'
    >>> replace_extension('image', 'gif')
    'image.gif'
    >>> replace_extension('one.two/three', 'gif')
    'one.two/three.gif'
    """
    if not new_ext.startswith(os.extsep):
        new_ext = os.extsep + new_ext
    # splitext only looks at the last path component and copes with names
    # that have no extension at all, unlike a bare rfind on the separator.
    root, _ = os.path.splitext(file_path)
    return root + new_ext
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment