winhamwr/exporter.py

## exporter.py
import base64
import logging
import cgi
import logging
import os
import posixpath
import re
import subprocess
import time
from tempfile import NamedTemporaryFile
from urlparse import unquote, urlparse
from StringIO import StringIO

from PIL import Image

from pydocx.export import PyDocXHTMLExporter

from pstat.misc.get_image_from_src import get_image_from_src, is_encoded_image
from pstat.misc.replace_extension import replace_extension

# Lower-case filename extensions checked by PstatImage.has_skipable_extension;
# PstatDocx2Html.image returns '' (no markup) for images that match.
IMAGE_EXTENSIONS_TO_SKIP = ['emf', 'wmf', 'svg']
# PIL format names that PstatImage.resize_image re-saves as GIF.
IMAGE_FORMATS_TO_GIF_COMPRESS = ['BMP', 'TIFF']

# Logger used for the image open/resize/save warnings emitted below.
logger = logging.getLogger('pstat.misc.image')


class PstatDocx2Html(PyDocXHTMLExporter):
    """
    HTML exporter whose ``image`` hook filters and normalizes images before
    delegating to ``PyDocXHTMLExporter.image``.
    """

    def image(
            self,
            image_data,
            filename,
            x,
            y,
            uri_is_external,
            *args,
            **kwargs):
        """
        Return the parent exporter's output for one image, or ``''``.

        External images are first resolved through
        ``get_image_data_and_filename``. The image is dropped (``''`` is
        returned) when its extension is in ``IMAGE_EXTENSIONS_TO_SKIP`` or
        when either requested dimension is missing. Otherwise the image is
        opened, resized/re-encoded and renamed via ``PstatImage`` and the
        resulting data, filename and dimensions are handed to the parent.
        """
        if uri_is_external:
            image_data, filename = get_image_data_and_filename(
                image_data,
                filename,
            )

        img = PstatImage(image_data, filename, x, y)
        # Same order as before: extension check first, then dimensions.
        if img.has_skipable_extension() or not img.has_height_and_width():
            return ''

        img.prime_image()
        img.resize_image()
        img.update_filename()

        parent = super(PstatDocx2Html, self)
        return parent.image(
            img.image_data,
            img.filename,
            img.x,
            img.y,
            uri_is_external,
            *args, **kwargs)


class PstatImage(object):
    """
    Hold one image's data, filename and requested size while it is prepared
    for HTML export.

    ``x`` and ``y`` are stored as ints (0 when missing or unparseable) and
    are used as the ``(width, height)`` target passed to PIL's ``resize``.
    ``image`` and ``image_format`` stay ``None`` until ``prime_image`` and
    ``resize_image`` populate them.
    """

    def __init__(self, image_data, filename, x, y):
        self.image_data = image_data
        self.filename = filename
        self.x = self._get_dimension(x)
        self.y = self._get_dimension(y)
        self.image_format = None
        self.image = None

    def has_skipable_extension(self):
        """
        Return True when the filename's extension is listed in
        ``IMAGE_EXTENSIONS_TO_SKIP``. A missing filename is never skipped.
        """
        if not self.filename:
            return False
        # With no '.' in the name the whole lower-cased name is compared.
        extension = self.filename.lower().rsplit('.', 1)[-1]
        return extension in IMAGE_EXTENSIONS_TO_SKIP

    def has_height_and_width(self):
        """Return True only when both requested dimensions are non-zero."""
        return bool(self.x and self.y)

    def _get_dimension(self, dim):
        """
        Convert a size such as ``'120px'`` to an int.

        Returns 0 for a falsy value, and 0 (after logging a warning) for a
        value that is not an integer once leading/trailing ``p``/``x``
        characters are stripped.
        """
        if not dim:
            return 0
        try:
            return int(dim.strip('px'))
        except ValueError:
            logger.warning('Unable to convert size: "%s"', dim)
        return 0

    def prime_image(self):
        """
        Try to open ``image_data`` with PIL and store it on ``self.image``.

        Data URIs are base64-decoded first. If the payload cannot be decoded
        or PIL cannot open it, a warning is logged and ``self.image`` is left
        as it was, so later steps pass the data through untouched.
        """
        image_data = self.image_data
        match = is_encoded_image(image_data)
        if match:
            try:
                image_data = base64.b64decode(match.group('image_data'))
            except (TypeError, ValueError):
                # Malformed base64: TypeError on Python 2, binascii.Error
                # (a ValueError subclass) on Python 3.
                logger.warning('Not able to decode image')
                return
        try:
            self.image = Image.open(StringIO(image_data))
        except (IOError, SystemError):
            # PIL can't open it, keep the image_data as is.
            logger.warning('Not able to open image')

    def resize_image(self):
        """
        Shrink the opened image to the requested size and re-encode it.

        Nothing happens for data URIs or when no image was opened. Formats
        in ``IMAGE_FORMATS_TO_GIF_COMPRESS`` are saved as GIF. ``image_data``
        and ``image_format`` are only switched to the re-encoded result when
        saving succeeds, so the filename extension chosen later by
        ``update_filename`` always matches the bytes in ``image_data``.
        """
        # Let's not resize a base64 encoded image.
        if is_encoded_image(self.image_data):
            return
        if not self.image:
            return

        source_format = self.image.format
        self.image_format = source_format
        target_size = (self.x, self.y)

        # Never scale up: only resize when the requested pixel area is
        # smaller than the image's actual pixel area.
        requested_area = self.x * self.y
        actual_width, actual_height = self.image.size
        actual_area = actual_width * actual_height
        if requested_area < actual_area and target_size != self.image.size:
            try:
                self.image = self.image.resize(target_size, Image.ANTIALIAS)
            except (IOError, SystemError):
                # Image can't be resized, such is life.
                logger.warning('Unable to resize')

        output_format = source_format
        if source_format in IMAGE_FORMATS_TO_GIF_COMPRESS:
            # Convert to gif.
            output_format = 'GIF'

        output = StringIO()
        try:
            self.image.save(output, output_format)
        except (IOError, SystemError):
            # PIL can't save this image; keep the original data and format.
            logger.warning('Unable to save image')
            return
        self.image_data = output.getvalue()
        self.image_format = output_format

    def update_filename(self):
        """
        Replace the filename's extension with the lower-cased image format.

        Does nothing when no format was detected or there is no filename.
        """
        if not self.image_format:
            return
        if not self.filename:
            return
        self.filename = replace_extension(
            self.filename,
            self.image_format.lower(),
        )

def get_image_data_and_filename(image_data, filename):
    """
    Resolve an external image reference into ``(image_data, filename)``.

    For an external image ``image_data`` is really the image's URL (or a
    data URI) and the incoming ``filename`` is likely garbage. The filename
    is rebuilt from the last path segment of the URL and sanitized; for data
    URIs it is ``None``. If ``get_image_from_src`` yields ``None`` the
    original arguments are returned unchanged.
    """
    url_path = urlparse(image_data).path
    basename = posixpath.basename(url_path)

    clean_name = None
    if not is_encoded_image(image_data):
        clean_name = sanitize_filename(basename)

    fetched = get_image_from_src(image_data)
    if fetched is not None:
        return fetched, clean_name
    return image_data, filename


def sanitize_filename(filename):
    r"""
    Strip a programmatically added timestamp prefix and URL-unquote the name.

    When we create attachments from pydocx we usually add a timestamp followed
    by a dash (-) to make the image unique for round-tripping. In an effort to
    prevent a bunch of timestamps preceding the image name (in the event a
    document is round-tripped several times), strip off the timestamp
    and dash. When images come from docx they are always `image\d+`. We only
    want to strip off the timestamp and dash if they were programmatically
    added, so the whole name must have the generated shape.
    >>> sanitize_filename('1409764011-image1.gif')
    'image1.gif'
    >>> sanitize_filename('409764011-image1.gif')
    '409764011-image1.gif'
    >>> sanitize_filename('1409764011-image.gif')
    '1409764011-image.gif'
    >>> sanitize_filename('1409764011-image1.gif-extra.png')
    '1409764011-image1.gif-extra.png'
    >>> sanitize_filename('image%20%232014.gif')
    'image #2014.gif'
    """

    # (timestamp)-image(image_number).(file_extension), anchored at the end
    # too: a name that merely *starts* with this shape must be left alone,
    # otherwise splitting on '-' would throw away part of the real name.
    regex = re.compile(r'\d{10}-image\d+\.\w{3,4}\Z')
    if regex.match(filename):
        _, filename = filename.split('-', 1)
    return unquote(filename)

## misc.get_image_from_src.py
# coding: utf-8
import re

import requests
from requests.exceptions import InvalidSchema


# Matches a base64 image data URI. DOTALL lets the payload group span line
# breaks; without it a line-wrapped payload would be silently truncated at
# the first newline and decode to a corrupt image.
data_uri_regex = re.compile(
    r'data:image/(?P<extension>\w+);base64,(?P<image_data>.+)',
    re.DOTALL,
)


def is_encoded_image(image_data):
    """
    Return a regex match if ``image_data`` starts with a base64 image data
    URI, else ``None``.

    The match exposes the named groups ``extension`` (the image subtype) and
    ``image_data`` (the base64 payload).
    """
    return data_uri_regex.match(image_data)


def get_image_from_src(src, timeout=10):
    '''
    Take a src attribute from an image tag and return the content image data
    associated with that image. At the minimum we should handle https:// and
    base64 encoded images.

    ``src`` is returned unchanged when it is a base64 data URI, or when the
    image cannot be downloaded (unsupported or missing scheme, connection
    failure, timeout, or an HTTP error status). Otherwise the response body
    is returned.

    ``timeout`` is the number of seconds passed to ``requests.get`` so a
    stalled server cannot hang the export forever.
    '''
    # A data URI already carries the image; no need to hit the network.
    if is_encoded_image(src):
        return src

    try:
        response = requests.get(src, timeout=timeout)
        # Don't hand an error page back as if it were image data.
        response.raise_for_status()
    except requests.exceptions.RequestException:
        # Deliberately best-effort: every requests failure derives from
        # RequestException. Punt and let the caller deal with the raw src.
        return src
    return response.content

## misc.replace_extension.py
# coding: utf-8
import os


def replace_extension(file_path, new_ext):
    """
    Return ``file_path`` with its extension replaced by ``new_ext``.

    ``new_ext`` may be given with or without the leading separator. Only the
    extension of the final path component is replaced; a path without an
    extension simply gets ``new_ext`` appended.

    >>> replace_extension('one/two/three.four.doc', '.html')
    'one/two/three.four.html'
    >>> replace_extension('one/two/three.four.DOC', '.html')
    'one/two/three.four.html'
    >>> replace_extension('one/two/three.four.DOC', 'html')
    'one/two/three.four.html'
    >>> replace_extension('image1', 'gif')
    'image1.gif'
    >>> replace_extension('one.two/three', 'html')
    'one.two/three.html'
    """
    if not new_ext.startswith(os.extsep):
        new_ext = os.extsep + new_ext
    # splitext only looks at the last path component and returns an empty
    # extension when there is none, unlike a bare rfind (which returns -1
    # and would chop the last character off the name).
    root, _ = os.path.splitext(file_path)
    return root + new_ext
	import base64
	import logging
	import cgi
	import logging
	import os
	import posixpath
	import re
	import subprocess
	import time
	from tempfile import NamedTemporaryFile
	from urlparse import unquote, urlparse
	from StringIO import StringIO

	from PIL import Image

	from pydocx.export import PyDocXHTMLExporter

	from pstat.misc.get_image_from_src import get_image_from_src, is_encoded_image
	from pstat.misc.replace_extension import replace_extension

	# Lower-case filename extensions checked by
	# PstatImage.has_skipable_extension; matching images are dropped.
	IMAGE_EXTENSIONS_TO_SKIP = ['emf', 'wmf', 'svg']
	# PIL format names that PstatImage.resize_image re-saves as GIF.
	IMAGE_FORMATS_TO_GIF_COMPRESS = ['BMP', 'TIFF']

	# Logger used for the image open/resize/save warnings emitted below.
	logger = logging.getLogger('pstat.misc.image')


	class PstatDocx2Html(PyDocXHTMLExporter):
	    """
	    HTML exporter whose ``image`` hook filters and normalizes images before
	    delegating to ``PyDocXHTMLExporter.image``.
	    """

	    def image(
	            self,
	            image_data,
	            filename,
	            x,
	            y,
	            uri_is_external,
	            *args,
	            **kwargs):
	        """
	        Return the parent exporter's output for one image, or ``''``.

	        External images are first resolved through
	        ``get_image_data_and_filename``. The image is dropped (``''`` is
	        returned) when its extension is in ``IMAGE_EXTENSIONS_TO_SKIP`` or
	        when either requested dimension is missing. Otherwise the image is
	        opened, resized/re-encoded and renamed via ``PstatImage`` and the
	        result is handed to the parent implementation.
	        """
	        if uri_is_external:
	            image_data, filename = get_image_data_and_filename(
	                image_data,
	                filename,
	            )
	        pstat_image = PstatImage(image_data, filename, x, y)
	        if pstat_image.has_skipable_extension():
	            return ''

	        if not pstat_image.has_height_and_width():
	            return ''

	        pstat_image.prime_image()
	        pstat_image.resize_image()
	        pstat_image.update_filename()
	        # Extra positional/keyword arguments must be unpacked again when
	        # forwarded; passing them as ``args, *kwargs`` would hand the
	        # parent a tuple plus the keyword *names* as positionals.
	        return super(
	            PstatDocx2Html,
	            self,
	        ).image(
	            pstat_image.image_data,
	            pstat_image.filename,
	            pstat_image.x,
	            pstat_image.y,
	            uri_is_external,
	            *args, **kwargs)


	class PstatImage(object):
	    """
	    Hold one image's data, filename and requested size while it is prepared
	    for HTML export.

	    ``x`` and ``y`` are stored as ints (0 when missing or unparseable) and
	    are used as the ``(width, height)`` target passed to PIL's ``resize``.
	    ``image`` and ``image_format`` stay ``None`` until ``prime_image`` and
	    ``resize_image`` populate them.
	    """

	    def __init__(self, image_data, filename, x, y):
	        self.image_data = image_data
	        self.filename = filename
	        self.x = self._get_dimension(x)
	        self.y = self._get_dimension(y)
	        self.image_format = None
	        self.image = None

	    def has_skipable_extension(self):
	        """
	        Return True when the filename's extension is listed in
	        ``IMAGE_EXTENSIONS_TO_SKIP``. A missing filename is never skipped.
	        """
	        if not self.filename:
	            return False
	        # With no '.' in the name the whole lower-cased name is compared.
	        extension = self.filename.lower().rsplit('.', 1)[-1]
	        return extension in IMAGE_EXTENSIONS_TO_SKIP

	    def has_height_and_width(self):
	        """Return True only when both requested dimensions are non-zero."""
	        return bool(self.x and self.y)

	    def _get_dimension(self, dim):
	        """
	        Convert a size such as ``'120px'`` to an int.

	        Returns 0 for a falsy value, and 0 (after logging a warning) for a
	        value that is not an integer once leading/trailing ``p``/``x``
	        characters are stripped.
	        """
	        if not dim:
	            return 0
	        try:
	            return int(dim.strip('px'))
	        except ValueError:
	            logger.warning('Unable to convert size: "%s"', dim)
	        return 0

	    def prime_image(self):
	        """
	        Try to open ``image_data`` with PIL and store it on ``self.image``.

	        Data URIs are base64-decoded first. If the payload cannot be
	        decoded or PIL cannot open it, a warning is logged and
	        ``self.image`` is left as it was.
	        """
	        image_data = self.image_data
	        match = is_encoded_image(image_data)
	        if match:
	            try:
	                image_data = base64.b64decode(match.group('image_data'))
	            except (TypeError, ValueError):
	                # Malformed base64: TypeError on Python 2, binascii.Error
	                # (a ValueError subclass) on Python 3.
	                logger.warning('Not able to decode image')
	                return
	        try:
	            self.image = Image.open(StringIO(image_data))
	        except (IOError, SystemError):
	            # PIL can't open it, keep the image_data as is.
	            logger.warning('Not able to open image')

	    def resize_image(self):
	        """
	        Shrink the opened image to the requested size and re-encode it.

	        Nothing happens for data URIs or when no image was opened. Formats
	        in ``IMAGE_FORMATS_TO_GIF_COMPRESS`` are saved as GIF.
	        ``image_data`` and ``image_format`` are only switched to the
	        re-encoded result when saving succeeds, so the extension chosen by
	        ``update_filename`` always matches the bytes in ``image_data``.
	        """
	        # Let's not resize a base64 encoded image.
	        if is_encoded_image(self.image_data):
	            return
	        if not self.image:
	            return

	        source_format = self.image.format
	        self.image_format = source_format
	        target_size = (self.x, self.y)

	        # Never scale up: only resize when the requested pixel area is
	        # smaller than the image's actual pixel area.
	        requested_area = self.x * self.y
	        actual_width, actual_height = self.image.size
	        actual_area = actual_width * actual_height
	        if requested_area < actual_area and target_size != self.image.size:
	            try:
	                self.image = self.image.resize(target_size, Image.ANTIALIAS)
	            except (IOError, SystemError):
	                # Image can't be resized, such is life.
	                logger.warning('Unable to resize')

	        output_format = source_format
	        if source_format in IMAGE_FORMATS_TO_GIF_COMPRESS:
	            # Convert to gif.
	            output_format = 'GIF'

	        output = StringIO()
	        try:
	            self.image.save(output, output_format)
	        except (IOError, SystemError):
	            # PIL can't save this image; keep the original data and format.
	            logger.warning('Unable to save image')
	            return
	        self.image_data = output.getvalue()
	        self.image_format = output_format

	    def update_filename(self):
	        """
	        Replace the filename's extension with the lower-cased image format.

	        Does nothing when no format was detected or there is no filename.
	        """
	        if not self.image_format:
	            return
	        if not self.filename:
	            return
	        self.filename = replace_extension(
	            self.filename,
	            self.image_format.lower(),
	        )

	def get_image_data_and_filename(image_data, filename):
	    """
	    Resolve an external image reference into ``(image_data, filename)``.

	    If the image is an external image then the image_data is actually a
	    link to the image and the filename is likely garbage. The filename is
	    rebuilt from the last path segment of the link and sanitized; for
	    data URIs it is ``None``. If ``get_image_from_src`` yields ``None``
	    the original arguments are returned unchanged.
	    """
	    parsed_url = urlparse(image_data)
	    _, real_filename = posixpath.split(parsed_url.path)
	    match = is_encoded_image(image_data)
	    sanitized_filename = None
	    if not match:
	        sanitized_filename = sanitize_filename(real_filename)
	    real_image_data = get_image_from_src(image_data)
	    if real_image_data is None:
	        return image_data, filename
	    return real_image_data, sanitized_filename


	def sanitize_filename(filename):
	    r"""
	    Strip a programmatically added timestamp prefix and URL-unquote the
	    name.

	    When we create attachments from pydocx we usually add a timestamp
	    followed by a dash (-) to make the image unique for round-tripping. In
	    an effort to prevent a bunch of timestamps preceding the image name (in
	    the event a document is round-tripped several times), strip off the
	    timestamp and dash. When images come from docx they are always
	    `image\d+`. We only want to strip off the timestamp and dash if they
	    were programmatically added, so the whole name must have the generated
	    shape.
	    >>> sanitize_filename('1409764011-image1.gif')
	    'image1.gif'
	    >>> sanitize_filename('409764011-image1.gif')
	    '409764011-image1.gif'
	    >>> sanitize_filename('1409764011-image.gif')
	    '1409764011-image.gif'
	    >>> sanitize_filename('1409764011-image1.gif-extra.png')
	    '1409764011-image1.gif-extra.png'
	    >>> sanitize_filename('image%20%232014.gif')
	    'image #2014.gif'
	    """

	    # (timestamp)-image(image_number).(file_extension), anchored at the
	    # end too: a name that merely *starts* with this shape must be left
	    # alone, otherwise splitting on '-' would drop part of the real name.
	    regex = re.compile(r'\d{10}-image\d+\.\w{3,4}\Z')
	    if regex.match(filename):
	        _, filename = filename.split('-', 1)
	    return unquote(filename)
	# coding: utf-8
	import re

	import requests
	from requests.exceptions import InvalidSchema


	# Matches a base64 image data URI. DOTALL lets the payload group span line
	# breaks; without it a line-wrapped payload would be silently truncated
	# at the first newline and decode to a corrupt image.
	data_uri_regex = re.compile(
	    r'data:image/(?P<extension>\w+);base64,(?P<image_data>.+)',
	    re.DOTALL,
	)


	def is_encoded_image(image_data):
	    """
	    Return a regex match if ``image_data`` starts with a base64 image data
	    URI, else ``None``.

	    The match exposes the named groups ``extension`` (the image subtype)
	    and ``image_data`` (the base64 payload).
	    """
	    return data_uri_regex.match(image_data)


	def get_image_from_src(src, timeout=10):
	    '''
	    Take a src attribute from an image tag and return the content image
	    data associated with that image. At the minimum we should handle
	    https:// and base64 encoded images.

	    ``src`` is returned unchanged when it is a base64 data URI, or when
	    the image cannot be downloaded (unsupported or missing scheme,
	    connection failure, timeout, or an HTTP error status). Otherwise the
	    response body is returned.

	    ``timeout`` is the number of seconds passed to ``requests.get`` so a
	    stalled server cannot hang the export forever.
	    '''
	    # A data URI already carries the image; no need to hit the network.
	    if is_encoded_image(src):
	        return src

	    try:
	        response = requests.get(src, timeout=timeout)
	        # Don't hand an error page back as if it were image data.
	        response.raise_for_status()
	    except requests.exceptions.RequestException:
	        # Deliberately best-effort: every requests failure derives from
	        # RequestException. Punt and let the caller deal with the raw src.
	        return src
	    return response.content
	# coding: utf-8
	import os


	def replace_extension(file_path, new_ext):
	    """
	    Return ``file_path`` with its extension replaced by ``new_ext``.

	    ``new_ext`` may be given with or without the leading separator. Only
	    the extension of the final path component is replaced; a path without
	    an extension simply gets ``new_ext`` appended.

	    >>> replace_extension('one/two/three.four.doc', '.html')
	    'one/two/three.four.html'
	    >>> replace_extension('one/two/three.four.DOC', '.html')
	    'one/two/three.four.html'
	    >>> replace_extension('one/two/three.four.DOC', 'html')
	    'one/two/three.four.html'
	    >>> replace_extension('image1', 'gif')
	    'image1.gif'
	    """
	    if not new_ext.startswith(os.extsep):
	        new_ext = os.extsep + new_ext
	    # splitext only looks at the last path component and returns an empty
	    # extension when there is none, unlike a bare rfind (which returns -1
	    # and would chop the last character off the name).
	    root, _ = os.path.splitext(file_path)
	    return root + new_ext