Skip to content

Instantly share code, notes, and snippets.

@tisto
Last active August 29, 2015 14:21
Show Gist options
  • Save tisto/e5685c92b85dcf71cd81 to your computer and use it in GitHub Desktop.
Save tisto/e5685c92b85dcf71cd81 to your computer and use it in GitHub Desktop.
html_to_xxx transform & test
# -*- coding: utf-8 -*-
import os
import re
from lxml import etree
from Products.PortalTransforms.interfaces import ITransform
from zope.interface import implements
class HtmlToXXXComment:
    """Transform ``text/html`` input into ``text/html-xxx-comment`` output.

    The conversion itself is delegated to the XSLT stylesheet
    ``html_to_xxx_comment.xsl``, which is looked up in the directory of
    this module.  Which elements and attributes survive is decided by
    that stylesheet, not by this class.
    """
    implements(ITransform)

    __name__ = "html_to_xxx_comment"
    output = "text/html-xxx-comment"

    def __init__(self, name=None, inputs=('text/html',), tab_width=4):
        """Set up the transform configuration.

        :param name: optional name overriding the class-level ``__name__``.
        :param inputs: input MIME type(s) stored in the configuration.
        :param tab_width: number of spaces for a tab in the input; stored
            in the configuration.
        """
        # Store the caller's value; it used to be hard-coded to 4, which
        # silently ignored the ``tab_width`` argument.
        self.config = {'inputs': inputs, 'tab_width': tab_width}
        self.config_metadata = {
            'inputs':
                ('list',
                 'Inputs',
                 'Input(s) MIME type. Change with care.'),
            'tab_width':
                ('string',
                 'Tab width',
                 'Number of spaces for a tab in the input'),
        }
        if name:
            self.__name__ = name

    def name(self):
        """Return the name of this transform."""
        return self.__name__

    def __getattr__(self, attr):
        """Expose the entries of ``self.config`` as instance attributes.

        :raises AttributeError: if ``attr`` is not a configuration key.
        """
        # Look the config up through __dict__: ``self.config`` would call
        # __getattr__ again and recurse without bound whenever ``config``
        # has not been set yet (instance created without __init__, e.g.
        # while unpickling or copying).
        config = self.__dict__.get('config', {})
        if attr in config:
            return config[attr]
        raise AttributeError(attr)

    def convert(self, html, **kwargs):
        """Run ``html`` through the XSLT stylesheet and serialize the result.

        :param html: markup fragment to convert; it is wrapped in an
            ``<html>`` element before parsing.
        :param kwargs: accepted but not used by this implementation.
        :returns: the serialized result tree with the ``html`` tags
            removed.  A falsy serialization result is returned unchanged.
        """
        # make the XSL stylesheet available
        cleanup_html_xslt = os.path.abspath(
            os.path.join(os.path.dirname(__file__),
                         'html_to_xxx_comment.xsl'))
        # XSLT transformations always expect a root object
        html = "<html>%s</html>" % html
        # xml -> elementtree
        # try to recover if not valid html, remove blanks between two tags
        parser = etree.XMLParser(recover=True, remove_blank_text=True)
        htmltree = etree.XML(html, parser)
        # parse the xslt stylesheet
        styletree = etree.parse(cleanup_html_xslt)
        # load the xslt stylesheet
        transform = etree.XSLT(styletree)
        # transform the html tree
        resulttree = transform(htmltree)
        # drop element text that consists of whitespace only, e.g.
        # <p>   </p> => <p/>; text with real content is left untouched
        for element in resulttree.iter("*"):
            if element.text is not None and not element.text.strip():
                element.text = None
        cleaned_html = etree.tostring(resulttree)
        if cleaned_html:
            # remove the html tags (opening, closing and self-closing form)
            p = re.compile(r'<.?html?.>')
            cleaned_html = p.sub('', cleaned_html)
        return cleaned_html
def register():
    """Create and return an ``HtmlToXXXComment`` with its default arguments."""
    transform = HtmlToXXXComment()
    return transform
# -*- coding: utf-8 -*-
import unittest2 as unittest
class HtmlToxxxCommentUnitTest(unittest.TestCase):
    """Unit tests for the ``convert`` method of ``HtmlToXXXComment``.

    Every test feeds a markup fragment to ``convert`` and compares the
    returned string with the expected markup.
    """

    def setUp(self):
        self.transform = self._makeOne()

    def _getTargetClass(self):
        # Imported lazily so that an import problem is reported per test.
        # The class is spelled ``HtmlToXXXComment`` where it is defined;
        # the previous ``HtmlToxxxComment`` spelling could not be imported.
        from xxx.util.transform.html_to_xxx_comment import (
            HtmlToXXXComment)
        return HtmlToXXXComment

    def _makeOne(self, *args, **kw):
        return self._getTargetClass()(*args, **kw)

    def test_empty(self):
        html = ""
        self.assertEqual(self.transform.convert(html), "")

    # Valid Elements

    def test_keep_h2_headlines(self):
        html = "<h2>Headlines are good!</h2>"
        self.assertEqual(
            self.transform.convert(html),
            "<h2>Headlines are good!</h2>")

    def test_keep_paragraphs(self):
        html = "<p>paragraphs are good!</p>"
        self.assertEqual(
            self.transform.convert(html),
            "<p>paragraphs are good!</p>")

    def test_keep_spans(self):
        html = "<span>spans are good!</span>"
        self.assertEqual(
            self.transform.convert(html),
            "<span>spans are good!</span>")

    def test_keep_span_style_attribue(self):
        html = '<span style="color: red;">spans are good!</span>'
        self.assertEqual(
            self.transform.convert(html),
            '<span style="color: red;">spans are good!</span>')

    def test_keep_italic(self):
        html = "<i>italic is nice!</i>"
        self.assertEqual(
            self.transform.convert(html),
            "<i>italic is nice!</i>")

    def test_keep_strong(self):
        html = "<strong>strong is good!</strong>"
        self.assertEqual(
            self.transform.convert(html),
            "<strong>strong is good!</strong>")

    def test_keep_bold(self):
        html = "<b>boldness is also good</b>"
        self.assertEqual(
            self.transform.convert(html),
            "<b>boldness is also good</b>")

    def test_keep_emphasis(self):
        html = "<em>emphasis is good!</em>"
        self.assertEqual(
            self.transform.convert(html),
            "<em>emphasis is good!</em>")

    def test_keep_sup(self):
        html = "<sup>sup is good!</sup>"
        self.assertEqual(
            self.transform.convert(html),
            "<sup>sup is good!</sup>")

    def test_keep_sub(self):
        html = "<sub>sub is good!</sub>"
        self.assertEqual(
            self.transform.convert(html),
            "<sub>sub is good!</sub>")

    def test_keep_links(self):
        html = '<a href="http://www.xxx.de">xxx rules!</a>'
        self.assertEqual(
            self.transform.convert(html),
            '<a href="http://www.xxx.de">xxx rules!</a>')

    def test_keep_iframes(self):
        html = '<iframe src="http://www.xxx.de">xxx IFrame</iframe>'
        self.assertEqual(
            self.transform.convert(html),
            '<iframe src="http://www.xxx.de">xxx IFrame</iframe>'
        )

    def test_keep_images(self):
        html = '<img src="logo.png"/>'
        self.assertEqual(
            self.transform.convert(html),
            '<img src="logo.png"/>'
        )

    def test_keep_images_with_alt_attribute(self):
        html = '<img src="logo.png" alt="logo"/>'
        self.assertEqual(
            self.transform.convert(html),
            '<img src="logo.png" alt="logo"/>'
        )

    # NOTE(review): input and expectation are identical to
    # test_keep_images_with_alt_attribute; no separate attribute is
    # exercised here. Kept so that the set of test names stays stable.
    def test_keep_images_with_logo_attribute(self):
        html = '<img src="logo.png" alt="logo"/>'
        self.assertEqual(
            self.transform.convert(html),
            '<img src="logo.png" alt="logo"/>'
        )

    def test_keep_images_with_title_attribute(self):
        html = '<img src="logo.png" title="logo"/>'
        self.assertEqual(
            self.transform.convert(html),
            '<img src="logo.png" title="logo"/>'
        )

    def test_keep_images_with_width_attribute(self):
        html = '<img src="logo.png" width="100"/>'
        self.assertEqual(
            self.transform.convert(html),
            '<img src="logo.png" width="100"/>'
        )

    def test_keep_images_with_height_attribute(self):
        html = '<img src="logo.png" height="100"/>'
        self.assertEqual(
            self.transform.convert(html),
            '<img src="logo.png" height="100"/>'
        )

    def test_keep_images_with_align_attribute(self):
        html = '<img src="logo.png" align="center"/>'
        self.assertEqual(
            self.transform.convert(html),
            '<img src="logo.png" align="center"/>'
        )

    # Ignores

    def test_ignore_headlines_other_than_h1_or_h2(self):
        html = '<h3>foo</h3><h4>bar</h4><h5>baz</h5><h6>boz</h6><p>Keep me</p>'
        self.assertEqual(
            self.transform.convert(html),
            '<p>foo</p><p>bar</p><p>baz</p><p>boz</p><p>Keep me</p>')

    def test_ignore_top_level_elements(self):
        html = '<html><p>Keep me</p></html>'
        self.assertEqual(
            self.transform.convert(html),
            '<p>Keep me</p>')

    def test_multiple_elements(self):
        html = "<p>foo</p><p>bar</p>"
        self.assertEqual(
            self.transform.convert(html),
            "<p>foo</p><p>bar</p>")

    def test_transform_brackets(self):
        html = "<p>>>>foo<<<</p>"
        self.assertEqual(
            self.transform.convert(html),
            "<p>&gt;&gt;&gt;foo</p>")
        # XXX: "<" should be transformed into "&lt;" and not ignored. Since
        # these are filtered by XSLT this is hard to debug/fix.

    # Nested

    def test_nested_elements(self):
        html = "<div>foo<p>Keep me</p></div>"
        self.assertEqual(
            self.transform.convert(html),
            '<p>foo<p>Keep me</p></p>')

    def test_strip_outer_whitespace(self):
        html = ' <p>Keep me</p> '
        self.assertEqual(
            self.transform.convert(html),
            '<p>Keep me</p>')

    def test_html_single(self):
        html = "<html/>"
        self.assertEqual(self.transform.convert(html), "")

    def test_html_single_with_blank(self):
        html = "<html />"
        self.assertEqual(self.transform.convert(html), "")

    def test_html_full_tag(self):
        html = "<html></html>"
        self.assertEqual(self.transform.convert(html), "")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment