Last active
August 29, 2015 14:21
-
-
Save tisto/e5685c92b85dcf71cd81 to your computer and use it in GitHub Desktop.
html_to_xxx transform & test
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import os | |
import re | |
from lxml import etree | |
from Products.PortalTransforms.interfaces import ITransform | |
from zope.interface import implements | |
class HtmlToXXXComment:
    """Portal transform that cleans up HTML through an XSLT stylesheet.

    The input markup is wrapped in an ``<html>`` root element, parsed with a
    recovering XML parser, run through ``html_to_xxx_comment.xsl`` (expected
    to sit next to this module) and serialised again with the ``<html>``
    wrapper removed.

    Configuration values stored in ``self.config`` are also readable as
    plain attributes (``transform.inputs``, ``transform.tab_width``) via
    ``__getattr__``.
    """

    implements(ITransform)

    __name__ = "html_to_xxx_comment"
    output = "text/html-xxx-comment"

    def __init__(self, name=None, inputs=('text/html',), tab_width=4):
        """Set up the transform configuration.

        :param name: optional name overriding the class-level ``__name__``.
        :param inputs: tuple of input MIME types.
        :param tab_width: number of spaces for a tab in the input.
        """
        # Store the tab width the caller asked for instead of a fixed 4.
        self.config = {'inputs': inputs, 'tab_width': tab_width}
        self.config_metadata = {
            'inputs':
                ('list',
                 'Inputs',
                 'Input(s) MIME type. Change with care.'),
            'tab_width':
                ('string',
                 'Tab width',
                 'Number of spaces for a tab in the input'),
        }
        if name:
            self.__name__ = name

    def name(self):
        """Return the name of this transform."""
        return self.__name__

    def __getattr__(self, attr):
        """Expose the entries of ``self.config`` as attributes.

        Only invoked when regular attribute lookup fails.

        :raises AttributeError: if ``attr`` is not a configuration key.
        """
        # Read ``config`` straight from the instance dict: going through
        # ``self.config`` would re-enter ``__getattr__`` and recurse without
        # end whenever ``config`` has not been set yet.
        config = self.__dict__.get('config', {})
        if attr in config:
            return config[attr]
        raise AttributeError(attr)

    def convert(self, html, **kwargs):
        """Run ``html`` through the XSLT stylesheet and return the result.

        :param html: HTML markup (fragment or document) as a string.
        :param kwargs: accepted but not used.
        :returns: the serialised result with ``<html>`` wrapper tags removed;
            if serialisation yields a falsy value it is returned unchanged.
        """
        # make the XSL stylesheet available
        cleanup_html_xslt = os.path.abspath(
            os.path.join(os.path.dirname(__file__),
                         'html_to_xxx_comment.xsl'))

        # XSLT transformations always expect a root object
        html = "<html>%s</html>" % html

        # xml -> elementtree
        # try to recover if not valid html, remove blanks between two tags
        parser = etree.XMLParser(recover=True, remove_blank_text=True)
        htmltree = etree.XML(html, parser)

        # parse the xslt stylesheet
        styletree = etree.parse(cleanup_html_xslt)

        # load the xslt stylesheet
        transform = etree.XSLT(styletree)

        # transform the html tree
        resulttree = transform(htmltree)

        # Drop text nodes that consist of whitespace only, e.g.
        # <p>   </p> => <p/>. Text with real content is left untouched,
        # including its leading/trailing blanks.
        for element in resulttree.iter("*"):
            if element.text is not None and not element.text.strip():
                element.text = None

        cleaned_html = etree.tostring(resulttree)
        if cleaned_html:
            # Remove the <html>, </html> and <html/> wrapper tags. The
            # pattern is deliberately loose (optional "l", any single
            # character before ">"), so look-alike tags are removed too.
            p = re.compile(r'<.?html?.>')
            cleaned_html = p.sub('', cleaned_html)
        return cleaned_html
def register():
    """Create and return a new ``HtmlToXXXComment`` with default settings."""
    transform = HtmlToXXXComment()
    return transform
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import unittest2 as unittest | |
class HtmlToxxxCommentUnitTest(unittest.TestCase):
    """Unit tests for the ``convert`` method of the HTML cleanup transform.

    Each test feeds a small HTML snippet to the transform and compares the
    returned markup with the expected string.
    """

    def setUp(self):
        self.transform = self._makeOne()

    def _getTargetClass(self):
        # Imported lazily so that an import problem shows up as a failure of
        # the individual tests. The name must match the transform class,
        # which is spelled ``HtmlToXXXComment``.
        from xxx.util.transform.html_to_xxx_comment import HtmlToXXXComment
        return HtmlToXXXComment

    def _makeOne(self, *args, **kw):
        # Instantiate the class under test with the given arguments.
        return self._getTargetClass()(*args, **kw)

    def test_empty(self):
        html = ""
        self.assertEqual(self.transform.convert(html), "")

    # Valid Elements

    def test_keep_h2_headlines(self):
        html = "<h2>Headlines are good!</h2>"
        self.assertEqual(
            self.transform.convert(html),
            "<h2>Headlines are good!</h2>")

    def test_keep_paragraphs(self):
        html = "<p>paragraphs are good!</p>"
        self.assertEqual(
            self.transform.convert(html),
            "<p>paragraphs are good!</p>")

    def test_keep_spans(self):
        html = "<span>spans are good!</span>"
        self.assertEqual(
            self.transform.convert(html),
            "<span>spans are good!</span>")

    def test_keep_span_style_attribue(self):
        html = '<span style="color: red;">spans are good!</span>'
        self.assertEqual(
            self.transform.convert(html),
            '<span style="color: red;">spans are good!</span>')

    def test_keep_italic(self):
        html = "<i>italic is nice!</i>"
        self.assertEqual(
            self.transform.convert(html),
            "<i>italic is nice!</i>")

    def test_keep_strong(self):
        html = "<strong>strong is good!</strong>"
        self.assertEqual(
            self.transform.convert(html),
            "<strong>strong is good!</strong>")

    def test_keep_bold(self):
        html = "<b>boldness is also good</b>"
        self.assertEqual(
            self.transform.convert(html),
            "<b>boldness is also good</b>")

    def test_keep_emphasis(self):
        html = "<em>emphasis is good!</em>"
        self.assertEqual(
            self.transform.convert(html),
            "<em>emphasis is good!</em>")

    def test_keep_sup(self):
        html = "<sup>sup is good!</sup>"
        self.assertEqual(
            self.transform.convert(html),
            "<sup>sup is good!</sup>")

    def test_keep_sub(self):
        html = "<sub>sub is good!</sub>"
        self.assertEqual(
            self.transform.convert(html),
            "<sub>sub is good!</sub>")

    def test_keep_links(self):
        html = '<a href="http://www.xxx.de">xxx rules!</a>'
        self.assertEqual(
            self.transform.convert(html),
            '<a href="http://www.xxx.de">xxx rules!</a>')

    def test_keep_iframes(self):
        html = '<iframe src="http://www.xxx.de">xxx IFrame</iframe>'
        self.assertEqual(
            self.transform.convert(html),
            '<iframe src="http://www.xxx.de">xxx IFrame</iframe>'
        )

    def test_keep_images(self):
        html = '<img src="logo.png"/>'
        self.assertEqual(
            self.transform.convert(html),
            '<img src="logo.png"/>'
        )

    def test_keep_images_with_alt_attribute(self):
        html = '<img src="logo.png" alt="logo"/>'
        self.assertEqual(
            self.transform.convert(html),
            '<img src="logo.png" alt="logo"/>'
        )

    def test_keep_images_with_logo_attribute(self):
        # NOTE(review): identical to test_keep_images_with_alt_attribute;
        # the method name suggests a different attribute was intended.
        html = '<img src="logo.png" alt="logo"/>'
        self.assertEqual(
            self.transform.convert(html),
            '<img src="logo.png" alt="logo"/>'
        )

    def test_keep_images_with_title_attribute(self):
        html = '<img src="logo.png" title="logo"/>'
        self.assertEqual(
            self.transform.convert(html),
            '<img src="logo.png" title="logo"/>'
        )

    def test_keep_images_with_width_attribute(self):
        html = '<img src="logo.png" width="100"/>'
        self.assertEqual(
            self.transform.convert(html),
            '<img src="logo.png" width="100"/>'
        )

    def test_keep_images_with_height_attribute(self):
        html = '<img src="logo.png" height="100"/>'
        self.assertEqual(
            self.transform.convert(html),
            '<img src="logo.png" height="100"/>'
        )

    def test_keep_images_with_align_attribute(self):
        html = '<img src="logo.png" align="center"/>'
        self.assertEqual(
            self.transform.convert(html),
            '<img src="logo.png" align="center"/>'
        )

    # Ignores

    def test_ignore_headlines_other_than_h1_or_h2(self):
        # h3-h6 headlines are expected to come back as plain paragraphs.
        html = '<h3>foo</h3><h4>bar</h4><h5>baz</h5><h6>boz</h6><p>Keep me</p>'
        self.assertEqual(
            self.transform.convert(html),
            '<p>foo</p><p>bar</p><p>baz</p><p>boz</p><p>Keep me</p>')

    def test_ignore_top_level_elements(self):
        html = '<html><p>Keep me</p></html>'
        self.assertEqual(
            self.transform.convert(html),
            '<p>Keep me</p>')

    def test_multiple_elements(self):
        html = "<p>foo</p><p>bar</p>"
        self.assertEqual(
            self.transform.convert(html),
            "<p>foo</p><p>bar</p>")

    def test_transform_brackets(self):
        html = "<p>>>>foo<<<</p>"
        self.assertEqual(
            self.transform.convert(html),
            "<p>>>>foo</p>")
        # XXX: "<" should be transformed into its escaped form (presumably
        # "&lt;") and not ignored. Since these are filtered by XSLT this is
        # hard to debug/fix.
        # NOTE(review): the literals in this test may have lost their entity
        # encoding (&gt; / &lt;) when the file was published -- confirm
        # against the real serialiser output.

    # Nested

    def test_nested_elements(self):
        # The outer <div> is expected to come back as a <p>.
        html = "<div>foo<p>Keep me</p></div>"
        self.assertEqual(
            self.transform.convert(html),
            '<p>foo<p>Keep me</p></p>')

    def test_strip_outer_whitespace(self):
        html = ' <p>Keep me</p> '
        self.assertEqual(
            self.transform.convert(html),
            '<p>Keep me</p>')

    def test_html_single(self):
        html = "<html/>"
        self.assertEqual(self.transform.convert(html), "")

    def test_html_single_with_blank(self):
        html = "<html />"
        self.assertEqual(self.transform.convert(html), "")

    def test_html_full_tag(self):
        html = "<html></html>"
        self.assertEqual(self.transform.convert(html), "")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment