Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
# XXX: Make an HTML-level cleaner that converts:
# - <span style="font-family: Courier New, Courier, monospace;"> to <code>
# - <span style="font-family: Helvetica Neue, Arial, Helvetica, sans-serif;"> to nothing
# - &nbsp; to a literal space
# - <div> or <p> containing only <code> & <br> into a <pre>
#
# See if that cleans up ~ & $ escaping.
#
# If not, do a post-import Markdown cleanup
def clean_up_fonts(tree):
    """Normalize font-styling ``<span>`` elements under *tree*, in place.

    For every ``<span>`` matched by ``tree.findall('.//span')``:

    - the ``style`` attribute is removed (whatever its value);
    - if the removed style contained the word ``monospace`` the element is
      retagged as ``<code>``;
    - non-breaking spaces (U+00A0) in the element's own ``text`` are
      replaced by ordinary spaces.

    Only ``span.text`` is rewritten; the element's tail and the text of
    nested elements are left alone.

    Args:
        tree: an lxml/ElementTree element or tree supporting ``findall``.
    """
    for span in tree.findall('.//span'):
        style = span.attrib.pop('style', '')
        if 'monospace' in style:
            span.tag = 'code'
        if span.text:
            # Unicode literals on purpose: on Python 2 a plain '\xa0' is a
            # byte string, and mixing it with the unicode text that holds a
            # real non-breaking space triggers an implicit ASCII decode and
            # a UnicodeDecodeError.
            span.text = span.text.replace(u'\xa0', u' ')
def extract_code_block(node):
    """Return the code text held by *node*, or None if it is not pure code.

    *node* qualifies only when it has no leading text of its own, every
    child is either ``<br>`` or ``<code>``, and no child is followed by
    non-whitespace tail text.  Each ``<br>`` contributes a newline and each
    ``<code>`` contributes its own ``text``; the pieces are concatenated in
    document order.

    Only the direct ``text`` of each ``<code>`` child is used; elements
    nested inside a ``<code>`` are not inspected.

    Args:
        node: an lxml/ElementTree element.

    Returns:
        The joined string (possibly empty, e.g. for a childless node), or
        None when *node* contains anything other than code and line breaks.
    """
    if node.text:
        return None
    pieces = []
    for child in node:
        # Visible text after a child means this is prose, not a code block.
        if child.tail and child.tail.strip():
            return None
        tag = child.tag
        if tag == 'br':
            pieces.append('\n')
        elif tag == 'code':
            # An empty <code></code> has text None, which would make the
            # final join raise TypeError; treat it as an empty string.
            pieces.append(child.text or '')
        else:
            return None
    return ''.join(pieces)
def clean_up_pre(tree):
    """Turn ``<p>``/``<div>`` elements that hold only code into ``<pre>``.

    Every ``<p>`` and ``<div>`` under *tree* is passed to
    ``extract_code_block``.  When that yields a non-empty string, the
    element is emptied (children and attributes removed), its text is set
    to the extracted code and it is retagged as ``<pre>``.  Elements for
    which the helper returns None or an empty string are left untouched.

    Args:
        tree: an lxml/ElementTree element or tree supporting ``findall``.
    """
    # Both searches are evaluated up front, so retagging elements inside
    # the loop cannot disturb the iteration.
    candidates = tree.findall('.//p') + tree.findall('.//div')
    for node in candidates:
        code = extract_code_block(node)
        if not code:
            continue
        # clear() resets the tail as well as children and attributes; keep
        # the tail so text following the element is not lost.
        tail = node.tail
        node.clear()
        node.tail = tail
        node.text = code
        node.tag = 'pre'
def clean_up_link_whitespace(tree):
    """Move trailing whitespace out of ``<a>`` elements, in place.

    For each ``<a>`` under *tree* that contains only text, whitespace at
    the end of that text is stripped from the link and prepended to the
    element's tail, so ``<a>link </a>next`` becomes ``<a>link</a> next``.

    Anchors with no text, or with child elements, are left unchanged.

    Args:
        tree: an lxml/ElementTree element or tree supporting ``findall``.
    """
    for a in tree.findall('.//a'):
        # When the anchor has children, whitespace ending a.text sits
        # between the text and the first child (e.g. "click <b>here</b>");
        # moving it past the closing tag would glue the words together.
        if not a.text or len(a):
            continue
        stripped = a.text.rstrip()
        trailing = a.text[len(stripped):]
        if trailing:
            a.text = stripped
            a.tail = trailing + (a.tail or '')
def clean_up_blogger(content):
    """Clean up an HTML fragment and return the re-serialized HTML.

    The content is parsed with lxml's HTML parser, then passed through
    ``clean_up_fonts``, ``clean_up_pre`` and ``clean_up_link_whitespace``
    (in that order) before being serialized again.

    Args:
        content: the HTML text to clean.

    Returns:
        *content* itself, unchanged, when it is empty or whitespace-only;
        otherwise the pretty-printed HTML serialization of the parsed
        document's root element, as produced by ``etree.tostring``.
        NOTE(review): on Python 3 ``etree.tostring`` returns bytes unless an
        ``encoding`` is passed -- confirm what callers expect.
    """
    if not content.strip():
        return content
    # The StringIO module only exists on Python 2; fall back to io.StringIO
    # so the import does not fail on Python 3.
    try:
        from StringIO import StringIO
    except ImportError:
        from io import StringIO
    from lxml import etree
    parser = etree.HTMLParser()
    tree = etree.parse(StringIO(content), parser)
    clean_up_fonts(tree)
    clean_up_pre(tree)
    clean_up_link_whitespace(tree)
    return etree.tostring(tree.getroot(), pretty_print=True, method='html')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment