Last active
August 29, 2015 14:16
-
-
Save chrigl/d917a312dca25536ab0d to your computer and use it in GitHub Desktop.
convert html to rst POC
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import functools
from io import BytesIO
from io import StringIO

from lxml import etree
# reST template for a <pre> block. The single %s receives the already
# indented code lines. The blank line after the directive is required:
# without it docutils reads the first content line as a directive argument.
pre_str = """
.. code-block::

%s
"""

# reST template for an <img>. Filled with (src, class). The option line is
# indented so that it belongs to the directive instead of starting a
# stand-alone field list.
img_str = """
.. thumbnail:: %s
   :class: %s
"""

# Link targets collected by convert_link: link text -> href.
# Nothing in this section reads the mapping back.
links_to_add = {}
def require_text(fn):
    """Decorate a converter so it only runs for elements that have text.

    :param fn: converter taking a single element and returning a string.
    :returns: a wrapper that returns ``None`` when ``elem.text`` is falsy
        (``None`` or empty) and otherwise the result of ``fn(elem)``.
    """
    # wraps() keeps the converter's __name__/__doc__ on the wrapper, which
    # makes registry contents and tracebacks readable.
    @functools.wraps(fn)
    def _require_text(elem):
        if not elem.text:
            return None
        return fn(elem)

    return _require_text
def convert_link(elem):
    """Convert an ``<a>`` element into a named reST hyperlink reference.

    The link text is recorded in ``links_to_add`` together with the
    element's ``href`` (empty string when the attribute is missing).

    :param elem: the ``<a>`` element.
    :returns: ```text`_`` for a link with text, ``None`` when the element
        has no direct text.
    """
    label = elem.text
    if not label:
        # Same guard the require_text converters use. Without it a link
        # that has no direct text would register a ``None`` key and emit
        # the literal reference `None`_.
        return None
    links_to_add[label] = elem.get('href', '')
    return '`%s`_' % label
@require_text
def convert_strong(elem):
    """Render the text of a ``<strong>`` element as reST bold markup."""
    bold_text = elem.text
    return '**%s**' % bold_text
@require_text
def convert_pre(elem):
    """Convert the text of a ``<pre>`` element into a reST code block.

    Every line of the element's text is prefixed with one space and the
    result is inserted into ``pre_str``.

    :param elem: the ``<pre>`` element; the decorator guarantees it has text.
    :returns: the filled-in ``pre_str`` template.
    """
    text = elem.text
    # Only byte strings need decoding. Calling .decode() on text that is
    # already unicode fails on Python 3 (no such method) and on Python 2
    # for non-ASCII content (implicit ASCII encode).
    if isinstance(text, bytes):
        text = text.decode('utf-8')
    # Iterating a StringIO yields the lines with their line endings intact.
    indented = u''.join(' %s' % line for line in StringIO(text))
    return pre_str % indented
def convert_br(elem):
    """Convert a ``<br>`` element into a line break; ``elem`` is not inspected."""
    line_break = '\n'
    return line_break
def convert_br_tail(elem):
    """Return the text that follows a ``<br>`` element.

    Inside a ``<pre>`` parent the tail is prefixed with one space; anywhere
    else it is returned unchanged. A missing or empty tail yields ``''``.
    """
    # The parent is looked up before the tail is examined, as before.
    inside_pre = elem.getparent().tag == 'pre'
    trailing = elem.tail
    if not trailing:
        return ''
    if inside_pre:
        return ' %s' % trailing
    return trailing
def convert_p(elem):
    """Return a newline followed by the tail text of a ``<p>`` element.

    When the element has no tail (``None``) only the newline is returned.
    """
    following = elem.tail
    return '\n' if following is None else '\n%s' % following
def convert_img(elem):
    """Convert an ``<img>`` element into a thumbnail directive via ``img_str``.

    The template is filled with the element's ``src`` and ``class``
    attributes, in that order.
    """
    # NOTE(review): both lookups pass no default, so a missing attribute
    # presumably ends up as the text "None" in the output -- confirm.
    source = elem.get('src')
    css_class = elem.get('class')
    return img_str % (source, css_class)
@require_text
def convert_null(elem):
    """Fallback converter: pass the element's text through unchanged."""
    plain_text = elem.text
    return plain_text
def convert_null_tail(elem):
    """Fallback tail converter: return the element's tail as it is.

    The result is ``None`` when the element has no tail.
    """
    # A missing tail is already None, so no separate branch is needed.
    return elem.tail
def convert_pre_tail(elem):
    """Return a newline plus whatever text follows a ``<pre>`` element.

    Without a tail (``None``) the result is just the newline.
    """
    after_block = elem.tail
    if after_block is None:
        return '\n'
    return '\n%s' % after_block
# Tag name -> converter for an element's own text. Tags that are not listed
# are handled by convert_null (see recursive_runner).
converter_registry = {
    'a': convert_link,        # hyperlink reference
    'br': convert_br,         # line break
    'img': convert_img,       # thumbnail directive
    'pre': convert_pre,       # code block
    'strong': convert_strong,  # bold text
}
# Tag name -> converter for the text following an element (its tail). Tags
# that are not listed are handled by convert_null_tail (see recursive_runner).
converter_registry_tail = {
    'br': convert_br_tail,
    'p': convert_p,
    'pre': convert_pre_tail,
}
def get_root(html):
    """Parse an HTML document and return the root element of its tree.

    :param html: the HTML source as text or as a byte string.
    :returns: the root element produced by lxml's ``HTMLParser``.
    """
    parser = etree.HTMLParser()
    # io.StringIO only accepts text, so a byte string (e.g. a plain ``str``
    # on Python 2) is wrapped in BytesIO instead of raising TypeError.
    # NOTE(review): for bytes input the encoding is left to the parser --
    # confirm that this suits the documents being converted.
    if isinstance(html, bytes):
        source = BytesIO(html)
    else:
        source = StringIO(html)
    return etree.parse(source, parser).getroot()
def recursive_runner(elem):
    """Walk ``elem`` depth-first and yield the converted text fragments.

    For every element the generator yields, in this order: the result of
    the element's converter from ``converter_registry`` (``convert_null``
    when the tag is not registered), the fragments of all children, and
    the result of its tail converter from ``converter_registry_tail``
    (``convert_null_tail`` when the tag is not registered). Falsy results
    (``None`` or ``''``) are skipped.

    :param elem: the element to start from.
    :returns: a generator of strings.
    """
    tag = elem.tag
    # lxml reports comments, processing instructions and entities with a
    # callable ``tag`` (their factory function) instead of a tag name.
    # Their text is not document content, so only their tail is emitted.
    if not callable(tag):
        head = converter_registry.get(tag, convert_null)(elem)
        if head:
            yield head
        for child in elem.iterchildren():
            # there is no yield from in python2
            for fragment in recursive_runner(child):
                yield fragment
    tail = converter_registry_tail.get(tag, convert_null_tail)(elem)
    if tail:
        yield tail
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment