Skip to content

Instantly share code, notes, and snippets.

@chrigl
Last active August 29, 2015 14:16
Show Gist options
  • Save chrigl/d917a312dca25536ab0d to your computer and use it in GitHub Desktop.
Save chrigl/d917a312dca25536ab0d to your computer and use it in GitHub Desktop.
Convert HTML to reStructuredText (proof of concept)
# -*- coding: utf-8 -*-
import functools
from io import BytesIO
from io import StringIO

from lxml import etree
# reST template for a code block. The blank line after the directive is what
# makes docutils treat the following lines as the directive's content instead
# of as its arguments. The ``%s`` slot receives the already-indented lines
# produced by convert_pre.
pre_str = """
.. code-block::

%s
"""

# reST template for an image; the slots are (image source, CSS class). The
# ``:class:`` line is indented so that it is parsed as an option of the
# directive rather than as a separate field list.
img_str = """
.. thumbnail:: %s
   :class: %s
"""

# Maps link text to its href. convert_link fills this in while converting;
# presumably it is consumed afterwards to emit the link targets (the consumer
# is not part of this section of the file).
links_to_add = {}
def require_text(fn):
    """Decorate a converter so that it is skipped for elements without text.

    :param fn: converter taking a single element and returning a string.
    :returns: a wrapper that returns ``None`` when ``elem.text`` is ``None``
        or empty, and the result of ``fn(elem)`` otherwise.
    """
    @functools.wraps(fn)  # keep the converter's own name and docstring
    def _require_text(elem):
        if not elem.text:
            return None
        return fn(elem)
    return _require_text
def convert_link(elem):
    """Render an ``<a>`` element as a named reST hyperlink reference.

    The link text and its ``href`` (empty string when the attribute is
    absent) are recorded in the module-level ``links_to_add`` mapping.

    :param elem: the anchor element.
    :returns: the link text wrapped in backquotes and followed by an
        underscore, or ``None`` when the anchor has no text of its own
        (for example when it only wraps an image).
    """
    text = elem.text
    if not text:
        # Nothing to reference: do not register a ``None`` key and do not
        # emit the literal "`None`_".
        return None
    links_to_add[text] = elem.get('href', '')
    return '`%s`_' % text
@require_text
def convert_strong(elem):
    """Render the text of a ``<strong>`` element as reST strong emphasis.

    :param elem: the element whose ``text`` is wrapped.
    :returns: the text surrounded by double asterisks.
    """
    marker = '**'
    return '%s%s%s' % (marker, elem.text, marker)
@require_text
def convert_pre(elem):
    """Render the text of a ``<pre>`` element as a reST code block.

    Every line of the text is prefixed with one space and the result is
    substituted into the ``pre_str`` template.

    :param elem: the ``<pre>`` element; only its ``text`` is used here.
    :returns: the filled-in ``pre_str`` template.
    """
    text = elem.text
    if isinstance(text, bytes):
        # Only byte strings need decoding. Calling ``.decode`` on text that
        # is already unicode raises AttributeError on Python 3 and fails on
        # Python 2 as soon as the text contains non-ASCII characters.
        text = text.decode('utf-8')
    # Iterating a StringIO yields the lines with their newline kept.
    indented = [' %s' % line for line in StringIO(text)]
    return pre_str % u''.join(indented)
def convert_br(elem):
    """Render a ``<br>`` element as a line break.

    :param elem: the ``<br>`` element (not inspected).
    :returns: a single newline character.
    """
    line_break = '\n'
    return line_break
def convert_br_tail(elem):
    """Return the text following a ``<br>`` element.

    Inside a ``<pre>`` parent the tail is prefixed with one space, the same
    prefix convert_pre puts in front of every line of the block.

    :param elem: the ``<br>`` element.
    :returns: the (possibly prefixed) tail, or ``''`` when there is none.
    """
    tail = elem.tail
    if not tail:
        return ''
    parent = elem.getparent()
    # A detached element has no parent; treat it like any non-<pre> context
    # instead of failing on ``None.tag``.
    if parent is not None and parent.tag == 'pre':
        return ' %s' % tail
    return tail
def convert_p(elem):
    """Return what follows a ``<p>`` element: a newline plus its tail.

    :param elem: the paragraph element.
    :returns: ``'\\n'`` followed by ``elem.tail``, or just ``'\\n'`` when
        the tail is ``None``.
    """
    tail = elem.tail
    return '\n' if tail is None else '\n%s' % tail
def convert_img(elem):
    """Render an ``<img>`` element through the ``img_str`` template.

    :param elem: the image element; its ``src`` and ``class`` attributes
        are substituted into ``img_str``.
    :returns: the filled-in template, or ``None`` when the element has no
        ``src`` attribute.
    """
    src = elem.get('src')
    if not src:
        # Without a source the directive would point at the literal "None".
        return None
    # NOTE: a missing ``class`` attribute is still formatted as "None".
    return img_str % (src, elem.get('class'))
@require_text
def convert_null(elem):
    """Pass an element's text through unchanged.

    recursive_runner uses this as the converter for tags that have no
    entry in ``converter_registry``.

    :param elem: the element whose ``text`` is returned.
    :returns: ``elem.text`` as-is.
    """
    text = elem.text
    return text
def convert_null_tail(elem):
    """Pass an element's tail through unchanged.

    recursive_runner uses this as the tail converter for tags that have no
    entry in ``converter_registry_tail``.

    :param elem: the element whose ``tail`` is returned.
    :returns: ``elem.tail`` as-is (``None`` stays ``None``).
    """
    return elem.tail
def convert_pre_tail(elem):
    """Return what follows a ``<pre>`` element: a newline plus its tail.

    :param elem: the ``<pre>`` element.
    :returns: ``'\\n'`` followed by ``elem.tail``; a ``None`` tail
        contributes nothing.
    """
    trailing = elem.tail
    if trailing is None:
        trailing = ''
    return '\n%s' % trailing
# Converters for an element's own text, keyed by tag name. Tags that are not
# listed here are handled by convert_null (see recursive_runner).
converter_registry = dict(
    a=convert_link,
    strong=convert_strong,
    pre=convert_pre,
    br=convert_br,
    img=convert_img,
)

# Converters for the text that follows an element, keyed by tag name. Tags
# that are not listed here are handled by convert_null_tail.
converter_registry_tail = dict(
    p=convert_p,
    br=convert_br_tail,
    pre=convert_pre_tail,
)
def get_root(html):
    """Parse an HTML document and return its root element.

    :param html: the HTML source, either as text or as a byte string.
    :returns: the root element of the tree built with ``etree.HTMLParser``.
    """
    # io.StringIO only accepts text, so byte strings are wrapped in a
    # BytesIO instead of being left to fail with a TypeError.
    if isinstance(html, bytes):
        stream = BytesIO(html)
    else:
        stream = StringIO(html)
    tree = etree.parse(stream, etree.HTMLParser())
    return tree.getroot()
def recursive_runner(elem):
    """Walk ``elem`` depth-first and yield the reST fragments it produces.

    For every element the result of its text converter comes first, then
    the fragments of its children in document order, then the result of its
    tail converter. Converters are looked up by tag in ``converter_registry``
    and ``converter_registry_tail``; empty or ``None`` results are skipped.

    :param elem: the element to start from.
    :returns: a generator of strings.
    """
    tag = elem.tag
    # lxml gives comments, processing instructions and entities a factory
    # function (e.g. etree.Comment) as their tag instead of a string. Their
    # text is not document content, so only their tail is emitted.
    if not callable(tag):
        head = converter_registry.get(tag, convert_null)(elem)
        if head:
            yield head
        for child in elem.iterchildren():
            # there is no yield from in python2
            for fragment in recursive_runner(child):
                yield fragment
    tail = converter_registry_tail.get(tag, convert_null_tail)(elem)
    if tail:
        yield tail
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment