Skip to content

Instantly share code, notes, and snippets.

@scanny
Created March 2, 2015 20:36
Show Gist options
  • Save scanny/4476085aa5f57b8b4fc0 to your computer and use it in GitHub Desktop.
Save scanny/4476085aa5f57b8b4fc0 to your computer and use it in GitHub Desktop.
Code to translate reStructuredText into a Microsoft Word document using python-docx
# encoding: utf-8
"""
Helper objects for rendering to .docx format.
"""
from __future__ import (
absolute_import, division, print_function, unicode_literals
)
from docutils import core
from lxml import etree
class RstRenderer(object):
    """
    Service class that knows how to render a reStructuredText string into
    a python-docx block container (e.g. a Document object).

    *blkcntnr* is the object that receives the rendered content. It must
    provide ``add_paragraph(style=...)`` returning an object that provides
    ``add_run(text, style)``. *rst* is the reStructuredText source, or
    |None| to render nothing. *style_overrides* is an optional dict mapping
    one or more of the style keys ('h1', 'p', 'li', 'lc', 'b', 'i') to the
    style name to use in place of the default for that key.
    """

    # Default style name for each style key; individual entries can be
    # replaced per-instance via the *style_overrides* constructor argument.
    _DEFAULT_STYLES = {
        'h1': 'Heading 1',
        'p': 'Body Text',
        'li': 'List Bullet',
        'lc': 'List Continue',
        'b': 'Strong',
        'i': 'Emphasis',
    }

    # Maps the tag of an inline element to the key of its character style.
    _INLINE_STYLE_KEYS = {
        'strong': 'b',
        'emphasis': 'i',
    }

    def __init__(self, blkcntnr, rst, style_overrides=None):
        self._blkcntnr = blkcntnr
        self._rst = rst
        # A fresh dict per instance; a `{}` default in the signature would
        # be one dict object shared by every call that omits the argument.
        self._style_overrides = (
            {} if style_overrides is None else style_overrides
        )

    def render(self):
        """
        Parse the reStructuredText in *rst* and render it into *blkcntnr* as
        paragraphs, bullets, etc., including recognizing and rendering bold
        and italic runs within block elements.
        """
        self._render_container(self._rst_etree)

    @property
    def _styles(self):
        """
        The dict providing lookup of style name by style key for this RST
        document. Built on first access by applying the style overrides to
        a copy of the defaults, then cached on the instance.
        """
        if not hasattr(self, '_styles_'):
            styles = dict(self._DEFAULT_STYLES)
            styles.update(self._style_overrides)
            self._styles_ = styles
        return self._styles_

    def _render_container(self, container):
        """
        Render each child element of *container* in document order.

        `section` elements are descended into recursively. Raises
        |NotImplementedError| on a child element having any tag other than
        `section`, `title`, `paragraph`, or `bullet_list`.
        """
        for element in container:
            tag = element.tag
            if tag == 'section':
                self._render_container(element)
            elif tag == 'title':
                self._render_paragraph(element, self._styles['h1'])
            elif tag == 'paragraph':
                self._render_paragraph(element, self._styles['p'])
            elif tag == 'bullet_list':
                self._render_bullet_list(element)
            else:
                raise NotImplementedError('unrecognized tag %s' % tag)

    @property
    def _rst_etree(self):
        """
        Return the root element of a reStructuredText XML document produced
        by converting *rst* to XML and then parsing that XML using lxml.
        Each newline in text content is replaced with a single space. The
        XML is converted and parsed anew on each access.
        """
        root_element = etree.fromstring(self._rst_xml)
        # iter() visits the root and every descendant, so all text and tail
        # content in the tree is normalized.
        for elm in root_element.iter():
            if elm.text is not None:
                elm.text = elm.text.replace('\n', ' ')
            if elm.tail is not None:
                elm.tail = elm.tail.replace('\n', ' ')
        return root_element

    @property
    def _rst_xml(self):
        """
        Bytes containing XML corresponding to the reStructuredText in *rst*.
        The XML vocabulary is a simple one using tags like `paragraph` and
        `strong`. An empty `document` element is returned when *rst* is
        |None|.
        """
        if self._rst is None:
            # bytes, to agree with the documented return type
            return b'<document/>'
        return core.publish_string(source=self._rst, writer_name='xml')

    def _render_bullet_list(self, bullet_list):
        """
        Add a bullet to *blkcntnr* for each list item in *bullet_list*.

        The first child of a list item is rendered in the 'li' (bullet)
        style; each child after that is rendered in the 'lc' (list
        continuation) style.
        """
        for list_item in bullet_list:
            for idx, para in enumerate(list_item):
                style_key = 'li' if idx == 0 else 'lc'
                self._render_paragraph(para, self._styles[style_key])

    def _render_paragraph(self, para, style):
        """
        Add a new paragraph having *style* to *blkcntnr* containing the
        content in the `paragraph` element *para*. Create appropriate runs
        for text having strong and emphasis inline formatting. The text of
        any other inline element is added as a run having no character
        style.
        """
        paragraph = self._blkcntnr.add_paragraph(style=style)
        if para.text is not None:
            paragraph.add_run(para.text)
        for child in para:
            if child.text is not None:
                style_key = self._INLINE_STYLE_KEYS.get(child.tag)
                if style_key is None:
                    # Not a recognized inline tag: keep the text rather
                    # than fail looking up a style for it.
                    paragraph.add_run(child.text)
                else:
                    paragraph.add_run(child.text, self._styles[style_key])
            if child.tail is not None:
                paragraph.add_run(child.tail)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment