Skip to content

Instantly share code, notes, and snippets.

@waylan
Created March 13, 2013 14:28

Revisions

  1. Waylan Limberg created this gist Mar 13, 2013.
    75 changes: 75 additions & 0 deletions html_tidy.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,75 @@
    # HTML Tidy Extension for Python-Markdown
    # =======================================
    #
    # Runs [HTML Tidy][] on the output of Python-Markdown using the [uTidylib][]
    # Python wrapper. Both libtidy and uTidylib must be installed on your system.
    #
    # [HTML Tidy]: http://tidy.sourceforge.net/
    # [uTidylib]: http://utidylib.berlios.de/
    #
    # Note that any Tidy [options][] can be passed in as [extension configs][]. So,
    # for example, to output HTML rather than XHTML, set ``output_xhtml=0``. To
    # indent the output, set ``indent=auto`` and to have Tidy wrap the output in
    # ``<html>`` and ``<body>`` tags, set ``show_body_only=0``. See Tidy's
    # [options][] for a full list of the available options. The defaults are set to
    # most closely match Markdown's defaults with the exception that you get much
    # better pretty-printing.
    #
    # [options]: http://tidy.sourceforge.net/docs/quickref.html
    # [extension configs]: ../reference.html#extension_configs
    #
    # Note that options set in this extension will override most any other settings
    # passed on to Markdown (such as "output_format"). Unlike Markdown, this extension
    # will also treat raw HTML no differently than that output by Markdown. In other
    # words, it may munge a document author's carefully crafted HTML. Of course, it
    # may also transform poorly formed raw HTML into nice, valid HTML. Take these
    # things into consideration when electing to use this extension.
    #
    # Copyright (c)2008 [Waylan Limberg](http://achinghead.com)
    #
    # License: [BSD](http://www.opensource.org/licenses/bsd-license.php)

    from __future__ import absolute_import
    from . import Extension
    from ..postprocessors import Postprocessor
    from ..util import text_type
    try:
    import tidy
    except ImportError:
    tidy = None

    class TidyExtension(Extension):
        """Markdown extension that registers an HTML Tidy postprocessor.

        The Tidy options are kept in ``self.config`` as a plain
        ``{option_name: value}`` dict and are copied onto the Markdown
        instance as ``md.tidy_options`` when the extension is registered.
        """

        def __init__(self, configs=None):
            """Build the Tidy option dict from defaults plus user overrides.

            ``configs`` may be ``None`` (no overrides), a mapping of option
            names to values, or an iterable of ``(name, value)`` pairs.
            """
            # Set defaults to match typical markdown behavior.
            self.config = dict(
                output_xhtml=1,
                show_body_only=1,
                char_encoding='utf8',
            )
            # Merge in user defined configs, overriding the defaults if
            # necessary. ``makeExtension`` passes ``None`` when no configs are
            # given, and iterating a mapping directly would yield only its
            # keys, so normalize both cases to an iterable of pairs first.
            if configs is None:
                configs = ()
            elif hasattr(configs, 'items'):
                configs = configs.items()
            for option in configs:
                self.config[option[0]] = option[1]

        def extendMarkdown(self, md, md_globals):
            """Attach the Tidy options and postprocessor to ``md``.

            The options are always stored on ``md``; the postprocessor is
            only registered when the optional ``tidy`` module was imported
            successfully (it is ``None`` otherwise).
            """
            # Save options to markdown instance
            md.tidy_options = self.config
            # Add TidyProcessor to postprocessors
            if tidy:
                md.postprocessors['tidy'] = TidyProcessor(md)


    class TidyProcessor(Postprocessor):
        """Postprocessor that passes Markdown's output through HTML Tidy."""

        def run(self, text):
            """Return ``text`` after it has been processed by Tidy.

            The options stored on the Markdown instance as ``tidy_options``
            are forwarded to ``tidy.parseString`` as keyword arguments.
            """
            options = self.markdown.tidy_options
            # Tidy does not accept unicode, so the text is encoded on the way
            # in and the result is decoded again on the way out, both using
            # the configured ``char_encoding`` (``utf8`` when unset).
            codec = options.get('char_encoding', 'utf8')
            tidied = tidy.parseString(text.encode(codec), **options)
            # NOTE(review): this assumes the object returned by parseString
            # can be decoded directly by ``text_type`` -- confirm against the
            # installed uTidylib version.
            return text_type(tidied, encoding=codec)


    def makeExtension(configs=None):
        """Return a ``TidyExtension`` built from ``configs``.

        ``configs`` defaults to ``None``; it is replaced with an empty list
        before being handed to ``TidyExtension`` so that the constructor
        always receives something it can iterate over.
        """
        if configs is None:
            configs = []
        return TidyExtension(configs=configs)