Created
September 1, 2015 19:44
-
-
Save waylan/84eadbf6873965886a16 to your computer and use it in GitHub Desktop.
An experimental Raw HTML Parser for Markdown. This may or may not be a good idea.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# HTMLParser lives in ``html.parser`` on Python 3 and in the top-level
# ``HTMLParser`` module on Python 2; bind whichever one is available.
try:
    from html.parser import HTMLParser
except ImportError:
    from HTMLParser import HTMLParser
# Tag names that start a raw HTML block when they open within the first
# four columns of a line (see HTMLExtractor.handle_starttag). Listed
# alphabetically; membership is all that matters.
HTML_BLOCK = set((
    'article', 'aside', 'blockquote', 'canvas', 'dd', 'div', 'dl', 'dt',
    'fieldset', 'figcaption', 'figure', 'footer', 'form', 'group',
    'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'hr', 'iframe', 'li',
    'math', 'nav', 'noscript', 'ol', 'output', 'p', 'pre', 'progress',
    'script', 'section', 'style', 'table', 'tbody', 'td', 'th', 'thead',
    'tr', 'ul', 'video',
))
class HTMLExtractor(HTMLParser):
    """Separate raw block-level HTML from the rest of a Markdown document.

    Feed the document with ``feed()``. A block-level element (a tag listed
    in ``HTML_BLOCK``) that opens within the first four columns of a line
    is collected verbatim, stored in the stash, and replaced in the output
    by a ``{ stash:N }`` placeholder. Everything else is copied through.

    Instance attributes (all created by ``reset()``):

    * ``stack``  -- tag names of the currently open elements, outermost
      first; e.g. ``['div', 'p']`` while handling text inside
      ``<div><p>...</p></div>``.
    * ``inraw``  -- True while inside a raw block that is being collected.
    * ``_cache`` -- pieces of the raw block collected so far.
    * ``_stash`` -- completed raw blocks, indexed by placeholder number.
    * ``newdoc`` -- pieces of the output document, in order.

    Only start tags, end tags and text are handled here; this class
    defines no handlers for comments, declarations or references.
    """

    def reset(self):
        """Reset the parser and give this instance its own empty state.

        The base ``__init__`` calls ``reset()``, so every new instance
        starts with fresh lists instead of sharing class-level ones.
        """
        # Call the base class directly; this form works with both the
        # Python 2 and the Python 3 HTMLParser this file may import.
        HTMLParser.reset(self)
        self.stack = []
        self.inraw = False
        self._cache = []
        self._stash = []
        self.newdoc = []

    def stash(self, value):
        """Store ``value`` and return the placeholder that refers to it."""
        self._stash.append(value)
        # Doubled braces are literal braces: the result is '{ stash:N }'.
        return '{{ stash:{0} }}'.format(len(self._stash)-1)

    def handle_starttag(self, tag, attrs):
        """Record the open tag and route its source text to the right buffer."""
        self.stack.append(tag)
        _line, col = self.getpos()
        if not self.inraw and col < 4 and tag in HTML_BLOCK:
            # Started a new raw block
            self.inraw = True
        text = self.get_starttag_text()
        target = self._cache if self.inraw else self.newdoc
        target.append(text)

    def handle_endtag(self, tag):
        """Close ``tag`` and, if it ends the raw block, stash the block."""
        text = '</{0}>'.format(tag)
        if tag in self.stack:
            # Pop everything up to and including the matching open tag, so
            # unclosed children do not linger on the stack.
            while self.stack:
                if self.stack.pop() == tag:
                    break
        if not self.inraw:
            self.newdoc.append(text)
            return
        self._cache.append(text)
        if not self.stack:
            # End of raw block
            self.inraw = False
            self.newdoc.append(self.stash(''.join(self._cache)))
            self._cache = []

    def handle_data(self, data):
        """Append text to the raw block if one is open, else to the output."""
        if self.inraw:
            self._cache.append(data)
        else:
            self.newdoc.append(data)
def parse(text):
    """Extract raw HTML blocks from ``text``.

    Returns a ``(document, stash)`` tuple: ``document`` is ``text`` with
    each raw block replaced by a ``{ stash:N }`` placeholder, and ``stash``
    is the list of extracted blocks, where item ``N`` belongs to
    placeholder ``N``.
    """
    parser = HTMLExtractor()
    parser.feed(text)
    # Flush anything the parser is still buffering at end of input;
    # without this, trailing text can be withheld from the handlers.
    parser.close()
    return ''.join(parser.newdoc), parser._stash
if __name__ == '__main__':
    # Small demonstration: block-level HTML that starts at the left margin
    # is stashed, while the indented (code block) and inline HTML is left
    # in the document.
    t = """
foo bar baz

<div>

<p>Blah blah!</p>

</div>

more text.

<p>a paragrpah</p>

And a code block:

    <p>Code block</p>

<em>inline</em> stuff.
"""
    doc, stash = parse(t)
    # Single-argument print calls behave the same on Python 2 and 3.
    print('Doc:')
    print(doc)
    print('Stash: {0}'.format(stash))

# Outputs (as recorded from the original proof-of-concept run; closing
# tags appear as `<tag/>` because that is how handle_endtag rebuilt them
# when this listing was captured):
# -----------------------------------------------------------------------
# Doc:
#
# foo bar baz
#
# { stash:0 }
#
# more text.
#
# { stash:1 }
#
# And a code block:
#
#     <p>Code block<p/>
#
# <em>inline<em/> stuff.
#
# Stash: ['<div>\n\n<p>Blah blah!<p/>\n\n<div/>', '<p>a paragrpah<p/>']
# -----------------------------------------------------------------------
Note that `stash` is a quick and dirty placeholder. An instance of the Python-Markdown `Stash` class should be used instead.
Why is state being stored in class properties (stack, inraw, etc.) rather than instance properties?
Good point. Those should be instance properties. I was throwing together a simplistic proof-of-concept just to demonstrate that it works.
Also, it might be helpful to add a comment explaining what `stack` represents (e.g., "list of the tag names of each parent element of the current node") and what an example value would look like.
Yep, that's what it is. In the included example, for the text node `Blah blah!`, the stack would be `['div', 'p']`.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Why is state being stored in class properties (stack, inraw, etc.) rather than instance properties?
Also, it might be helpful to add a comment explaining what `stack` represents (e.g., "list of the tag names of each parent element of the current node") and what an example value would look like.