Skip to content

Instantly share code, notes, and snippets.

@waylan
Created September 1, 2015 19:44
Show Gist options
  • Save waylan/84eadbf6873965886a16 to your computer and use it in GitHub Desktop.
Save waylan/84eadbf6873965886a16 to your computer and use it in GitHub Desktop.
An experimental Raw HTML Parser for Markdown. This may or may not be a good idea.
try:
from HTMLParser import HTMLParser
except ImportError:
from html.parser import HTMLParser
# Tag names that are allowed to open a raw HTML block (checked with
# ``tag in HTML_BLOCK`` when a start tag is seen).  Grouped by rough purpose
# purely for readability; membership is all that matters.
HTML_BLOCK = {
    # Generic text containers and headings
    'p', 'div', 'blockquote', 'pre', 'hr',
    'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
    # Lists
    'ol', 'ul', 'li', 'dl', 'dt', 'dd',
    # Tables
    'table', 'thead', 'tbody', 'tr', 'th', 'td',
    # Forms and form-related widgets
    'form', 'fieldset', 'output', 'progress',
    # Scripting, styling and embedded content
    'script', 'noscript', 'style', 'iframe', 'math', 'canvas', 'video',
    # Sectioning and figures
    'section', 'article', 'aside', 'header', 'footer', 'nav', 'group',
    'figure', 'figcaption',
}
class HTMLExtractor(HTMLParser):
    """Replace block-level raw HTML in a document with stash placeholders.

    The parser is fed a whole document.  A start tag whose name is in
    ``HTML_BLOCK`` and which begins in one of the first four columns of a
    line opens a "raw block".  Everything up to the end tag that empties the
    open-tag stack is joined into one string, stored via :meth:`stash`, and
    replaced in the output by the placeholder that :meth:`stash` returns.
    All other start tags, end tags and text are copied to the output.

    Instance attributes:
        stack: Names of the currently open tags, outermost first.  In
            ``<div><p>Blah</p></div>`` the stack is ``['div', 'p']`` while
            the text ``Blah`` is being handled.
        inraw: True while inside a raw block.
        newdoc: Output fragments; ``''.join(newdoc)`` is the new document.

    NOTE: a start tag that is never closed stays on ``stack``, so a raw
    block that contains one does not end.
    NOTE(review): only start tags, end tags and text are re-emitted; confirm
    how comments, declarations and character references should be treated.
    """

    def __init__(self, *args, **kwargs):
        # Explicit base-class call rather than super(), so this also works
        # if the imported HTMLParser is not a new-style class.
        HTMLParser.__init__(self, *args, **kwargs)
        # All state is per instance.  As class attributes these lists were
        # shared, so every parser appended to the same document and stash.
        self.stack = []
        self.inraw = False
        self._cache = []   # fragments of the raw block being collected
        self._stash = []   # completed raw blocks, indexed by placeholder
        self.newdoc = []

    def stash(self, value):
        """Store ``value`` and return the placeholder that refers to it.

        The placeholder has the form ``{ stash:N }`` where ``N`` is the index
        of ``value`` in the stash list.
        """
        self._stash.append(value)
        return '{{ stash:{0} }}'.format(len(self._stash) - 1)

    def _emit(self, text):
        """Append ``text`` to the raw-block cache or to the output document."""
        if self.inraw:
            self._cache.append(text)
        else:
            self.newdoc.append(text)

    def handle_starttag(self, tag, attrs):
        """Track the open tag and start a raw block when appropriate."""
        self.stack.append(tag)
        col = self.getpos()[1]
        if col < 4 and tag in HTML_BLOCK and not self.inraw:
            # Started a new raw block
            self.inraw = True
        self._emit(self.get_starttag_text())

    def handle_startendtag(self, tag, attrs):
        """Emit a self-closing tag (``<br/>``) exactly once.

        The inherited implementation calls handle_starttag() and then
        handle_endtag(), which would write the tag text followed by an end
        tag the source never contained.  A self-closing tag also never
        becomes a parent, so the stack is left alone.
        """
        text = self.get_starttag_text()
        col = self.getpos()[1]
        if not self.inraw and col < 4 and tag in HTML_BLOCK:
            # A self-closing block-level tag is a complete raw block.
            self.newdoc.append(self.stash(text))
        else:
            self._emit(text)

    def handle_endtag(self, tag):
        """Close ``tag`` and finish the raw block if the stack is now empty."""
        text = '</{0}>'.format(tag)
        if tag in self.stack:
            # Drop everything down to and including the nearest matching
            # open tag; the membership test guarantees termination.
            while self.stack.pop() != tag:
                pass
        if self.inraw and not self.stack:
            # End of raw block
            self.inraw = False
            self._cache.append(text)
            self.newdoc.append(self.stash(''.join(self._cache)))
            self._cache = []
        else:
            self._emit(text)

    def handle_data(self, data):
        """Copy text to the raw-block cache or to the output document."""
        self._emit(data)


def parse(text):
    """Extract raw HTML blocks from ``text``.

    Returns a ``(doc, stash)`` tuple: ``doc`` is ``text`` with each raw block
    replaced by a ``{ stash:N }`` placeholder, and ``stash`` is the list of
    extracted blocks, where item ``N`` belongs to placeholder ``N``.
    """
    parser = HTMLExtractor()
    parser.feed(text)
    # feed() may hold back trailing input it considers incomplete; close()
    # forces it to be processed so nothing is silently dropped.
    parser.close()
    return ''.join(parser.newdoc), parser._stash


if __name__ == '__main__':
    t = """
foo bar baz

<div>

<p>Blah blah!</p>

</div>

more text.

<p>a paragrpah</p>

And a code block:

    <p>Code block</p>

<em>inline</em> stuff.
"""
    doc, stash = parse(t)
    # Single-argument print() calls behave the same on Python 2 and 3.
    print('Doc:')
    print(doc)
    print('')
    print('Stash: {0}'.format(stash))

# Outputs:
# -----------------------------------------------------------------------
# Doc:
#
# foo bar baz
#
# { stash:0 }
#
# more text.
#
# { stash:1 }
#
# And a code block:
#
#     <p>Code block</p>
#
# <em>inline</em> stuff.
#
# Stash: ['<div>\n\n<p>Blah blah!</p>\n\n</div>', '<p>a paragrpah</p>']
# -----------------------------------------------------------------------
@ryneeverett
Copy link

Why is state being stored in class properties (stack, inraw, etc.) rather than instance properties?

Also, it might be helpful to add a comment explaining what stack represents (e.g., "list of the tag name of each parent element of the current node") and what an example value would look like.

@waylan
Copy link
Author

waylan commented Nov 15, 2017

Note that stash is a quick and dirty placeholder. An instance of Python-Markdown Stash class should be used instead.

Why is state being stored in class properties (stack, inraw, etc.) rather than instance properties?

Good point. Those should be instance properties. I was throwing together a simplistic proof-of-concept just to demonstrate that it works.

Also, it might be helpful to add a comment explaining what stack represents (e.g., "list of the tag name of each parent element of the current node") and what an example value would look like.

Yep, that's what it is. In the included example, for the text node Blah blah! the stack would be ['div', 'p'].

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment