# waylan/RawHTMLParser.py

## RawHTMLParser.py
try:
    from HTMLParser import HTMLParser
except ImportError:
    from html.parser import HTMLParser


# Tag names that HTMLExtractor.handle_starttag accepts as the opening tag of
# a raw HTML block (listed alphabetically).
HTML_BLOCK = {
    'article', 'aside', 'blockquote', 'canvas',
    'dd', 'div', 'dl', 'dt',
    'fieldset', 'figcaption', 'figure', 'footer', 'form',
    'group',
    'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'hr',
    'iframe',
    'li',
    'math',
    'nav', 'noscript',
    'ol', 'output',
    'p', 'pre', 'progress',
    'script', 'section', 'style',
    'table', 'tbody', 'td', 'th', 'thead', 'tr',
    'ul',
    'video',
}


class HTMLExtractor(HTMLParser):
    """Separate raw block-level HTML from the rest of a document.

    While input is fed in, everything outside a raw block is appended to
    ``newdoc``.  A raw block starts at a start tag that is listed in
    ``HTML_BLOCK`` and whose column offset is below 4; it ends when the stack
    of open tags becomes empty again.  The text of a finished block is
    stored with ``stash`` and a ``{ stash:N }`` placeholder is appended to
    ``newdoc`` in its place.

    NOTE(review): tags are only removed from ``stack`` by a matching end
    tag, so a start tag that is never closed keeps the stack non-empty and
    prevents a raw block from ending -- confirm whether void elements such
    as a bare ``<hr>`` need special handling.
    """

    def reset(self):
        """Reset the parser and give this instance its own state containers.

        The base class calls ``reset`` when a parser is instantiated, so
        creating the lists here (rather than as class attributes) keeps one
        parser's output from leaking into another's.
        """
        HTMLParser.reset(self)
        self.stack = []     # names of currently open tags, innermost last
        self.inraw = False  # True while a raw block is being collected
        self._cache = []    # text pieces of the raw block in progress
        self._stash = []    # finished raw blocks, indexed by placeholder
        self.newdoc = []    # text pieces of the resulting document

    def stash(self, value):
        """Store ``value`` and return the placeholder that refers to it."""
        self._stash.append(value)
        return '{{ stash:{0} }}'.format(len(self._stash) - 1)

    def handle_starttag(self, tag, attrs):
        """Record an opening tag, starting a raw block when it qualifies."""
        self.stack.append(tag)

        _, col = self.getpos()
        if col < 4 and tag in HTML_BLOCK and not self.inraw:
            # Started a new raw block
            self.inraw = True

        # Re-emit the tag exactly as it appeared in the input.
        text = self.get_starttag_text()
        if self.inraw:
            self._cache.append(text)
        else:
            self.newdoc.append(text)

    def handle_endtag(self, tag):
        """Record a closing tag, stashing the raw block if this ends it."""
        text = '</{0}>'.format(tag)
        if tag in self.stack:
            # Discard any still-open descendants together with the tag.
            while self.stack.pop() != tag:
                pass

        if not self.inraw:
            self.newdoc.append(text)
            return

        self._cache.append(text)
        if not self.stack:
            # End of raw block: swap the collected text for a placeholder.
            self.inraw = False
            self.newdoc.append(self.stash(''.join(self._cache)))
            self._cache = []

    def handle_data(self, data):
        """Route text to the raw-block cache or to the output document."""
        if self.inraw:
            self._cache.append(data)
        else:
            self.newdoc.append(data)

def parse(text):
    """Run ``text`` through an ``HTMLExtractor`` and return its results.

    Returns a ``(document, stash)`` tuple: ``document`` is the parser's
    ``newdoc`` pieces joined into one string, and ``stash`` is the parser's
    list of stashed raw blocks.
    """
    parser = HTMLExtractor()
    parser.feed(text)
    # feed() may hold back trailing input while waiting for more data;
    # close() forces that remainder through the handlers.
    parser.close()
    return ''.join(parser.newdoc), parser._stash

if __name__ == '__main__':
    # Demo: run a small sample document through parse() and show both the
    # resulting document and the stashed raw blocks.
    t = """
foo bar baz

<div>

<p>Blah blah!</p>

</div>

more text.

<p>a paragrpah</p>

And a code block:

    <p>Code block</p>

<em>inline</em> stuff.
"""
    doc, stash = parse(t)
    # Single-argument print(...) is valid both as a Python 2 statement and
    # as a Python 3 function call.
    print('Doc:')
    print(doc)
    print('')
    print('Stash: {0}'.format(stash))

# Outputs (captured from a run in which handle_endtag rendered closing tags
# as `<tag/>`):
# -----------------------------------------------------------------------
# Doc:
#
# foo bar baz
#
# { stash:0 }
#
# more text.
#
# { stash:1 }
#
# And a code block:
#
#     <p>Code block<p/>
#
# <em>inline<em/> stuff.
#
# Stash: ['<div>\n\n<p>Blah blah!<p/>\n\n<div/>', '<p>a paragrpah<p/>']
# -----------------------------------------------------------------------
	try:
	from HTMLParser import HTMLParser
	except ImportError:
	from html.parser import HTMLParser


	# Tag names that HTMLExtractor.handle_starttag accepts as the opening tag
	# of a raw HTML block (listed alphabetically).
	HTML_BLOCK = {
		'article', 'aside', 'blockquote', 'canvas',
		'dd', 'div', 'dl', 'dt',
		'fieldset', 'figcaption', 'figure', 'footer', 'form',
		'group',
		'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'hr',
		'iframe',
		'li',
		'math',
		'nav', 'noscript',
		'ol', 'output',
		'p', 'pre', 'progress',
		'script', 'section', 'style',
		'table', 'tbody', 'td', 'th', 'thead', 'tr',
		'ul',
		'video',
	}


	class HTMLExtractor(HTMLParser):
		"""Separate raw block-level HTML from the rest of a document.

		While input is fed in, everything outside a raw block is appended to
		``newdoc``.  A raw block starts at a start tag that is listed in
		``HTML_BLOCK`` and whose column offset is below 4; it ends when the
		stack of open tags becomes empty again.  The text of a finished block
		is stored with ``stash`` and a ``{ stash:N }`` placeholder is appended
		to ``newdoc`` in its place.

		NOTE(review): tags are only removed from ``stack`` by a matching end
		tag, so a start tag that is never closed keeps the stack non-empty and
		prevents a raw block from ending -- confirm whether void elements such
		as a bare ``<hr>`` need special handling.
		"""

		def reset(self):
			"""Reset the parser and give this instance its own state containers.

			The base class calls ``reset`` when a parser is instantiated, so
			creating the lists here (rather than as class attributes) keeps
			one parser's output from leaking into another's.
			"""
			HTMLParser.reset(self)
			self.stack = []     # names of currently open tags, innermost last
			self.inraw = False  # True while a raw block is being collected
			self._cache = []    # text pieces of the raw block in progress
			self._stash = []    # finished raw blocks, indexed by placeholder
			self.newdoc = []    # text pieces of the resulting document

		def stash(self, value):
			"""Store ``value`` and return the placeholder that refers to it."""
			self._stash.append(value)
			return '{{ stash:{0} }}'.format(len(self._stash) - 1)

		def handle_starttag(self, tag, attrs):
			"""Record an opening tag, starting a raw block when it qualifies."""
			self.stack.append(tag)

			_, col = self.getpos()
			if col < 4 and tag in HTML_BLOCK and not self.inraw:
				# Started a new raw block
				self.inraw = True

			# Re-emit the tag exactly as it appeared in the input.
			text = self.get_starttag_text()
			if self.inraw:
				self._cache.append(text)
			else:
				self.newdoc.append(text)

		def handle_endtag(self, tag):
			"""Record a closing tag, stashing the raw block if this ends it."""
			text = '</{0}>'.format(tag)
			if tag in self.stack:
				# Discard any still-open descendants together with the tag.
				while self.stack.pop() != tag:
					pass

			if not self.inraw:
				self.newdoc.append(text)
				return

			self._cache.append(text)
			if not self.stack:
				# End of raw block: swap the collected text for a placeholder.
				self.inraw = False
				self.newdoc.append(self.stash(''.join(self._cache)))
				self._cache = []

		def handle_data(self, data):
			"""Route text to the raw-block cache or to the output document."""
			if self.inraw:
				self._cache.append(data)
			else:
				self.newdoc.append(data)

	def parse(text):
		"""Run ``text`` through an ``HTMLExtractor`` and return its results.

		Returns a ``(document, stash)`` tuple: ``document`` is the parser's
		``newdoc`` pieces joined into one string, and ``stash`` is the
		parser's list of stashed raw blocks.
		"""
		parser = HTMLExtractor()
		parser.feed(text)
		# feed() may hold back trailing input while waiting for more data;
		# close() forces that remainder through the handlers.
		parser.close()
		return ''.join(parser.newdoc), parser._stash

	if __name__ == '__main__':
		# Demo: run a small sample document through parse() and show both the
		# resulting document and the stashed raw blocks.
		t = """
	foo bar baz

	<div>

	<p>Blah blah!</p>

	</div>

	more text.

	<p>a paragrpah</p>

	And a code block:

	<p>Code block</p>

	<em>inline</em> stuff.
	"""
		doc, stash = parse(t)
		# Single-argument print(...) is valid both as a Python 2 statement
		# and as a Python 3 function call.
		print('Doc:')
		print(doc)
		print('')
		print('Stash: {0}'.format(stash))

	# Outputs:
	# -----------------------------------------------------------------------
	# Doc:
	#
	# foo bar baz
	#
	# { stash:0 }
	#
	# more text.
	#
	# { stash:1 }
	#
	# And a code block:
	#
	# <p>Code block<p/>
	#
	# <em>inline<em/> stuff.
	#
	# Stash: ['<div>\n\n<p>Blah blah!<p/>\n\n<div/>', '<p>a paragrpah<p/>']
	# -----------------------------------------------------------------------