# waylan/parser.py

## parser.py
"""
A crazy Idea!

Just maybe, the way to parse HTML within a Markdown document is to run the document
through an HTML Parser first. Some parsers, like the HTMLParser included in the
Python Standard lib will properly parse the plain text not wrapped in HTML tags
as plain text and simply return it unaltered. The problem is with Markdown's
autolinks (`<foo@bar.com>` and `<http://example.com>`).

However, as of Python 2.7.3 and 3.2.2, the HTMLParser can now handle invalid HTML
without crashing hard. Below is a subclass of Beautiful Soup's HTML Parser which
accounts for those autolinks and passes them through as text. The crazy idea is that
those text nodes could then be parsed by Markdown and the Markdown Parser would not
need to reimplement a lousy HTML parser with regex.
"""

from bs4.builder._htmlparser import BeautifulSoupHTMLParser, HTMLParserTreeBuilder
from bs4 import BeautifulSoup
import re

# Resolve HTMLParseError wherever the running interpreter keeps it, so the
# ``except HTMLParseError`` clause further down always has a real class to name.
try:
    # Python 2 location.
    from HTMLParser import HTMLParseError
except ImportError:
    try:
        # Python 3.0 - 3.4 location.
        from html.parser import HTMLParseError
    except ImportError:
        # HTMLParseError is removed in Python 3.5. Since it can never be
        # thrown in 3.5, we can just define our own class as a placeholder.
        class HTMLParseError(Exception):
            pass


# Recognises the tag names reported for Markdown autolinks. It is applied with
# ``.match()``, so either alternative must match at the start of the name:
#   1. a URL scheme: ``ftp:``, ``ftps:``, ``http:`` or ``https:``
#   2. an e-mail-like token: a (possibly empty) run of characters other than
#      space or ``!``, then ``@``, then a (possibly empty) run of non-spaces.
AUTOLINKRE = re.compile(r'(^(f|ht)tps?:)|([^ \!]*@[^ ]*)')


class MDBeautifulSoupHTMLParser(BeautifulSoupHTMLParser):
    """Beautiful Soup HTML parser that passes Markdown autolinks through as text."""

    def handle_starttag(self, name, attrs):
        """Handle a start tag, diverting autolinks to the text stream.

        When ``name`` matches ``AUTOLINKRE`` the raw source text of the start
        tag is handed to the soup as character data. Every other tag is
        delegated to ``BeautifulSoupHTMLParser.handle_starttag`` unchanged.
        """
        if not AUTOLINKRE.match(name):
            # An ordinary tag: let the stock parser build the element.
            BeautifulSoupHTMLParser.handle_starttag(self, name, attrs)
            return

        # An autolink: emit the tag exactly as it was written in the markup.
        self.soup.handle_data(self.get_starttag_text())


class MDHTMLParserTreeBuilder(HTMLParserTreeBuilder):
    """Tree builder that feeds markup through ``MDBeautifulSoupHTMLParser``."""

    NAME = 'md'

    def feed(self, markup):
        """Parse ``markup`` into ``self.soup``.

        A parser is built from ``self.parser_args`` and attached to the soup
        before the markup is fed to it.

        Raises:
            HTMLParseError: re-raised after a ``RuntimeWarning`` is issued
                when the underlying parser rejects the document.
        """
        # Imported locally: the module's top-level imports do not provide it.
        import warnings

        args, kwargs = self.parser_args
        parser = MDBeautifulSoupHTMLParser(*args, **kwargs)
        parser.soup = self.soup
        try:
            parser.feed(markup)
        except HTMLParseError:
            warnings.warn(RuntimeWarning(
                "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
            # Bare ``raise`` re-raises the active exception with its
            # original traceback on both Python 2 and Python 3.
            raise


# Demonstration: both autolink forms come back out as plain text.
# ``print(...)`` with a single argument behaves the same on Python 2 and 3.
doc = BeautifulSoup('<foo@bar.com>', builder=MDHTMLParserTreeBuilder())
print(doc.string)
# Outputs: <foo@bar.com>

doc = BeautifulSoup('<http://example.com>', builder=MDHTMLParserTreeBuilder())
print(doc.string)
# Outputs: <http://example.com>
	"""
	A crazy Idea!

	Just maybe, the way to parse HTML within a Markdown document is to run the document
	through an HTML Parser first. Some parsers, like the HTMLParser included in the
	Python Standard lib will properly parse the plain text not wrapped in HTML tags
	as plain text and simply return it unaltered. The problem is with Markdown's
	autolinks (`<foo@bar.com>` and `<http://example.com>`).

	However, as of Python 2.7.3 and 3.2.2, the HTMLParser can now handle invalid HTML
	without crashing hard. Below is a subclass of Beautiful Soup's HTML Parser which
	accounts for those autolinks and passes them through as text. The crazy idea is that
	those text nodes could then be parsed by Markdown and the Markdown Parser would not
	need to reimplement a lousy HTML parser with regex.
	"""

	from bs4.builder._htmlparser import BeautifulSoupHTMLParser, HTMLParserTreeBuilder
	from bs4 import BeautifulSoup
	import re

	# Resolve HTMLParseError wherever the running interpreter keeps it, so the
	# ``except HTMLParseError`` clause further down always has a real class to name.
	try:
		# Python 2 location.
		from HTMLParser import HTMLParseError
	except ImportError:
		try:
			# Python 3.0 - 3.4 location.
			from html.parser import HTMLParseError
		except ImportError:
			# HTMLParseError is removed in Python 3.5. Since it can never be
			# thrown in 3.5, we can just define our own class as a placeholder.
			class HTMLParseError(Exception):
				pass


	# Recognises the tag names reported for Markdown autolinks. It is applied
	# with ``.match()``, so either alternative must match at the start:
	#   1. a URL scheme: ``ftp:``, ``ftps:``, ``http:`` or ``https:``
	#   2. an e-mail-like token: a (possibly empty) run of characters other than
	#      space or ``!``, then ``@``, then a (possibly empty) run of non-spaces.
	# The ``|`` alternations must stay unescaped and both ``*`` quantifiers must
	# be present; otherwise neither autolink form is recognised.
	AUTOLINKRE = re.compile(r'(^(f|ht)tps?:)|([^ \!]*@[^ ]*)')


	class MDBeautifulSoupHTMLParser(BeautifulSoupHTMLParser):
		"""Beautiful Soup HTML parser that passes Markdown autolinks through as text."""

		def handle_starttag(self, name, attrs):
			"""Handle a start tag, diverting autolinks to the text stream.

			When ``name`` matches ``AUTOLINKRE`` the raw source text of the
			start tag is handed to the soup as character data. Every other tag
			is delegated to ``BeautifulSoupHTMLParser.handle_starttag``.
			"""
			if AUTOLINKRE.match(name):
				# An autolink: emit the tag exactly as written in the markup.
				data = self.get_starttag_text()
				self.soup.handle_data(data)
			else:
				BeautifulSoupHTMLParser.handle_starttag(self, name, attrs)


	class MDHTMLParserTreeBuilder(HTMLParserTreeBuilder):
		"""Tree builder that feeds markup through ``MDBeautifulSoupHTMLParser``."""

		NAME = 'md'

		def feed(self, markup):
			"""Parse ``markup`` into ``self.soup``.

			A parser is built from ``self.parser_args`` and attached to the
			soup before the markup is fed to it.

			Raises:
				HTMLParseError: re-raised after a ``RuntimeWarning`` is issued
					when the underlying parser rejects the document.
			"""
			# Imported locally: the top-level imports do not provide it.
			import warnings

			args, kwargs = self.parser_args
			# Unpack positionals with ``*`` and keywords with ``**``; passing
			# ``args`` itself would hand the parser one tuple argument.
			parser = MDBeautifulSoupHTMLParser(*args, **kwargs)
			parser.soup = self.soup
			try:
				parser.feed(markup)
			except HTMLParseError:
				warnings.warn(RuntimeWarning(
					"Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
				# Bare ``raise`` re-raises the active exception with its
				# original traceback on both Python 2 and Python 3.
				raise


	# Demonstration: both autolink forms come back out as plain text.
	# ``print(...)`` with a single argument behaves the same on Python 2 and 3.
	doc = BeautifulSoup('<foo@bar.com>', builder=MDHTMLParserTreeBuilder())
	print(doc.string)
	# Outputs: <foo@bar.com>

	doc = BeautifulSoup('<http://example.com>', builder=MDHTMLParserTreeBuilder())
	print(doc.string)
	# Outputs: <http://example.com>