Last active
August 29, 2015 14:24
-
-
Save waylan/2fe32146400be3436e79 to your computer and use it in GitHub Desktop.
A crazy idea I had as a way to parse HTML within a Markdown document.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""
A crazy idea!

Just maybe, the way to parse HTML within a Markdown document is to run the
document through an HTML parser first. Some parsers, like the HTMLParser
included in the Python standard library, will properly parse the plain text
not wrapped in HTML tags as plain text and simply return it unaltered. The
problem is with Markdown's autolinks (`<foo@bar.com>` and
`<http://example.com>`), which look like HTML tags to the parser.

However, as of Python 2.7.3 and 3.2.2, the HTMLParser can now handle invalid
HTML without crashing hard. Below is a subclass of Beautiful Soup's HTML
parser which accounts for those autolinks and passes them through as text.
The crazy idea is that those text nodes could then be parsed by Markdown, and
the Markdown parser would not need to reimplement a lousy HTML parser with
regex.
"""
import re
import warnings

from bs4 import BeautifulSoup
from bs4.builder._htmlparser import BeautifulSoupHTMLParser, HTMLParserTreeBuilder
# Resolve HTMLParseError wherever this interpreter keeps it. The
# ``except ImportError:`` form (no ``, e``) is valid on both Python 2 and 3,
# so the fallbacks below can actually run on Python 3.
try:
    # Python 2 module name.
    from HTMLParser import HTMLParseError
except ImportError:
    try:
        # Python 3 module name (the class exists there up to Python 3.4).
        from html.parser import HTMLParseError
    except ImportError:
        # HTMLParseError is removed in Python 3.5. Since it can never be
        # thrown in 3.5, we can just define our own class as a placeholder.
        class HTMLParseError(Exception):
            """Placeholder so ``except HTMLParseError`` clauses stay valid."""
# Recognises the "tag name" the HTML parser reports for a Markdown autolink.
# Two alternatives, tried from the start of the name via ``.match()``:
#   * a URL scheme prefix: ``http:``, ``https:``, ``ftp:`` or ``ftps:``;
#   * a run of characters other than space or ``!`` followed by ``@``
#     (an email-style autolink such as ``foo@bar.com``).
AUTOLINKRE = re.compile(r'(^(f|ht)tps?:)|([^ \!]*@[^ ]*)')
class MDBeautifulSoupHTMLParser(BeautifulSoupHTMLParser):
    """Beautiful Soup HTML parser that leaves Markdown autolinks as text.

    Autolinks such as ``<foo@bar.com>`` or ``<http://example.com>`` reach the
    parser as start tags. When the reported tag name matches ``AUTOLINKRE``
    the original markup is handed to the soup as character data instead of
    being turned into an element.
    """

    def handle_starttag(self, name, attrs, *args, **kwargs):
        """Handle a start tag, diverting autolinks to ``handle_data``.

        :param name: tag name as reported by the underlying HTML parser.
        :param attrs: attributes parsed for the tag.
        :param args: extra positional arguments, forwarded untouched to the
            base implementation.
        :param kwargs: extra keyword arguments, forwarded untouched to the
            base implementation.
        :returns: ``None`` for an autolink, otherwise whatever the base
            ``handle_starttag`` returns.
        """
        if AUTOLINKRE.match(name):
            # Not a real element: re-emit the exact source text of the
            # "tag" (angle brackets included) as plain character data.
            self.soup.handle_data(self.get_starttag_text())
            return None
        # NOTE(review): some Beautiful Soup releases call this hook with an
        # additional ``handle_empty_element`` argument and use its return
        # value; forwarding *args/**kwargs keeps the override compatible with
        # both the three-argument and the extended signature.
        return BeautifulSoupHTMLParser.handle_starttag(
            self, name, attrs, *args, **kwargs)
class MDHTMLParserTreeBuilder(HTMLParserTreeBuilder):
    """Tree builder that parses markup with ``MDBeautifulSoupHTMLParser``."""

    NAME = 'md'

    def feed(self, markup):
        """Parse *markup* into ``self.soup``.

        :param markup: the document text to parse.
        :raises HTMLParseError: re-raised, after emitting a
            ``RuntimeWarning``, when the underlying parser rejects the
            document.
        """
        args, kwargs = self.parser_args
        parser = MDBeautifulSoupHTMLParser(*args, **kwargs)
        parser.soup = self.soup
        try:
            parser.feed(markup)
            # Force processing of anything the parser is still buffering
            # (e.g. a trailing, unterminated "<...") so it is not dropped.
            parser.close()
        except HTMLParseError:
            warnings.warn(RuntimeWarning(
                "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
            # Bare ``raise`` re-raises the active exception with its
            # original traceback intact.
            raise
# Demo: both kinds of Markdown autolink survive parsing as plain text.
# ``print(...)`` with a single argument behaves the same on Python 2 and 3.
doc = BeautifulSoup('<foo@bar.com>', builder=MDHTMLParserTreeBuilder())
print(doc.string)
# Outputs: <foo@bar.com>

doc = BeautifulSoup('<http://example.com>', builder=MDHTMLParserTreeBuilder())
print(doc.string)
# Outputs: <http://example.com>
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment