Skip to content

Instantly share code, notes, and snippets.

@nanaze
Last active December 16, 2015 01:30
Show Gist options
  • Save nanaze/5356042 to your computer and use it in GitHub Desktop.
Save nanaze/5356042 to your computer and use it in GitHub Desktop.
import HTMLParser
class LinkParser(HTMLParser.HTMLParser):
    """HTML parser that records the ``href`` value of every ``<a>`` start tag.

    Feed markup with ``feed()``; the collected values are then available in
    ``self.hrefs`` in the order the tags were encountered.
    """

    def __init__(self):
        # Explicit base-class call rather than super(): super() is not usable
        # when the base is an old-style class, as in the Python 2 stdlib.
        HTMLParser.HTMLParser.__init__(self)
        # href values gathered so far, in document order.
        self.hrefs = []

    def handle_starttag(self, tag, attrs):
        """Collect the href value(s) of an anchor start tag.

        Args:
            tag: Name of the tag being opened.
            attrs: List of ``(name, value)`` pairs for the tag's attributes.
                ``value`` is ``None`` for an attribute written without a
                value (e.g. ``<a href>``).
        """
        if tag != 'a':
            return
        # A valueless href carries no link target, so skip the None the
        # parser reports for it; an empty string is still a real href.
        self.hrefs.extend(
            value for name, value in attrs
            if name == 'href' and value is not None)
def GetLinksFromHtml(html_string):
    """Return the href attribute values of the ``<a>`` tags in *html_string*.

    Args:
        html_string: HTML markup to scan.

    Returns:
        A new list holding the values collected by ``LinkParser``, in the
        order they were encountered.
    """
    link_parser = LinkParser()
    link_parser.feed(html_string)
    # Hand back a copy so the caller does not share the parser's own list.
    return link_parser.hrefs[:]
import linkextract
import unittest
class TestLinkParse(unittest.TestCase):
    """Tests for ``linkextract.GetLinksFromHtml``."""

    def testGetLinks(self):
        """The href of every anchor in the fixture is returned, in order."""
        # assertEqual rather than assertEquals: the latter is a deprecated
        # alias that newer unittest versions no longer provide.
        self.assertEqual(
            ['foo', 'bar', 'baz'],
            linkextract.GetLinksFromHtml(_TEST_HTML))
# HTML fixture used by testGetLinks: three <a> tags whose href values are
# 'foo', 'bar' and 'baz', in that order.
_TEST_HTML = """
<html>
<a href="foo">aaa</a>
<a href="bar">aaa</a>
<a href="baz">aaa</a>
</html>
"""
# Run the test cases when this file is executed directly as a script.
if __name__ == '__main__':
    unittest.main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment