Skip to content

Instantly share code, notes, and snippets.

@arahaya
Created April 21, 2012 06:52
Show Gist options
  • Save arahaya/2434951 to your computer and use it in GitHub Desktop.
Extract feed urls from HTML document
"""
required: Python 2.7+
required: beautifulsoup4
required: lxml
"""
from urlparse import urldefrag, urljoin
from lxml import etree
from bs4.dammit import UnicodeDammit
# Content types (lowercase) that identify a syndication feed in
# <link rel="alternate" type="..."> elements.
FEED_CONTENT_TYPES = [
    "application/rss+xml",
    "application/atom+xml",
    "application/rdf+xml",
    "text/xml",
    "application/x.atom+xml",
    "application/x-atom+xml",
]


class LinkTarget:
    """lxml SAX-style parser target that collects feed URLs.

    After parsing, ``links`` holds the href of every
    ``<link rel="alternate">`` element whose ``type`` attribute is one
    of FEED_CONTENT_TYPES, in document order.
    """

    def __init__(self):
        # hrefs collected in document order; may contain duplicates
        self.links = []

    def start(self, tag, attrs):
        """Called by lxml for each opening tag; records matching feed links.

        tag   -- element name (lowercased by lxml's HTML parser)
        attrs -- dict-like of the element's attributes

        Per the HTML spec, ``rel`` is a case-insensitive, space-separated
        token list and MIME types are case-insensitive, so both are
        normalized before comparison (the original exact-match version
        missed e.g. rel="Alternate" or type="Application/RSS+xml").
        """
        if tag != "link":
            return
        # "alternate" must appear among the rel tokens.
        rel_tokens = (attrs.get("rel") or "").lower().split()
        if "alternate" not in rel_tokens:
            return
        # Normalize the content type before the whitelist check.
        content_type = (attrs.get("type") or "").lower()
        if content_type not in FEED_CONTENT_TYPES:
            return
        # Ignore links with a missing or empty href.
        if not attrs.get("href"):
            return
        self.links.append(attrs["href"])
def findfeeds(markup, base_url=""):
    """Return the unique feed URLs advertised by an HTML document.

    markup   -- HTML source as a byte string or unicode string
    base_url -- optional URL used to resolve relative hrefs

    Feed links are discovered from <link rel="alternate"> elements whose
    type is a known feed content type.  Fragments are stripped and each
    href is resolved against base_url.  The result contains each URL
    once, in the order it first appears in the document (the original
    returned set order, which was nondeterministic).
    """
    # Parse with lxml, routing parser events into a LinkTarget collector.
    target = LinkTarget()
    parser = etree.HTMLParser(target=target, recover=True)
    try:
        etree.fromstring(markup, parser)
    except UnicodeDecodeError:
        # lxml could not decode the raw bytes: let UnicodeDammit guess the
        # encoding and retry.  Use a FRESH target/parser so links collected
        # before the failure are not recorded a second time (the original
        # reused the same parser and relied on the final set() to mask the
        # duplicates).
        markup = UnicodeDammit(markup).unicode_markup
        target = LinkTarget()
        parser = etree.HTMLParser(target=target, recover=True)
        try:
            etree.fromstring(markup, parser)
        except UnicodeDecodeError:
            # Best effort: give up decoding and return what was collected.
            pass

    # Clean up each href, resolve it against base_url, and deduplicate
    # while preserving first-seen document order.
    feeds = []
    seen = set()
    for raw_link in target.links:
        link, _fragment = urldefrag(raw_link.strip())
        link = urljoin(base_url, link)
        if link not in seen:
            seen.add(link)
            feeds.append(link)
    return feeds
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment