Skip to content

Instantly share code, notes, and snippets.

@Querela
Created December 18, 2015 21:53
Show Gist options
  • Star 5 You must be signed in to star a gist
  • Fork 3 You must be signed in to fork a gist
  • Save Querela/735379e9c853171713ca to your computer and use it in GitHub Desktop.
Save Querela/735379e9c853171713ca to your computer and use it in GitHub Desktop.
[python][html][feed][rss] feed finder from url
#!/usr/local/bin/python3
#
# feedfinder.py
#
# Utils for finding feeds
#
import sys
from datetime import datetime
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup
# Debug switch: when true, FeedFinder.find_feeds prints traceback details for
# the errors it swallows before returning an empty list.
DEBUG = True
class Feed:
    '''A single feed link (URL, title, type) discovered on a web page.'''

    # Class-level fallbacks; every instance overwrites them in __init__.
    url = None
    title = None
    feed_type = None
    is_main = False
    created = None

    def __init__(self, url, title=None, feed_type=None, is_main=False):
        '''Stores the feed attributes and records the creation time.

        url       -- address of the feed
        title     -- human readable title, or None
        feed_type -- label such as "RSS" or "Atom", or None
        is_main   -- True if this feed is considered the main one
        '''
        # Timestamp taken from datetime.utcnow() at construction time.
        self.created = datetime.utcnow()
        self.is_main = is_main
        self.feed_type = feed_type
        self.title = title
        self.url = url

    def __repr__(self):
        '''Returns e.g. <RSS-Feed* http://example.com/rss "Title">.
        The type prefix is omitted when feed_type is None; the asterisk is
        only shown for the main feed.'''
        if self.feed_type is None:
            type_prefix = ""
        else:
            type_prefix = "%s-" % self.feed_type
        main_marker = '*' if self.is_main else ''
        return '<{}Feed{} {!s} "{!s}">'.format(
            type_prefix, main_marker, self.url, self.title)
class FeedFinder:
    '''Finds the RSS and Atom feeds that an HTML page advertises through
    <link rel="alternate" ...> elements.'''

    # Browser-like User-Agent sent with every request (see _get_html).
    USER_AGENT = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; de; rv:1.9.1.5) Gecko/20091102 Firefox/3.5.5'

    def _get_html(self, url):
        '''[Internal]
        Downloads the given URL with urllib.request and returns the raw
        response body (bytes). Applies a custom User-agent to appear more
        like a browser and not a spider/crawler/bot.'''
        request = Request(url)
        request.add_header('User-Agent', self.USER_AGENT)
        # The context manager closes the response even if read() raises.
        with urlopen(request) as response:
            return response.read()

    def _find_feeds_by_type(self, soup, mime_type, feed_type):
        '''[Internal]
        Shared implementation of the RSS and Atom lookups.
        Collects every <link type=mime_type rel="alternate"> element of the
        parsed document and wraps it in a Feed labelled with feed_type.
        Links without an href are skipped; a missing title becomes None.
        Marks the shortest feed URL as the main one and returns the list
        (empty if nothing was found).'''
        links = soup.find_all("link", attrs={'type': mime_type, 'rel': 'alternate'})
        feeds = []
        for link in links:
            href = link.get("href")
            if not href:
                # A <link> without a target cannot be used as a feed.
                continue
            # The title attribute is optional in practice, so do not index it.
            feeds.append(Feed(href, link.get("title"), feed_type))
        self._find_shortest_feed_url_and_update(feeds)
        return feeds

    def _find_rss_feeds_in_header(self, soup):
        '''[Internal]
        Searches for RSS feeds.
        <link type="application/rss+xml" rel="alternate" ...>
        Returns a list. Either empty if no feed URL was found or with Feed
        objects. Marks the shortest feed URL as the main one.
        Analogous to _find_atom_feeds_in_header().'''
        return self._find_feeds_by_type(soup, 'application/rss+xml', "RSS")

    def _find_atom_feeds_in_header(self, soup):
        '''[Internal]
        Searches for Atom feeds.
        <link type="application/atom+xml" rel="alternate" ...>
        Returns a list. Either empty if no feed URL was found or with Feed
        objects. Marks the shortest feed URL as the main one.
        Analogous to _find_rss_feeds_in_header().'''
        return self._find_feeds_by_type(soup, 'application/atom+xml', "Atom")

    def _find_shortest_feed_url_and_update(self, feeds):
        '''[Internal]
        Searches for the shortest url in all the feeds. This is probably
        the main feed (url). It will also update the feed list and mark the
        shortest feed as the main one.
        If feeds is None or empty then None is returned. Otherwise the
        shortest feed will be returned; on a tie the first one wins.'''
        if not feeds:
            return None
        # min() returns the first minimal item, matching a strict "<" scan.
        shortest_feed = min(feeds, key=lambda feed: len(feed.url))
        shortest_feed.is_main = True
        return shortest_feed

    def _find_feeds_in_header(self, html):
        '''[Internal]
        Parses the given HTML string with BeautifulSoup4 and searches for
        either RSS or Atom feed URLs. If both are available they will be
        concatenated (RSS first). If no feed URL is found, an empty list
        will be returned.
        '''
        soup = BeautifulSoup(html, 'html.parser')
        return (self._find_rss_feeds_in_header(soup)
                + self._find_atom_feeds_in_header(soup))

    def find_feeds(self, url):
        '''Tries to find RSS or Atom feeds for the given URL.
        Returns a list of Feed objects, or an empty list if nothing was
        found or the page could not be downloaded/parsed. Errors are never
        propagated; with DEBUG enabled they are printed instead.'''
        try:
            html = self._get_html(url)
            return self._find_feeds_in_header(html)
        except ValueError:
            # Only the final line of the traceback is printed for ValueError.
            if DEBUG:
                import traceback
                print(traceback.format_exc().splitlines()[-1])
            return []
        except Exception:
            # Best effort: any other failure yields "no feeds". Exception
            # (not a bare except) lets KeyboardInterrupt/SystemExit through.
            if DEBUG:
                import traceback
                print(traceback.format_exc())
            return []

    def find_feeds_m(self, urls):
        '''Tries to find feeds for a list of URLs. Groups the results for each
        URL: returns a dict mapping every URL to its list of feeds.
        None is treated like an empty list.'''
        if urls is None:
            return {}
        return {url: self.find_feeds(url) for url in urls}
def main(argv):
    '''Command line entry point.

    argv -- full argument vector (program name followed by the URLs).

    Prints the feeds found for every URL, the main feed of each type marked
    with an asterisk. Returns the process exit status: 1 if no URL was
    given, 0 otherwise.'''
    if len(argv) < 2:
        print('>> No url parameter given!')
        print('>> Execute:\n %s url1 [url2 [...]]' % argv[0])
        return 1
    result = FeedFinder().find_feeds_m(argv[1:])
    for url, feeds in result.items():
        print(url)
        for feed in feeds:
            print(' %s %s - %s' % (('*' if feed.is_main else ' '),
                                   feed.url, feed.title))
        print()
    return 0


if __name__ == '__main__':
    # sys.exit instead of quit(): quit is only injected by the site module
    # for interactive use and is missing under "python -S".
    sys.exit(main(sys.argv))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment