Skip to content

Instantly share code, notes, and snippets.

@holdenweb
Forked from selenamarie/a_better_opml.py
Last active December 11, 2015 09:59
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save holdenweb/4583933 to your computer and use it in GitHub Desktop.
import tweepy
from BeautifulSoup import BeautifulSoup as parser
import urllib
# Twitter OAuth credentials -- fill these in before running the script.
# All four values come from your application's settings page on
# dev.twitter.com; the script cannot authenticate while they are empty.
consumer_key=''
consumer_secret=''
access_token=''
access_token_secret=''
# http://stackoverflow.com/questions/7883581/automatically-extracting-feed-links-atom-rss-etc-from-webpages
def detect_feeds_in_HTML(input_stream):
    """Examine an open HTML text stream and collect the feed URLs it references.

    Feeds are advertised in HTML via ``<link rel="alternate" href="...">``
    tags; this scans for all such tags and returns their ``href`` values.

    :param input_stream: an arbitrary opened input stream that has a
        :func:`read` method (e.g. an open file or URL handle).
    :type input_stream: an input stream (e.g. open file or URL)
    :return: a list of feed URLs
    :rtype: ``list(str)``
    :raises TypeError: if *input_stream* has no ``read`` method.
    """
    # Check that we really got an input stream, not e.g. a URL string.
    if not hasattr(input_stream, "read"):
        raise TypeError("An opened input *stream* should be given, was %s instead!" % type(input_stream))
    result = []
    # Parse the textual HTML pulled from the stream.
    html = parser(input_stream.read())
    # Find every <link> tag carrying a rel="alternate" attribute.
    feed_urls = html.findAll("link", rel="alternate")
    for feed_link in feed_urls:
        url = feed_link.get("href", None)
        # Only keep links that actually carry an href.
        if url:
            result.append(url)
    return result
def chunks(l, n):
    """Yield successive slices of *l*, each at most *n* items long."""
    start = 0
    while start < len(l):
        yield l[start:start + n]
        start += n
# Authenticate against the Twitter API using the OAuth credentials above.
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)
# NOTE(review): `me` is never referenced again below -- presumably left
# over from an earlier revision; confirm before removing.
me = api.me()
# IDs of every account the authenticated user follows.
friends = api.friends_ids()
# OPML skeleton adapted from https://gist.github.com/1051517
# Opening of the OPML document: XML declaration, head, and the outer
# <outline> that groups every feed entry.
opml_start = """<?xml version="1.0" encoding="UTF-8"?>
<opml version="1.1">
<head>
<title>People I follow</title>
</head>
<body>
<outline text="People I follow" title="People I follow">"""
# Closing tags matching opml_start.
opml_end = """</outline>
</body>
</opml>"""
# One <outline> entry per discovered feed, filled in via %-formatting
# with the keys 'title', 'html_url' and 'xml_url'.
opml_outline_feed = '<outline text="%(title)s" title="%(title)s" type="rss" version="RSS" htmlUrl="%(html_url)s" xmlUrl="%(xml_url)s" />'
# Walk the follow list in 100-ID batches (the lookup_users limit), fetch
# each user's homepage, and emit one OPML outline per feed found there.
from xml.sax.saxutils import escape

def _xml_attr(value):
    """Escape *value* for safe embedding in a double-quoted XML attribute."""
    return escape(value, {'"': '&quot;'})

print(opml_start)
for batch in chunks(friends, 100):
    users = api.lookup_users(batch)
    for u in users:
        if u.url:
            print("<!-- %s -->" % u.screen_name)
            try:
                site = urllib.urlopen(u.url)
                feed_urls = detect_feeds_in_HTML(site)
                for feed_url in feed_urls:
                    # Escape user-controlled text: names/URLs containing
                    # & or " would otherwise produce invalid XML.
                    print(opml_outline_feed % {
                        'title': _xml_attr(u.name + feed_url),
                        'html_url': _xml_attr(u.url),
                        'xml_url': _xml_attr(feed_url),
                    })
            except Exception as e:
                # Best-effort: one unreachable/broken site must not abort
                # the whole export; record the failure as an XML comment.
                print("<!-- fail: %s: %s -->" % (e.__class__, str(e)))
print(opml_end)
@holdenweb
Copy link
Author

Slightly better error handling at https://gist.github.com/4583933

@selenamarie
Copy link

Heh. I added that into a new revision that also does that whole HTML entity thing. And it creates real URLs out of the relative feed URLs passed in by most of the internet.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment