Skip to content

Instantly share code, notes, and snippets.

@holdenweb
Forked from selenamarie/a_better_opml.py
Last active December 11, 2015 09:59
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save holdenweb/4583933 to your computer and use it in GitHub Desktop.
import tweepy
from BeautifulSoup import BeautifulSoup as parser
import urllib
# Twitter OAuth credentials -- fill these in before running the script.
# All four values come from your application's settings page on
# dev.twitter.com; the script cannot authenticate while they are empty.
consumer_key=''
consumer_secret=''
access_token=''
access_token_secret=''
# http://stackoverflow.com/questions/7883581/automatically-extracting-feed-links-atom-rss-etc-from-webpages
def detect_feeds_in_HTML(input_stream):
    """Examine an open HTML text stream and collect the feed URLs it references.

    Feeds are advertised in HTML via ``<link rel="alternate" href="...">``
    tags; this scans for all such tags and returns their ``href`` values.

    :param input_stream: an arbitrary opened input stream that has a
        :func:`read` method (e.g. an open file or URL handle).
    :type input_stream: an input stream (e.g. open file or URL)
    :return: a list of feed URLs
    :rtype: ``list(str)``
    :raises TypeError: if *input_stream* has no ``read`` method.
    """
    # Check that we really got an input stream, not e.g. a URL string.
    if not hasattr(input_stream, "read"):
        raise TypeError("An opened input *stream* should be given, was %s instead!" % type(input_stream))
    result = []
    # Parse the textual HTML pulled from the stream.
    html = parser(input_stream.read())
    # Find every <link> tag carrying a rel="alternate" attribute.
    feed_urls = html.findAll("link", rel="alternate")
    for feed_link in feed_urls:
        url = feed_link.get("href", None)
        # Only keep links that actually carry an href.
        if url:
            result.append(url)
    return result
def chunks(l, n):
    """Yield successive slices of *l*, each at most *n* items long."""
    start = 0
    while start < len(l):
        yield l[start:start + n]
        start += n
# Authenticate against the Twitter API using the OAuth credentials above.
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)
# NOTE(review): `me` is never referenced again below -- presumably left
# over from an earlier revision; confirm before removing.
me = api.me()
# IDs of every account the authenticated user follows.
friends = api.friends_ids()
# OPML skeleton adapted from https://gist.github.com/1051517
# Opening of the OPML document: XML declaration, head, and the outer
# <outline> that groups every feed entry.
opml_start = """<?xml version="1.0" encoding="UTF-8"?>
<opml version="1.1">
<head>
<title>People I follow</title>
</head>
<body>
<outline text="People I follow" title="People I follow">"""
# Closing tags matching opml_start.
opml_end = """</outline>
</body>
</opml>"""
# One <outline> entry per discovered feed, filled in via %-formatting
# with the keys 'title', 'html_url' and 'xml_url'.
opml_outline_feed = '<outline text="%(title)s" title="%(title)s" type="rss" version="RSS" htmlUrl="%(html_url)s" xmlUrl="%(xml_url)s" />'
# Walk the follow list in 100-ID batches (the lookup_users limit), fetch
# each user's homepage, and emit one OPML outline per feed found there.
from xml.sax.saxutils import escape

def _xml_attr(value):
    """Escape *value* for safe embedding in a double-quoted XML attribute."""
    return escape(value, {'"': '&quot;'})

print(opml_start)
for batch in chunks(friends, 100):
    users = api.lookup_users(batch)
    for u in users:
        if u.url:
            print("<!-- %s -->" % u.screen_name)
            try:
                site = urllib.urlopen(u.url)
                feed_urls = detect_feeds_in_HTML(site)
                for feed_url in feed_urls:
                    # Escape user-controlled text: names/URLs containing
                    # & or " would otherwise produce invalid XML.
                    print(opml_outline_feed % {
                        'title': _xml_attr(u.name + feed_url),
                        'html_url': _xml_attr(u.url),
                        'xml_url': _xml_attr(feed_url),
                    })
            except Exception as e:
                # Best-effort: one unreachable/broken site must not abort
                # the whole export; record the failure as an XML comment.
                print("<!-- fail: %s: %s -->" % (e.__class__, str(e)))
print(opml_end)
@holdenweb
Copy link
Author

Slightly better error handling at https://gist.github.com/4583933

@selenamarie
Copy link

Heh. I added that into a new revision that also does that whole HTML entity thing. And it creates real URLs out of the relative feed URLs passed in by most of the internet.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment