@lrei
Created December 22, 2011 11:50
Fetch RSS Feeds Using Multiple Processes
#!/usr/bin/env python
import Queue
import multiprocessing
import urllib2
import feedparser
import socket
feeds = ['http://today.reuters.com/rss/topNews',
         'http://today.reuters.com/rss/domesticNews',
         'http://today.reuters.com/rss/worldNews',
         'http://hosted.ap.org/lineups/TOPHEADS-rss_2.0.xml',
         'http://hosted.ap.org/lineups/USHEADS-rss_2.0.xml',
         'http://hosted.ap.org/lineups/WORLDHEADS-rss_2.0.xml',
         'http://hosted.ap.org/lineups/POLITICSHEADS-rss_2.0.xml',
         'http://www.nytimes.com/services/xml/rss/nyt/HomePage.xml',
         'http://www.nytimes.com/services/xml/rss/nyt/International.xml',
         'http://news.google.com/?output=rss',
         'http://feeds.salon.com/salon/news',
         'http://www.foxnews.com/xmlfeed/rss/0,4313,0,00.rss',
         'http://www.foxnews.com/xmlfeed/rss/0,4313,80,00.rss',
         'http://www.foxnews.com/xmlfeed/rss/0,4313,81,00.rss',
         'http://rss.cnn.com/rss/edition.rss',
         'http://rss.cnn.com/rss/edition_world.rss',
         'http://rss.cnn.com/rss/edition_us.rss']
# timeout for feed fetch (in seconds)
FEED_TIMEOUT = 20
def fetch_urls(work_queue, results_queue):
    '''worker function - gets feed urls from the queue and parses the feeds'''
    while True:
        # grab a feed url from the queue
        try:
            feed_url = work_queue.get(block=False)
        except Queue.Empty:
            # an empty queue means there is no work left - end the worker
            break
        # download the feed
        try:
            feed = urllib2.urlopen(feed_url, timeout=FEED_TIMEOUT).read()
        except (urllib2.URLError, socket.timeout):
            continue  # ignore this url
        # parse the feed
        parsed_feed = feedparser.parse(feed)
        for entry in parsed_feed.entries:
            # push the entry links into the results queue
            if 'link' in entry:
                results_queue.put(entry.link)
def main():
    # create and populate the work queue with all the feed urls
    work_queue = multiprocessing.Queue()
    for feed in feeds:
        work_queue.put(feed)
    # create a results queue for the links extracted from the feeds
    results_queue = multiprocessing.Queue()
    # spawn one worker per feed and pass each the work queue & results queue
    workers = []
    for i in range(len(feeds)):
        worker = multiprocessing.Process(target=fetch_urls,
                                         args=(work_queue, results_queue))
        worker.start()
        workers.append(worker)
    # wait for all the workers to finish
    for worker in workers:
        worker.join()


if __name__ == '__main__':
    main()
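
As written, main() never reads the links the workers put on results_queue; the queue is filled and then discarded when the processes exit. Below is a minimal sketch of a drain loop that could replace the final join loop in main(). The links list, the one-second timeout, and the closing print are illustrative assumptions, not part of the original gist; reading the queue before join() also avoids a worker blocking on a queue that nothing consumes.

def main():
    # ... create the queues and start the workers exactly as above ...
    # drain the results queue while the workers run, then join them
    links = []
    while any(w.is_alive() for w in workers) or not results_queue.empty():
        try:
            links.append(results_queue.get(timeout=1))
        except Queue.Empty:
            pass
    for worker in workers:
        worker.join()
    print '%d links collected' % len(links)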