public
Last active

Script to fetch the CBC RSS feed and count the words

  • Download Gist
kik.py
Python
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90
#!/usr/bin/env python
import threading,thread
from lxml import etree, html
import urllib
from StringIO import StringIO
 
# Map step: emit a (word, 1) pair for every whitespace-separated token.
def mapper(document):
    for token in document.split():
        # drop single and double quote characters from the token
        cleaned = token.replace("'", "").replace('"', "")
        yield cleaned, 1
 
# The reduce step just sums the per-word list of 1s produced by the mapper.
# NOTE(review): this rebinds the builtin name `reduce` at module level;
# kept as-is because run() below calls it by this name.
reduce = sum

# Global word -> total count accumulator, shared by all MapReduce threads.
_words_count = dict()
 
class MapReduceThread(threading.Thread):
    """Thread that downloads one article, maps its words, and folds the
    resulting counts into the global _words_count dictionary."""

    # BUGFIX: the original called thread.allocate_lock() inside run(), so
    # every thread held its own private lock and the "with" block provided
    # no mutual exclusion at all.  A single lock shared by every instance
    # is required to protect the global _words_count dictionary.  Using
    # threading.Lock() also drops the Python-2-only `thread` module.
    _lock = threading.Lock()

    def __init__(self, link):
        threading.Thread.__init__(self)
        # URL of the article this thread will fetch and process
        self.link = link

    def run(self):
        # per-thread map output: word -> [1, 1, ...] occurrence list
        words_count = dict()
        page = html.parse(self.link)
        # each paragraph of the story body (the div with id=storybody)
        content = page.xpath("//div[@id='storybody']/p/text()")
        for doc in content:
            # map each paragraph independently; the reduce happens afterwards
            for word, weight in mapper(doc):
                # append the 1; create the empty list on first sight of a word
                words_count.setdefault(word, list()).append(weight)
        # reduce into the shared dictionary under the shared lock so
        # concurrent threads do not clobber each other's updates
        with MapReduceThread._lock:
            for word, group in words_count.items():
                # current global total for this word, 0 if unseen
                n = _words_count.setdefault(word, 0)
                _words_count[word] = n + reduce(group)
 
def main():
    """Fetch the CBC top-stories feed, word-count every linked article in
    parallel, and print the 50 most frequent words."""
    # feed URL
    URL = 'http://rss.cbc.ca/lineup/topstories.xml'
    # download the feed and parse the payload into an XML tree
    file = urllib.urlopen(URL)
    sio = StringIO(file.read())
    tree = etree.parse(sio)

    # all <link> elements in the feed: one article URL per item
    arr = tree.xpath('/rss/channel/item/link')

    # BUGFIX: the thread list must be created once, before the loop.  The
    # original reset `threads = []` on every iteration, so the join loop
    # below only ever waited on the most recently started thread.
    threads = []
    for rr in arr:
        try:
            worker = MapReduceThread(rr.text)
            worker.start()
            threads.append(worker)
        # BUGFIX: narrowed from a bare `except:` (which also swallowed
        # KeyboardInterrupt/SystemExit); Thread.start() signals failure
        # with RuntimeError.
        except RuntimeError:
            print("Error: unable to start thread")
    # wait for the MapReduce pass over all documents to finish
    for t in threads:
        t.join()

    # sort words by descending count and keep the first 50
    words_arr = sorted(_words_count, key=_words_count.__getitem__, reverse=True)[:50]

    # display rank, count and word
    count = 1
    for w in words_arr:
        print('%d %d %s' % (count, _words_count[w], w))
        count += 1


if __name__ == '__main__':
    main()

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.