Script to fetch a CBC RSS feed and count the words

kik.py
#!/usr/bin/env python
import threading,thread
from lxml import etree, html
import urllib
from StringIO import StringIO
 
# mapper function
def mapper(document):
    words = document.split()
    for w in words:
        # filter the string: strip quote characters
        w = w.replace("'", "").replace('"', '')
        yield w, 1
 
#reduce function is just a sum up function
reduce = sum
 
# global words counter
_words_count = dict()
 
# single lock shared by all threads to protect the global counter
_lock = thread.allocate_lock()

class MapReduceThread(threading.Thread):
    def __init__(self, link):
        threading.Thread.__init__(self)
        self.link = link

    def run(self):
        words_count = dict()
        page = html.parse(self.link)
        # xpath to get the text of each paragraph inside the div with id=storybody
        content = page.xpath("//div[@id='storybody']/p/text()")
        for doc in content:
            # map each document (one mapper per paragraph, run inside this thread;
            # the reduce is triggered afterwards)
            for word, weight in mapper(doc):
                # setdefault returns the list for that word, creating an empty one
                # if the key is not there yet
                words_count.setdefault(word, list()).append(weight)
        with _lock:
            # reduce: merge this thread's counts into the global counter
            for word, group in words_count.items():
                # get the current count of that word, or 0 if it is not there yet
                n = _words_count.setdefault(word, 0)
                _words_count[word] = n + reduce(group)
 
def main():
    # feed URL
    URL = 'http://rss.cbc.ca/lineup/topstories.xml'
    # open the feed
    feed = urllib.urlopen(URL)
    sio = StringIO(feed.read())
    # parse the content into an XML tree
    tree = etree.parse(sio)

    # xpath to find all article links in the feed
    arr = tree.xpath('/rss/channel/item/link')

    # start one MapReduce thread per article link
    threads = []
    for rr in arr:
        try:
            t = MapReduceThread(rr.text)
            t.start()
            threads.append(t)
        except:
            print "Error: unable to start thread"

    # wait for MapReduce on all documents
    for t in threads:
        t.join()

    # sort by count and keep the first 50
    words_arr = sorted(_words_count, key=_words_count.__getitem__, reverse=True)[:50]

    # display them
    count = 1
    for w in words_arr:
        print '%d %d %s' % (count, _words_count[w], w)
        count += 1
 
 
if __name__ == '__main__':
    main()
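
To try the gist, a quick sketch of the usage, assuming a Python 2 interpreter and that lxml is available (installable, for example, with pip install lxml):

    $ python kik.py

The script prints up to 50 lines, most frequent word first, each in the form '<rank> <count> <word>' as produced by the final print statement; the actual words and counts depend on whatever the CBC top-stories feed links to when the script runs.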