Skip to content

@arthurnn /kik.py
Created

Embed URL

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.
Download ZIP
Script to fetch the CBC RSS feed and count the words
#!/usr/bin/env python
import threading,thread
from lxml import etree, html
import urllib
from StringIO import StringIO
def mapper(document):
    """Map step: emit a ``(word, 1)`` pair for each word in *document*.

    The text is split on whitespace and every single- and double-quote
    character is removed from each token. Tokens that are empty once the
    quotes are removed (for example a lone ``"``) are skipped, so the empty
    string is never counted as a word.

    Args:
        document: Text of one document, as a string.

    Yields:
        ``(word, 1)`` tuples, in the order the words appear in the text.
    """
    for token in document.split():
        # Filter the string: drop quote characters so that a quoted word and
        # its unquoted form are counted under the same key.
        word = token.replace("'", "").replace('"', '')
        if word:
            yield word, 1
# Reduce step: collapsing a word's list of weights is a plain summation.
reduce = sum
# Module-level table mapping each word to its running occurrence count.
_words_count = {}
class MapReduceThread(threading.Thread):
    """Worker thread that counts the words of a single page.

    ``run`` parses the page at ``link``, maps each paragraph text found under
    the element with ``id="storybody"`` to ``(word, 1)`` pairs via ``mapper``,
    groups the weights per word, and finally reduces every group into the
    module-level ``_words_count`` table.
    """

    # One lock shared by every instance. A lock allocated inside ``run`` would
    # be private to that thread and therefore would not serialize the updates
    # to the shared ``_words_count`` table.
    _merge_lock = threading.Lock()

    def __init__(self, link):
        """Create a worker for *link*, the page location given to ``html.parse``."""
        threading.Thread.__init__(self)
        self.link = link

    def run(self):
        """Parse the page, count its words, and merge them into ``_words_count``."""
        grouped = {}
        page = html.parse(self.link)
        # XPath: text nodes of each <p> directly inside the div whose
        # id is "storybody".
        paragraphs = page.xpath("//div[@id='storybody']/p/text()")
        for doc in paragraphs:
            # Map each document on its own so the mapping can run in this
            # thread and the reduce can be triggered afterwards.
            for word, weight in mapper(doc):
                # Group the weights per word, creating the list on first use.
                grouped.setdefault(word, []).append(weight)
        # Reduce under the shared lock: the read-modify-write on the global
        # table must not interleave with other worker threads.
        with self._merge_lock:
            for word, group in grouped.items():
                # Words not seen before start from a count of 0.
                _words_count[word] = _words_count.get(word, 0) + reduce(group)
def main():
    """Fetch the CBC top-stories feed and print the 50 most frequent words.

    One ``MapReduceThread`` is started per ``<link>`` element of the feed.
    After all started threads have been joined, the words accumulated in the
    module-level ``_words_count`` table are sorted by descending count and the
    first 50 are printed as ``rank count word`` lines.
    """
    feed_url = 'http://rss.cbc.ca/lineup/topstories.xml'
    # Open the feed and make sure the handle is closed even if read() fails.
    feed = urllib.urlopen(feed_url)
    try:
        sio = StringIO(feed.read())
    finally:
        feed.close()
    # Parse the content into an XML tree.
    tree = etree.parse(sio)
    # XPath to find all links in the feed.
    links = tree.xpath('/rss/channel/item/link')
    # The list must live outside the loop: re-creating it on every iteration
    # would leave only the last worker to be joined below.
    threads = []
    for item in links:
        worker = MapReduceThread(item.text)
        try:
            worker.start()
        except (RuntimeError, thread.error):
            # Single-argument parenthesized print emits the same text as the
            # print statement.
            print("Error: unable to start thread")
        else:
            threads.append(worker)
    # Wait for the map/reduce of every document to finish.
    for worker in threads:
        worker.join()
    # Sort the words by descending count and keep the first 50.
    words_arr = sorted(_words_count, key=_words_count.__getitem__, reverse=True)[:50]
    # Display them with a 1-based rank.
    for rank, word in enumerate(words_arr, 1):
        print('%d %d %s' % (rank, _words_count[word], word))
# Run the word count only when this file is executed as a script.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Something went wrong with that request. Please try again.