Last active
March 29, 2018 13:06
-
-
Save adewes/7583321 to your computer and use it in GitHub Desktop.
A map-reduce class in Python, with the typical "hello, world!" word-counting example. You can download the ulysses.txt file used in the example here: http://www.gutenberg.org/ebooks/4300
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from collections import defaultdict | |
import abc | |
# A class-body ``__metaclass__`` attribute is only honoured by Python 2;
# Python 3 ignores it, which silently disables ``@abc.abstractmethod``.
# Building the base by calling ``ABCMeta`` directly works on both versions.
_AbstractBase = abc.ABCMeta("_AbstractBase", (object,), {})


class MapReducer(_AbstractBase):
    """Abstract base for a minimal in-memory map/reduce pipeline.

    Subclasses must implement :meth:`map` and :meth:`reduce`; they may also
    override :meth:`filter`. Call :meth:`mapreduce` to run the pipeline.
    """

    @abc.abstractmethod
    def map(self, items):
        """Turn one input element into an iterable of ``(key, value)`` pairs.

        Despite the parameter name, :meth:`mapreduce` calls this once per
        element of the filtered input, not once for the whole collection.
        The base implementation returns an empty list.
        """
        return []

    @abc.abstractmethod
    def reduce(self, key, values):
        """Combine the list of ``values`` collected for ``key`` into a result.

        The base implementation returns an empty list.
        """
        return []

    def filter(self, items):
        """Return the elements that should be mapped; by default all of them."""
        return items

    def mapreduce(self, items):
        """Run filter -> map -> group by key -> reduce over ``items``.

        Returns a dict mapping every key emitted by :meth:`map` to the value
        of ``self.reduce(key, values)``, where ``values`` lists the values
        emitted for that key in emission order.
        """
        grouped_results = defaultdict(list)
        for item in self.filter(items):
            for pair in self.map(item):
                # Falsy entries (e.g. None or an empty tuple) are skipped
                # rather than unpacked.
                if not pair:
                    continue
                key, value = pair
                grouped_results[key].append(value)
        return {
            key: self.reduce(key, values)
            for key, values in grouped_results.items()
        }
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pprint | |
from mapreduce import MapReducer | |
from collections import defaultdict | |
class WordReducer(MapReducer):
    """Word-frequency counter built on top of MapReducer."""

    def map(self, item):
        """Return ``(word, count)`` pairs for a single line of text.

        Words are the whitespace-separated tokens of ``item``, lower-cased;
        any punctuation attached to a token is kept as part of the word.
        """
        counts = defaultdict(int)
        tokens = item.strip().split()
        for token in tokens:
            counts[token.lower()] += 1
        return counts.items()

    def reduce(self, key, values):
        """Add up the per-line counts collected for ``key``."""
        total = sum(values)
        return total
if __name__ == '__main__':
    # Read the whole book and break it into lines on "\n".
    with open('ulysses.txt', 'r') as book:
        text = book.read()
    lines = text.split("\n")

    # Count the words across all lines.
    counter = WordReducer()
    result = counter.mapreduce(lines)

    # Sort by descending count and print the first 100 entries.
    ranked = sorted(result.items(), key=lambda entry: -entry[1])
    top_words = ranked[:100]
    pprint.pprint(top_words)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
[('the', 14850), | |
('of', 8212), | |
('and', 7051), | |
('a', 6403), | |
('to', 4912), | |
('in', 4796), | |
('he', 3620), | |
('his', 3275), | |
('with', 2485), | |
('i', 2442), | |
('that', 2336), | |
('was', 2037), | |
('on', 1979), | |
('for', 1901), | |
('it', 1874), | |
('you', 1604), | |
('her', 1585), | |
('is', 1301), | |
('at', 1279), | |
('by', 1247), | |
('all', 1207), | |
('as', 1166), | |
('him', 1123), | |
('from', 1075), | |
('she', 1021), | |
('or', 1004), | |
('they', 981), | |
('be', 846), | |
('not', 840), | |
('my', 815), | |
('had', 790), | |
('out', 782), | |
('what', 762), | |
('their', 710), | |
('like', 706), | |
('mr', 701), | |
('up', 674), | |
('have', 658), | |
('an', 652), | |
('me', 647), | |
('but', 644), | |
('one', 573), | |
('if', 539), | |
('so', 538), | |
('when', 531), | |
('there', 508), | |
('about', 504), | |
('them', 498), | |
('are', 498), | |
('were', 489), | |
('your', 486), | |
('said.', 480), | |
('then', 477), | |
('which', 474), | |
('old', 469), | |
('no', 462), | |
('says', 454), | |
('bloom', 442), | |
('this', 439), | |
('said', 428), | |
('who', 423), | |
('over', 401), | |
('we', 393), | |
('after', 388), | |
('down', 380), | |
('would', 369), | |
('do', 364), | |
('did', 358), | |
('two', 345), | |
('into', 328), | |
('see', 317), | |
('stephen', 316), | |
('will', 316), | |
('those', 311), | |
('off', 309), | |
('its', 305), | |
('some', 305), | |
('could', 297), | |
('our', 284), | |
('bloom:', 284), | |
('man', 283), | |
('little', 283), | |
('has', 282), | |
('other', 282), | |
('good', 276), | |
('where', 275), | |
('too', 274), | |
('said,', 265), | |
('more', 261), | |
('back', 256), | |
('time', 254), | |
('now', 252), | |
('get', 251), | |
('only', 243), | |
('know', 243), | |
('through', 242), | |
('how', 242), | |
('it.', 239), | |
('just', 238), | |
('never', 236)] |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment