Skip to content

Instantly share code, notes, and snippets.

@adewes
Last active March 29, 2018 13:06
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save adewes/7583321 to your computer and use it in GitHub Desktop.
Save adewes/7583321 to your computer and use it in GitHub Desktop.
A map-reduce class in Python, with the typical "hello, world!" word-counting example. You can download the ulysses.txt file used in the example here: http://www.gutenberg.org/ebooks/4300
from collections import defaultdict
import abc
# The base class is created by calling ABCMeta directly instead of setting
# ``__metaclass__``: that attribute is ignored on Python 3, which silently
# disabled the ``@abc.abstractmethod`` checks. This form works on 2 and 3.
class MapReducer(abc.ABCMeta("_AbstractMapReducerBase", (object,), {})):
    """Abstract base for a small, in-memory map-reduce job.

    Subclasses must implement :meth:`map` and :meth:`reduce`; they may
    override :meth:`filter` to pre-select the input. :meth:`mapreduce`
    drives the whole pipeline and returns a plain ``dict``.
    """

    @abc.abstractmethod
    def map(self, items):
        """Turn ONE input element into an iterable of ``(key, value)`` pairs.

        Despite its plural name (kept for backward compatibility), ``items``
        receives a single element of the input, because :meth:`mapreduce`
        calls this method once per element.
        """
        return []

    @abc.abstractmethod
    def reduce(self, key, values):
        """Combine the list of ``values`` collected for ``key`` into one result."""
        return []

    def filter(self, items):
        """Return the elements that should be mapped; the default keeps all."""
        return items

    def mapreduce(self, items):
        """Run filter -> map -> group by key -> reduce over ``items``.

        Returns a ``dict`` mapping every key emitted by :meth:`map` to the
        value :meth:`reduce` produced for it. Empty input yields ``{}``.
        """
        grouped_results = defaultdict(list)
        for item in self.filter(items):
            for pair in self.map(item):
                # Falsy emissions (e.g. None or an empty tuple) are skipped,
                # exactly as the original comprehension did.
                if not pair:
                    continue
                key, value = pair
                grouped_results[key].append(value)
        return {
            key: self.reduce(key, values)
            for key, values in grouped_results.items()
        }
import pprint
from mapreduce import MapReducer
from collections import defaultdict
class WordReducer(MapReducer):
    """Map-reduce job that counts how often each word occurs."""

    def map(self, item):
        """Tally the lower-cased tokens of one chunk of text.

        Tokens are split on whitespace only, so punctuation attached to a
        word stays part of the token. Returns the ``(token, count)`` pairs
        of the tally for this chunk.
        """
        tally = defaultdict(int)
        tokens = item.strip().split()
        for token in tokens:
            lowered = token.lower()
            tally[lowered] += 1
        return tally.items()

    def reduce(self, key, values):
        """Add up the per-chunk counts collected for ``key``."""
        total = sum(values)
        return total
if __name__ == '__main__':
    # Load the whole text and break it into lines; each line is one map input.
    with open('ulysses.txt', 'r') as source:
        text = source.read()
    lines = text.split("\n")

    result = WordReducer().mapreduce(lines)

    # Rank words by descending count (the sort is stable, so ties keep their
    # original relative order) and show the 100 most frequent.
    ranked = sorted(result.items(), key=lambda pair: pair[1], reverse=True)
    pprint.pprint(ranked[:100])
[('the', 14850),
('of', 8212),
('and', 7051),
('a', 6403),
('to', 4912),
('in', 4796),
('he', 3620),
('his', 3275),
('with', 2485),
('i', 2442),
('that', 2336),
('was', 2037),
('on', 1979),
('for', 1901),
('it', 1874),
('you', 1604),
('her', 1585),
('is', 1301),
('at', 1279),
('by', 1247),
('all', 1207),
('as', 1166),
('him', 1123),
('from', 1075),
('she', 1021),
('or', 1004),
('they', 981),
('be', 846),
('not', 840),
('my', 815),
('had', 790),
('out', 782),
('what', 762),
('their', 710),
('like', 706),
('mr', 701),
('up', 674),
('have', 658),
('an', 652),
('me', 647),
('but', 644),
('one', 573),
('if', 539),
('so', 538),
('when', 531),
('there', 508),
('about', 504),
('them', 498),
('are', 498),
('were', 489),
('your', 486),
('said.', 480),
('then', 477),
('which', 474),
('old', 469),
('no', 462),
('says', 454),
('bloom', 442),
('this', 439),
('said', 428),
('who', 423),
('over', 401),
('we', 393),
('after', 388),
('down', 380),
('would', 369),
('do', 364),
('did', 358),
('two', 345),
('into', 328),
('see', 317),
('stephen', 316),
('will', 316),
('those', 311),
('off', 309),
('its', 305),
('some', 305),
('could', 297),
('our', 284),
('bloom:', 284),
('man', 283),
('little', 283),
('has', 282),
('other', 282),
('good', 276),
('where', 275),
('too', 274),
('said,', 265),
('more', 261),
('back', 256),
('time', 254),
('now', 252),
('get', 251),
('only', 243),
('know', 243),
('through', 242),
('how', 242),
('it.', 239),
('just', 238),
('never', 236)]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment