Skip to content

Instantly share code, notes, and snippets.

@hahastudio
Last active August 29, 2015 14:04
Show Gist options
  • Save hahastudio/dcfc9ec8d2bc7e548aee to your computer and use it in GitHub Desktop.
Save hahastudio/dcfc9ec8d2bc7e548aee to your computer and use it in GitHub Desktop.
A simple MapReduce model in Python that introduces the concept of MapReduce through the word count problem
from itertools import groupby
# A sample input for the word count problem: each string is one input record.
source = ["Here is the first line in this source",
          "And Here is the second line in this source",
          "Welcome to the third line in this source"]

# Map stage: turn every line into a list of (word, 1) pairs, lower-casing
# each word so that "Here" and "here" are counted together.
# A list comprehension is used instead of map() so that map_result is a real
# list on both Python 2 and Python 3 (on Python 3, map() returns a one-shot
# iterator, which would not match the output shown below).
map_result = [[(word.lower(), 1) for word in line.split()] for line in source]
# [[('here', 1), ('is', 1), ('the', 1), ('first', 1),
#   ('line', 1), ('in', 1), ('this', 1), ('source', 1)],
#  [('and', 1), ('here', 1), ('is', 1), ('the', 1), ('second', 1),
#   ('line', 1), ('in', 1), ('this', 1), ('source', 1)],
#  [('welcome', 1), ('to', 1), ('the', 1), ('third', 1),
#   ('line', 1), ('in', 1), ('this', 1), ('source', 1)]]

# Combine stage: flatten the per-line lists into one list of pairs and sort
# it, so that pairs for the same word become adjacent. groupby in the reduce
# stage only merges consecutive items, so this ordering is required.
combine_result = sorted(pair for pairs in map_result for pair in pairs)
# [('and', 1),
#  ('first', 1),
#  ('here', 1), ('here', 1),
#  ('in', 1), ('in', 1), ('in', 1),
#  ('is', 1), ('is', 1),
#  ('line', 1), ('line', 1), ('line', 1),
#  ('second', 1),
#  ('source', 1), ('source', 1), ('source', 1),
#  ('the', 1), ('the', 1), ('the', 1),
#  ('third', 1),
#  ('this', 1), ('this', 1), ('this', 1),
#  ('to', 1),
#  ('welcome', 1)]

# Reduce stage: groupby yields one run of consecutive pairs per distinct
# word; adding up the counts in a run gives that word's total.
# sum() replaces reduce(lambda x, y: x + y, ...) because reduce is not a
# builtin on Python 3; the totals are the same.
reduce_result = [(word, sum(count for _, count in pairs))
                 for word, pairs in groupby(combine_result, lambda kv: kv[0])]
# [('and', 1),
#  ('first', 1),
#  ('here', 2),
#  ('in', 3),
#  ('is', 2),
#  ('line', 3),
#  ('second', 1),
#  ('source', 3),
#  ('the', 3),
#  ('third', 1),
#  ('this', 3),
#  ('to', 1),
#  ('welcome', 1)]
# And the whole pipeline as a single expression: map each line of `source`
# to (word, 1) pairs, flatten and sort them so equal words are adjacent,
# group the runs of equal words, and add up the counts of each run.
# sum() is used rather than reduce(), which is not a builtin on Python 3.
result = [(word, sum(count for _, count in pairs))
          for word, pairs in groupby(
              sorted((token.lower(), 1)
                     for line in source
                     for token in line.split()),
              lambda kv: kv[0])]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment