Skip to content

Instantly share code, notes, and snippets.

@Lanny
Last active March 4, 2018 03:24
Show Gist options
  • Save Lanny/74a622a7a3db30874b565bf9fc6bf7b5 to your computer and use it in GitHub Desktop.
Save Lanny/74a622a7a3db30874b565bf9fc6bf7b5 to your computer and use it in GitHub Desktop.
import urllib2
import time
from collections import defaultdict
def run_mr(mapf, reducef, data):
    """Run a sequential, in-process map/reduce job over ``data``.

    Args:
        mapf: Callable invoked as ``mapf(key, value, emit)`` once for every
            pair in ``data.items()``.  It reports intermediate results by
            calling ``emit(key, value)`` any number of times.
        reducef: Callable invoked as ``reducef(key, values, emit)`` once for
            every distinct intermediate key, where ``values`` is the list of
            everything emitted under that key, in emission order.  It reports
            results by calling ``emit(value)``.
        data: Object exposing ``items()`` that yields ``(key, value)`` pairs.

    Returns:
        A list holding every value the reducers emitted, in emission order.
    """
    grouped = defaultdict(list)
    results = []

    def collect_result(value):
        # Reducer-side emitter: appends to the list returned to the caller.
        results.append(value)

    def collect_intermediate(key, value):
        # Mapper-side emitter: buckets values by their intermediate key.
        grouped[key].append(value)

    # Map phase.  The original author marked this loop as the place where
    # parallelism would be inserted; as written it runs one item at a time.
    for input_key, input_value in data.items():
        mapf(input_key, input_value, collect_intermediate)

    # Reduce phase: one call per intermediate key with all of its values.
    for group_key, group_values in grouped.items():
        reducef(group_key, group_values, collect_result)

    return results
# Job input: maps each URL to fetch to the site label its timing is grouped
# under.  The URL is the dict key because keys must be unique: keying by site
# label (as this literal originally did) makes later entries silently replace
# earlier ones, so only one URL per site would ever be fetched.
urls = {
    'https://en.wikipedia.org/wiki/Jeff_Dean_(computer_scientist)': 'wikipedia',
    'https://en.wikipedia.org/wiki/Sanjay_Ghemawat': 'wikipedia',
    'https://research.google.com/archive/mapreduce.html': 'google',
    'https://research.google.com/': 'google',
}


def mapf(key, value, emitIntr):
    """Time one complete download of a URL and emit it under its site label.

    Args:
        key: URL to fetch (a key of ``urls``).
        value: Site label the measurement is grouped under (the matching
            value of ``urls``).
        emitIntr: Callable invoked once as ``emitIntr(value, seconds)`` where
            ``seconds`` is the wall-clock time, measured with ``time.time()``,
            from just before opening the URL until the body has been read.

    Any exception raised by ``urllib2.urlopen`` or ``read`` propagates to the
    caller and nothing is emitted.
    """
    start = time.time()
    response = urllib2.urlopen(key)
    try:
        response.read()
        # Stop the clock before closing so the measured interval matches
        # "open + read", not the cleanup.
        end = time.time()
    finally:
        # Always release the connection, even if read() fails.
        response.close()
    emitIntr(value, end - start)
def reducef(key, values, emitFinal):
    """Emit ``(key, mean)`` for the values collected under ``key``.

    Args:
        key: Intermediate key the values were grouped under.
        values: Sized collection of numbers to average.
        emitFinal: Callable invoked once with the ``(key, mean)`` tuple.
    """
    total = sum(values)
    # Convert the count to float so integer inputs are not floor-divided.
    sample_count = float(len(values))
    mean = total / sample_count
    emitFinal((key, mean))
# Run the job over ``urls`` and print the list of results emitted by reducef.
print(run_mr(mapf, reducef, urls))
# Sample run: [('google', 0.15115809440612793), ('wikipedia', 0.08656191825866699)]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment