Skip to content

Instantly share code, notes, and snippets.

@tos-kamiya
Last active December 20, 2015 16:39
Show Gist options
  • Save tos-kamiya/6162987 to your computer and use it in GitHub Desktop.
Save tos-kamiya/6162987 to your computer and use it in GitHub Desktop.
study of mrjob #2
import re
from collections import Counter
from itertools import groupby

from mrjob.job import MRJob
# Matches one "word": a run of one or more word characters (\w) or apostrophes.
WORD_RE = re.compile(r"[\w']+")
class WordCollocationCount(MRJob):
    """Count how often each word is immediately followed by another word.

    Every output record has the form ``left_word, (right_word, count)``.
    """

    def mapper(self, _, line):
        """Yield ``left, (right, 1)`` for each adjacent word pair in *line*.

        Words are extracted with ``WORD_RE`` and lower-cased. Pairs are formed
        within a single line only; a line with fewer than two words yields
        nothing.
        """
        words = [word.lower() for word in WORD_RE.findall(line)]
        for left, right in zip(words, words[1:]):
            yield left, (right, 1)

    def combiner(self, left_word, right_word_counts):
        """Yield partial ``left_word, (right_word, count)`` sums."""
        totals = self._sum_counts(right_word_counts)
        for right_word in sorted(totals):
            yield left_word, (right_word, totals[right_word])

    def reducer(self, left_word, right_word_counts):
        """Yield the final ``left_word, (right_word, count)`` totals."""
        totals = self._sum_counts(right_word_counts)
        for right_word in sorted(totals):
            yield left_word, (right_word, totals[right_word])

    @staticmethod
    def _sum_counts(right_word_counts):
        """Return a Counter mapping each right-hand word to its summed count.

        *right_word_counts* is an iterable of ``(right_word, count)`` pairs
        (two-element tuples or lists). The pairs are accumulated in a mapping
        rather than with ``itertools.groupby`` because ``groupby`` only merges
        *consecutive* equal keys, and the values for a key are not guaranteed
        to arrive sorted; the same right-hand word could otherwise be emitted
        several times with partial counts.
        """
        totals = Counter()
        for right_word, count in right_word_counts:
            totals[right_word] += count
        return totals
if __name__ == '__main__':
    # Run the job only when this file is executed as a script, not on import.
    WordCollocationCount.run()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment