Skip to content

Instantly share code, notes, and snippets.

@satomacoto
Created January 13, 2012 10:52
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save satomacoto/1605518 to your computer and use it in GitHub Desktop.
Save satomacoto/1605518 to your computer and use it in GitHub Desktop.
MapReduce in Python
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Local (non-Hadoop) counter: reads tab-separated
# "title, author, moji, rubi" records from standard input and writes one
# "moji, rubi, count" record per distinct (moji, rubi) pair.
import sys
from collections import defaultdict


def count_pairs(lines):
    """Count how often each (moji, rubi) pair occurs in *lines*.

    Each line must hold exactly four tab-separated fields:
    title, author, moji, rubi.  A line with a different number of
    fields raises ValueError.

    Returns a defaultdict mapping (moji, rubi) tuples to their counts.
    """
    counts = defaultdict(int)
    for line in lines:
        # rstrip('\n') rather than line[:-1]: a final line without a
        # trailing newline must not lose its last real character.
        _title, _author, moji, rubi = line.rstrip('\n').split('\t')
        counts[moji, rubi] += 1
    return counts


def format_counts(counts):
    """Yield one newline-terminated, tab-separated record per counted pair."""
    for (moji, rubi), total in counts.items():
        yield '%s\t%s\t%d\n' % (moji, rubi, total)


if __name__ == '__main__':
    sys.stdout.writelines(format_counts(count_pairs(sys.stdin)))
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Streaming mapper: reads tab-separated "title, author, moji, rubi"
# records from standard input and writes "moji, rubi, 1" for each one.
import sys


def map_lines(lines):
    """Yield a newline-terminated "moji<TAB>rubi<TAB>1" record per input line.

    Each line must hold exactly four tab-separated fields:
    title, author, moji, rubi.  A line with a different number of
    fields raises ValueError.
    """
    for line in lines:
        # Strip only the line terminator; line[:-1] would eat the last
        # character of a final line that has no trailing newline.
        _title, _author, moji, rubi = line.rstrip('\n').split('\t')
        yield '%s\t%s\t%s\n' % (moji, rubi, 1)


if __name__ == '__main__':
    # Standard input -> standard output.
    sys.stdout.writelines(map_lines(sys.stdin))
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Streaming reducer: reads the mapper's "moji, rubi, count" records from
# standard input and writes one summed record per run of identical keys.
import sys
from itertools import groupby
from operator import itemgetter


def parse_records(lines):
    """Yield (word, count) pairs parsed from mapper output lines.

    *word* is the "moji<TAB>rubi" key and *count* the integer count.
    Lines whose count field is not an integer are silently skipped.
    A line that does not have exactly three tab-separated fields
    raises ValueError.
    """
    for line in lines:
        # Strip only the line terminator; line[:-1] would corrupt the
        # count on a final line that has no trailing newline.
        moji, rubi, count = line.rstrip('\n').split('\t')
        try:
            count = int(count)
        except ValueError:
            # Ignore the line if the count is not a number.
            continue
        yield '%s\t%s' % (moji, rubi), count


def reduce_lines(lines):
    """Yield a newline-terminated "word<TAB>total" record per key run.

    Only *consecutive* records with the same key are merged, so the
    input must already be sorted by key (Hadoop sorts map output by key
    before handing it to the reducer).  Empty input yields nothing, and
    skipped (non-numeric) lines never suppress a pending group.
    """
    for word, group in groupby(parse_records(lines), key=itemgetter(0)):
        total = sum(count for _word, count in group)
        yield '%s\t%s\n' % (word, total)


if __name__ == '__main__':
    # Standard input -> standard output.
    sys.stdout.writelines(reduce_lines(sys.stdin))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment