satomacoto/count_defaultdict.py

## count_defaultdict.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import sys
from collections import defaultdict

def count_pairs(lines):
    """Count occurrences of each (moji, rubi) pair in *lines*.

    Every line must hold exactly four tab-separated fields -- title,
    author, moji, rubi -- optionally followed by a newline.

    Returns a ``defaultdict(int)`` keyed by ``(moji, rubi)`` tuples.
    Raises ``ValueError`` when a line does not have exactly four fields.
    """
    counts = defaultdict(int)
    for line in lines:
        # Strip only the newline: slicing with [:-1] would eat a real
        # character when the final line has no trailing newline.
        # The separator is a real tab escape; the yen-sign look-alike
        # never matched and made the unpacking below fail.
        _title, _author, moji, rubi = line.rstrip('\n').split('\t')
        counts[moji, rubi] += 1
    return counts


def main():
    """Read records from stdin; write one moji/rubi/count line per pair."""
    for (moji, rubi), total in count_pairs(sys.stdin).items():
        # sys.stdout.write works unchanged on Python 2 and Python 3.
        sys.stdout.write("%s\t%s\t%d\n" % (moji, rubi, total))


if __name__ == "__main__":
    main()

## mapper.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import sys

def map_line(line):
    """Turn one input record into a mapper output record.

    *line* must hold four tab-separated fields -- title, author, moji,
    rubi -- optionally followed by a newline.  The result is the string
    "moji<TAB>rubi<TAB>1" without a trailing newline.

    Raises ``ValueError`` when the line does not have exactly four fields.
    """
    # Remove the line terminator only; line[:-1] would truncate the last
    # field of a final line that has no trailing newline.
    _title, _author, moji, rubi = line.rstrip('\n').split('\t')
    return '%s\t%s\t%s' % (moji, rubi, 1)


def main():
    """Map every line of standard input to a record on standard output."""
    for line in sys.stdin:
        # sys.stdout.write works unchanged on Python 2 and Python 3.
        sys.stdout.write(map_line(line) + '\n')


if __name__ == "__main__":
    main()

## reducer.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from operator import itemgetter
import sys

def reduce_counts(lines):
    """Sum the counts of consecutive records that share the same key.

    Each line must hold three tab-separated fields -- moji, rubi, count --
    optionally followed by a newline.  Records with the same (moji, rubi)
    key must be adjacent (the original comment notes that Hadoop sorts the
    map output by key before handing it to the reducer).

    Yields one "moji<TAB>rubi<TAB>total" string per key, without a
    trailing newline.  Lines whose count is not an integer are ignored.
    Raises ``ValueError`` when a line does not have exactly three fields.
    """
    current_word = None
    current_count = 0

    for line in lines:
        # Remove the line terminator only; line[:-1] would truncate the
        # count of a final line that has no trailing newline.
        moji, rubi, count = line.rstrip('\n').split('\t')

        # Convert the count; skip the whole line when it is not a number.
        try:
            count = int(count)
        except ValueError:
            continue

        # Build the key only for valid lines so that a trailing invalid
        # line cannot prevent the last group from being emitted.
        word = '%s\t%s' % (moji, rubi)

        if word == current_word:
            current_count += count
        else:
            if current_word is not None:
                yield '%s\t%s' % (current_word, current_count)
            current_word = word
            current_count = count

    # Emit the final group, but only if at least one valid record was
    # seen (an empty input must not produce a "None" record).
    if current_word is not None:
        yield '%s\t%s' % (current_word, current_count)


def main():
    """Reduce the records on standard input to standard output."""
    for record in reduce_counts(sys.stdin):
        # sys.stdout.write works unchanged on Python 2 and Python 3.
        sys.stdout.write(record + '\n')


if __name__ == "__main__":
    main()
	#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copy of the (moji, rubi) pair counter with its block indentation restored.

import sys
from collections import defaultdict


def count_pairs(lines):
    """Count occurrences of each (moji, rubi) pair in *lines*.

    Every line must hold exactly four tab-separated fields -- title,
    author, moji, rubi -- optionally followed by a newline.

    Returns a ``defaultdict(int)`` keyed by ``(moji, rubi)`` tuples.
    Raises ``ValueError`` when a line does not have exactly four fields.
    """
    counts = defaultdict(int)
    for line in lines:
        # Strip only the newline: slicing with [:-1] would eat a real
        # character when the final line has no trailing newline.
        # The separator is a real tab escape; the yen-sign look-alike
        # never matched and made the unpacking below fail.
        _title, _author, moji, rubi = line.rstrip('\n').split('\t')
        counts[moji, rubi] += 1
    return counts


def main():
    """Read records from stdin; write one moji/rubi/count line per pair."""
    for (moji, rubi), total in count_pairs(sys.stdin).items():
        # sys.stdout.write works unchanged on Python 2 and Python 3.
        sys.stdout.write("%s\t%s\t%d\n" % (moji, rubi, total))


if __name__ == "__main__":
    main()
	#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copy of the reducer with its block indentation restored.

from operator import itemgetter
import sys


def reduce_counts(lines):
    """Sum the counts of consecutive records that share the same key.

    Each line must hold three tab-separated fields -- moji, rubi, count --
    optionally followed by a newline.  Records with the same (moji, rubi)
    key must be adjacent (the original comment notes that Hadoop sorts the
    map output by key before handing it to the reducer).

    Yields one "moji<TAB>rubi<TAB>total" string per key, without a
    trailing newline.  Lines whose count is not an integer are ignored.
    Raises ``ValueError`` when a line does not have exactly three fields.
    """
    current_word = None
    current_count = 0

    for line in lines:
        # Remove the line terminator only; line[:-1] would truncate the
        # count of a final line that has no trailing newline.
        moji, rubi, count = line.rstrip('\n').split('\t')

        # Convert the count; skip the whole line when it is not a number.
        try:
            count = int(count)
        except ValueError:
            continue

        # Build the key only for valid lines so that a trailing invalid
        # line cannot prevent the last group from being emitted.
        word = '%s\t%s' % (moji, rubi)

        if word == current_word:
            current_count += count
        else:
            if current_word is not None:
                yield '%s\t%s' % (current_word, current_count)
            current_word = word
            current_count = count

    # Emit the final group, but only if at least one valid record was
    # seen (an empty input must not produce a "None" record).
    if current_word is not None:
        yield '%s\t%s' % (current_word, current_count)


def main():
    """Reduce the records on standard input to standard output."""
    for record in reduce_counts(sys.stdin):
        # sys.stdout.write works unchanged on Python 2 and Python 3.
        sys.stdout.write(record + '\n')


if __name__ == "__main__":
    main()