Create a gist now

Instantly share code, notes, and snippets.

What would you like to do?
Hadoop Streaming を利用した word count
#!/usr/local/bin/python
# coding: utf-8
"""Hadoop Streaming mapper for word count.

Reads lines from STDIN, splits each line into whitespace-separated
words, and for every word that appears in the dictionary loaded from
``dic.json`` emits a tab-separated ``word\\t1`` pair on STDOUT.  These
pairs become the input of the reduce step (reducer.py).
"""
import sys, json, codecs


def emit_counts(lines, dic, out=sys.stdout):
    """Write ``word\\t1`` to *out* for every dictionary word in *lines*.

    :param lines: iterable of text lines (e.g. ``sys.stdin``)
    :param dic:   container supporting ``in`` — only its keys matter
    :param out:   writable text stream (defaults to STDOUT)
    """
    for line in lines:
        # Split the line into whitespace-separated tokens.
        for word in line.strip().split():
            # Only words present in the dictionary are counted;
            # each occurrence is emitted with a count of 1.
            if word in dic:
                out.write('%s\t%s\n' % (word, '1'))


if __name__ == '__main__':
    # Word dictionary shipped alongside the job (see the -file option
    # in the launcher script); loaded as UTF-8 JSON.
    dic = json.load(codecs.open('dic.json', 'r', 'utf-8'))
    emit_counts(sys.stdin, dic)
#!/usr/local/bin/python
# coding: utf-8
"""Hadoop Streaming reducer for word count.

Reads the mapper output (``word\\tcount`` lines) from STDIN, sums the
counts per word, and writes one ``word\\ttotal`` line per word to
STDOUT.
"""
from operator import itemgetter  # NOTE(review): unused here; kept in case other code relies on it
import sys


def aggregate_counts(lines):
    """Return a dict mapping each word to the sum of its counts.

    :param lines: iterable of ``word\\tcount`` text lines
    :returns: ``{word: total_count}``

    Malformed lines (no tab separator, or a non-integer count) are
    skipped instead of aborting the whole reduce task — the original
    code let a tab-less line raise ValueError outside the handler.
    """
    word2count = {}
    for line in lines:
        try:
            # Parse one mapper output line: "<word>\t<count>".
            word, count = line.strip().split('\t')
            word2count[word] = word2count.get(word, 0) + int(count)
        except ValueError:
            # Either the split did not yield exactly two fields or the
            # count was not an integer; discard the line and move on.
            pass
    return word2count


if __name__ == '__main__':
    # Emit the totals; Hadoop does not require sorted reducer output.
    for word, total in aggregate_counts(sys.stdin).items():
        sys.stdout.write('%s\t%s\n' % (word, total))
#!/bin/sh
# Launcher: submit the word-count job (mapper.py / reducer.py) to
# Hadoop Streaming after optionally clearing the previous HDFS output.
###############################################
jobname="word count"
hadoop_in=" ** path-to-hdfs-file ** "
hadoop_out=" ** path-to-hdfs-file ** "
###############################################
currentPath=`pwd`
mapper="$currentPath/mapper.py"
reducer="$currentPath/reducer.py"
###############################################
# Extra streaming options: 30 reduce tasks, normal priority, and ship
# dic.json (read by mapper.py at startup) to every task.
options="-D mapred.reduce.tasks=30"
options=$options" -D mapred.job.priority=NORMAL"
options=$options" -file ** path-to-dic.json **"
###############################################
# Confirm before deleting the previous job output from HDFS.
echo "remove hdfs file: \"$hadoop_out\" ?[y/n]"
read -r ANS
# $ANS is quoted so an empty answer (plain Enter) cannot break the
# test expression; -o is obsolescent, so use two tests joined by ||.
if [ "$ANS" = 'y' ] || [ "$ANS" = 'yes' ]; then
    hadoop fs -rmr "${hadoop_out}"
    # ${options} is intentionally left unquoted: it must word-split
    # into separate arguments for the streaming jar.
    hadoop jar /home/hadoop/hadoop/contrib/streaming/hadoop-streaming-1.1.2.jar \
        -D mapred.job.name="${jobname}" ${options} \
        -file "${mapper}" -mapper "${mapper}" \
        -file "${reducer}" -reducer "${reducer}" \
        -input "${hadoop_in}" -output "${hadoop_out}"
fi
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment