Create a gist now

Instantly share code, notes, and snippets.

What would you like to do?
Hadoop Streaming を利用した word count
#!/usr/local/bin/python
# coding: utf-8
"""Hadoop Streaming mapper for word count.

Reads lines from STDIN, splits each line into whitespace-separated
words, and for every word that appears in the dictionary loaded from
``dic.json`` emits a tab-separated ``word\\t1`` pair on STDOUT.  These
pairs become the input of the reduce step (reducer.py).
"""
import sys, json, codecs


def emit_counts(lines, dic, out=sys.stdout):
    """Write ``word\\t1`` to *out* for every dictionary word in *lines*.

    :param lines: iterable of text lines (e.g. ``sys.stdin``)
    :param dic:   container supporting ``in`` — only its keys matter
    :param out:   writable text stream (defaults to STDOUT)
    """
    for line in lines:
        # Split the line into whitespace-separated tokens.
        for word in line.strip().split():
            # Only words present in the dictionary are counted;
            # each occurrence is emitted with a count of 1.
            if word in dic:
                out.write('%s\t%s\n' % (word, '1'))


if __name__ == '__main__':
    # Word dictionary shipped alongside the job (see the -file option
    # in the launcher script); loaded as UTF-8 JSON.
    dic = json.load(codecs.open('dic.json', 'r', 'utf-8'))
    emit_counts(sys.stdin, dic)
#!/usr/local/bin/python
# coding: utf-8
"""Hadoop Streaming reducer for word count.

Reads the mapper output (``word\\tcount`` lines) from STDIN, sums the
counts per word, and writes one ``word\\ttotal`` line per word to
STDOUT.
"""
from operator import itemgetter  # NOTE(review): unused here; kept in case other code relies on it
import sys


def aggregate_counts(lines):
    """Return a dict mapping each word to the sum of its counts.

    :param lines: iterable of ``word\\tcount`` text lines
    :returns: ``{word: total_count}``

    Malformed lines (no tab separator, or a non-integer count) are
    skipped instead of aborting the whole reduce task — the original
    code let a tab-less line raise ValueError outside the handler.
    """
    word2count = {}
    for line in lines:
        try:
            # Parse one mapper output line: "<word>\t<count>".
            word, count = line.strip().split('\t')
            word2count[word] = word2count.get(word, 0) + int(count)
        except ValueError:
            # Either the split did not yield exactly two fields or the
            # count was not an integer; discard the line and move on.
            pass
    return word2count


if __name__ == '__main__':
    # Emit the totals; Hadoop does not require sorted reducer output.
    for word, total in aggregate_counts(sys.stdin).items():
        sys.stdout.write('%s\t%s\n' % (word, total))
#!/bin/sh
# Launcher: submit the word-count job (mapper.py / reducer.py) to
# Hadoop Streaming after optionally clearing the previous HDFS output.
###############################################
jobname="word count"
hadoop_in=" ** path-to-hdfs-file ** "
hadoop_out=" ** path-to-hdfs-file ** "
###############################################
currentPath=`pwd`
mapper="$currentPath/mapper.py"
reducer="$currentPath/reducer.py"
###############################################
# Extra streaming options: 30 reduce tasks, normal priority, and ship
# dic.json (read by mapper.py at startup) to every task.
options="-D mapred.reduce.tasks=30"
options=$options" -D mapred.job.priority=NORMAL"
options=$options" -file ** path-to-dic.json **"
###############################################
# Confirm before deleting the previous job output from HDFS.
echo "remove hdfs file: \"$hadoop_out\" ?[y/n]"
read -r ANS
# $ANS is quoted so an empty answer (plain Enter) cannot break the
# test expression; -o is obsolescent, so use two tests joined by ||.
if [ "$ANS" = 'y' ] || [ "$ANS" = 'yes' ]; then
    hadoop fs -rmr "${hadoop_out}"
    # ${options} is intentionally left unquoted: it must word-split
    # into separate arguments for the streaming jar.
    hadoop jar /home/hadoop/hadoop/contrib/streaming/hadoop-streaming-1.1.2.jar \
        -D mapred.job.name="${jobname}" ${options} \
        -file "${mapper}" -mapper "${mapper}" \
        -file "${reducer}" -reducer "${reducer}" \
        -input "${hadoop_in}" -output "${hadoop_out}"
fi
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment