Last active
December 25, 2015 06:18
-
-
Save k-fujikawa/6930674 to your computer and use it in GitHub Desktop.
Hadoop Streaming を利用した word count
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/local/bin/python
# coding: utf-8
"""Hadoop Streaming mapper for word count.

Reads lines from STDIN, splits each line into whitespace-separated words,
and for every word that appears as a key in dic.json emits a tab-separated
"word\t1" record on STDOUT.  These records become the reducer's input.
"""
import sys, json, codecs


def map_words(lines, dic):
    """Yield one 'word\t1' record per dictionary word occurrence.

    lines -- iterable of input text lines
    dic   -- mapping whose keys are the words to count; only words present
             as keys are emitted (membership test, values are ignored)
    """
    for line in lines:
        # Split the line into words on any whitespace.
        for word in line.strip().split():
            if word in dic:
                # Count is always the literal string '1'; the reducer sums.
                yield '%s\t%s' % (word, '1')


def main():
    # The word dictionary is shipped alongside the job via the -file option.
    dic = json.load(codecs.open('dic.json', 'r', 'utf-8'))
    for record in map_words(sys.stdin, dic):
        print(record)


if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/local/bin/python
# coding: utf-8
"""Hadoop Streaming reducer for word count.

Reads the mapper's tab-separated "word\tcount" records from STDIN,
sums the counts per word, and writes "word\ttotal" lines to STDOUT.
"""
import sys


def reduce_counts(lines):
    """Aggregate 'word\tcount' lines into a {word: total_count} dict.

    Malformed lines (no single tab separator, or a non-integer count) are
    silently skipped, matching Hadoop Streaming's best-effort convention.
    """
    word2count = {}
    for line in lines:
        try:
            # ValueError here covers both a bad split (not exactly two
            # fields) and a count that is not an integer.
            word, count = line.strip().split('\t')
            word2count[word] = word2count.get(word, 0) + int(count)
        except ValueError:
            pass
    return word2count


def main():
    # Emit the aggregated totals to STDOUT.
    for word, total in reduce_counts(sys.stdin).items():
        print('%s\t%s' % (word, total))


if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/sh
# Launch the Hadoop Streaming word-count job built from mapper.py/reducer.py.
# Prompts before removing any existing output directory on HDFS.
###############################################
jobname="word count"
hadoop_in=" ** path-to-hdfs-file ** "
hadoop_out=" ** path-to-hdfs-file ** "
###############################################
currentPath=`pwd`
mapper="$currentPath/mapper.py"
reducer="$currentPath/reducer.py"
###############################################
options="-D mapred.reduce.tasks=30"
options=$options" -D mapred.job.priority=NORMAL"
# Ship the word dictionary with the job so the mapper can open dic.json.
options=$options" -file ** path-to-dic.json **"
###############################################
echo "remove hdfs file: \"$hadoop_out\" ?[y/n]"
read ANS
# Quote $ANS: an empty answer would otherwise be a test syntax error.
if [ "$ANS" = 'y' -o "$ANS" = 'yes' ]; then
  hadoop fs -rmr "${hadoop_out}"
  # ${options} stays unquoted on purpose: it holds multiple arguments.
  hadoop jar /home/hadoop/hadoop/contrib/streaming/hadoop-streaming-1.1.2.jar -D mapred.job.name="${jobname}" ${options} -file "${mapper}" -mapper "${mapper}" -file "${reducer}" -reducer "${reducer}" -input "${hadoop_in}" -output "${hadoop_out}"
fi
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment