public
Last active

MapReduce.sh

  • Download Gist
Makefile
Makefile
1 2 3 4 5 6 7 8 9 10
# Download the RFC archive and unpack it into ./rfc.

# Neither name is a file this Makefile produces.
.PHONY: all prepare

# Remove a half-downloaded tarball if curl fails, so it never looks up to date.
.DELETE_ON_ERROR:

all: prepare

prepare: RFC-all.tar.gz rfc/rfc1.txt

# rfc/rfc1.txt acts as the marker that the archive has been unpacked.
# The tarball is an order-only prerequisite: it has to exist before the
# extraction starts (also under `make -j`), but tar restores the archived
# modification times, so a normal prerequisite would make the extracted file
# look stale and trigger a re-extraction on every run.
rfc/rfc1.txt: | RFC-all.tar.gz
	mkdir -p rfc
	cd rfc && tar zxvf ../RFC-all.tar.gz

RFC-all.tar.gz:
	curl --ftp-pasv ftp://ftp.rfc-editor.org/in-notes/tar/RFC-all.tar.gz -o $@
job-multi.sh
Shell
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64
#!/bin/bash
#
# Process one slice of the .txt files in input_dir with mapper.sh and
# reducer.sh and merge the results into output_dir/result.idx.
#
# Settings, all overridable through the environment:
#   input_dir   directory holding the input files  (default: <script dir>/input.d)
#   output_dir  directory receiving result.idx     (default: <script dir>/output.d)
#   dist_id     1-based number of the slice to run (default: 1)
#   dist_num    total number of slices             (default: 1)

# Exported so that child processes run in the C locale as well; a plain
# assignment only reaches them when the variable was already in the environment.
export LANG=C
export LC_ALL=C

set -e
# Absolute directory containing this script; the helper scripts are run from it.
abs_path=$(cd "$(dirname "$0")" && pwd)

input_dir=${input_dir:-${abs_path}/input.d}
output_dir=${output_dir:-${abs_path}/output.d}

dist_id=${dist_id:-1}
dist_num=${dist_num:-1}


# The input directory must already exist; the output directory is created on demand.
[ -d "${input_dir}" ] || {
    echo "no such directory: ${input_dir}" >&2
    exit 1
}
[ -d "${output_dir}" ] || mkdir -p "${output_dir}"
 
 
# Print the names of the entries in input_dir that end in ".txt", one per line.
# Reads the global input_dir.
function list_file() {
    # The dot is escaped so that only a real ".txt" suffix matches; unescaped it
    # would match any character (e.g. "footxt").
    ls "${input_dir}" | grep -E '\.txt$'
}
 
 
# Reject slice settings that cannot be mapped onto a line range; left
# unchecked they would yield an empty or bogus sed address further down.
for setting in dist_id dist_num; do
    case "${!setting}" in
        ''|*[!0-9]*)
            echo "${setting} must be a positive integer: ${!setting}" >&2
            exit 1
            ;;
    esac
done
# Force base 10 so that a value such as 08 is not read as an invalid octal number.
dist_id=$((10#${dist_id}))
dist_num=$((10#${dist_num}))
if [ "${dist_num}" -lt 1 ] || [ "${dist_id}" -lt 1 ] || [ "${dist_id}" -gt "${dist_num}" ]; then
    echo "dist_id must be between 1 and dist_num (${dist_num}): ${dist_id}" >&2
    exit 1
fi

# Count the input files and announce the job size.
total=$(list_file | wc -l)
cat <<EOS
total:${total}
dist_num:${dist_num}
----
EOS

# Every slice covers `range` consecutive files; the last slice additionally
# takes whatever remains after the integer division.
range=$((total / dist_num))
cur=$((dist_id - 1))
from=$((range * cur + 1))
if [ "${dist_id}" -eq "${dist_num}" ]; then
    to=${total}
else
    to=$((range * dist_id))
fi
echo "[${cur}]: ${from} - ${to}"


# Map and reduce each file of the slice on its own, then merge the per-file
# counts. With fewer files than slices the range is empty (from > to); sed
# would still print line `from` for such an address pair, so the listing is
# skipped altogether in that case.
if (( from <= to )); then
    list_file | sed -n -e "${from},${to}p"
fi | while IFS= read -r text_file; do
    echo "... ${text_file}" >&2
    "${abs_path}/mapper.sh" < "${input_dir}/${text_file}" | "${abs_path}/reducer.sh"
done | "${abs_path}/merger.sh" > "${output_dir}/result.idx"
job-single.sh
Shell
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34
#!/bin/bash
#
# Process every .txt file in input_dir with mapper.sh and reducer.sh and merge
# the results into output_dir/result.idx.
#
# Settings, both overridable through the environment:
#   input_dir   directory holding the input files (default: <script dir>/input.d)
#   output_dir  directory receiving result.idx    (default: <script dir>/output.d)

# Exported so that child processes run in the C locale as well; a plain
# assignment only reaches them when the variable was already in the environment.
export LANG=C
export LC_ALL=C

set -e
# Absolute directory containing this script; the helper scripts are run from it.
abs_path=$(cd "$(dirname "$0")" && pwd)

input_dir=${input_dir:-${abs_path}/input.d}
output_dir=${output_dir:-${abs_path}/output.d}


# The input directory must already exist; the output directory is created on demand.
[ -d "${input_dir}" ] || {
    echo "no such directory: ${input_dir}" >&2
    exit 1
}
[ -d "${output_dir}" ] || mkdir -p "${output_dir}"
 
 
# Print the names of the entries in input_dir that end in ".txt", one per line.
# Reads the global input_dir.
function list_file() {
    # The dot is escaped so that only a real ".txt" suffix matches; unescaped it
    # would match any character (e.g. "footxt").
    ls "${input_dir}" | grep -E '\.txt$'
}
 
 
# Count the input files and announce the job size.
total=$(list_file | wc -l)
cat <<EOS
total:${total}
----
EOS

# Map and reduce each file on its own, then merge the per-file counts.
# IFS= and -r keep the file name exactly as listed; the quotes keep a name
# containing whitespace in one piece (unquoted, the redirection would fail).
list_file | while IFS= read -r text_file; do
    echo "... ${text_file}" >&2
    "${abs_path}/mapper.sh" < "${input_dir}/${text_file}" | "${abs_path}/reducer.sh"
done | "${abs_path}/merger.sh" > "${output_dir}/result.idx"
mapper.sh
Shell
1 2 3 4 5 6 7 8
#!/bin/bash
#
# Mapper: read text on stdin and write one word per line on stdout.

# Exported so that tr and grep run in the C locale as well; a plain assignment
# only reaches them when the variable was already in the environment.
export LANG=C
export LC_ALL=C

# Replace every non-alphanumeric character with a newline, then drop the lines
# that are empty or consist of digits only.
tr -c '[:alnum:]' '[\n*]' \
    | grep -v '^[0-9]*$'
merger.sh
Shell
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
#!/bin/bash
#
# Merger: read "<count> <word>" lines on stdin, add up the counts per word and
# write "<total> <word>" lines on stdout, largest total first.

# Exported so that awk and sort run in the C locale as well; a plain assignment
# only reaches them when the variable was already in the environment.
export LANG=C
export LC_ALL=C

awk '
# Lines without both a count and a word carry nothing to add up.
# An unset array element counts as 0, so += covers the first occurrence too.
NF >= 2 {
    word_map[$2] += $1
}

END {
    for (k in word_map) {
        print word_map[k], k
    }
}
' \
    | sort -nr
reducer.sh
Shell
1 2 3 4 5 6 7 8 9
#!/bin/bash
#
# Reducer: read one word per line on stdin and write each distinct word
# prefixed by its number of occurrences, most frequent first.

# Exported so that sort and uniq run in the C locale as well; a plain
# assignment only reaches them when the variable was already in the environment.
export LANG=C
export LC_ALL=C

# uniq only collapses adjacent duplicates, hence the sort in front of it.
sort \
    | uniq -c \
    | sort -nr

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.