Skip to content

Instantly share code, notes, and snippets.

@hansode
Created February 14, 2012 09:54
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save hansode/1825425 to your computer and use it in GitHub Desktop.
Save hansode/1825425 to your computer and use it in GitHub Desktop.
MapReduce.sh
#!/bin/bash
#
# Run one worker's share of a word-count job over the *.txt files in
# input_dir and write the merged result to ${output_dir}/result.idx.
#
# Environment (all optional):
#   input_dir   directory holding the input files (default: <script dir>/input.d)
#   output_dir  directory receiving result.idx    (default: <script dir>/output.d)
#   dist_id     1-based index of this worker      (default: 1)
#   dist_num    total number of workers           (default: 1)

# Export the locale so that child processes (not only this shell) run with
# byte-wise, locale-independent collation and character classes.
export LANG=C
export LC_ALL=C
set -e

# Absolute directory of this script; helper scripts and the default
# input/output directories are resolved relative to it.
abs_path=$(cd "$(dirname "$0")" && pwd)
input_dir=${input_dir:-${abs_path}/input.d}
output_dir=${output_dir:-${abs_path}/output.d}
dist_id=${dist_id:-1}
dist_num=${dist_num:-1}

# The input directory must already exist; the output directory is created
# on demand.
if [ ! -d "${input_dir}" ]; then
  echo "no such directory: ${input_dir}" >&2
  exit 1
fi
mkdir -p "${output_dir}"
#######################################
# List the names (not paths) of the *.txt entries in input_dir, one per line.
# The dot is matched literally, so a name such as "notes_txt" is not listed.
# Ordering comes from the shell's own glob expansion, so it follows the
# locale this script has set for itself.
# Globals:   input_dir (read)
# Arguments: none
# Outputs:   matching file names on stdout, sorted
#######################################
list_file() {
  local path
  for path in "${input_dir}"/*.txt; do
    # An unmatched glob stays literal; skip it (but keep dangling symlinks,
    # which a plain directory listing would also show).
    [[ -e "${path}" || -L "${path}" ]] || continue
    printf '%s\n' "${path##*/}"
  done
}
# Reject worker settings that cannot describe a valid slice. Without this,
# dist_num=0 divides by zero and an out-of-range dist_id leaves from/to empty.
for setting in dist_id dist_num; do
  if [[ ! "${!setting}" =~ ^[1-9][0-9]*$ ]]; then
    echo "${setting} must be a positive integer: ${!setting}" >&2
    exit 1
  fi
done
if (( dist_id > dist_num )); then
  echo "dist_id (${dist_id}) must not exceed dist_num (${dist_num})" >&2
  exit 1
fi

# Number of input files; the arithmetic expansion strips any padding that
# some wc implementations put around the count.
total=$(( $(list_file | wc -l) ))
cat <<EOS
total:${total}
dist_num:${dist_num}
----
EOS

# Each worker gets `range` consecutive files (1-based, inclusive). The last
# worker additionally takes the remainder, so its slice ends at `total`.
# When there are fewer files than workers, range is 0 and every worker but
# the last ends up with to < from, i.e. an empty slice.
range=$(( total / dist_num ))
cur=$(( dist_id - 1 ))      # 0-based index of this worker
limit=$(( dist_num - 1 ))   # 0-based index of the last worker
from=$(( range * cur + 1 ))
if (( cur == limit )); then
  to=${total}
else
  to=$(( range * (cur + 1) ))
fi
echo "[${cur}]: ${from} - ${to}"
# Feed this worker's slice of the file list (lines from..to, inclusive)
# through mapper | reducer one file at a time, then merge all per-file
# results into result.idx.
# The slice is selected with an explicit numeric comparison so that an empty
# slice (to < from) selects nothing; a sed "from,to" range would still print
# line `from` in that case and make workers process the same file twice.
list_file \
  | awk -v from="${from}" -v to="${to}" 'NR >= from && NR <= to' \
  | while IFS= read -r text_file; do
      # Progress goes to stderr so it does not mix with the data stream.
      printf '... %s\n' "${text_file}" >&2
      "${abs_path}/mapper.sh" < "${input_dir}/${text_file}" \
        | "${abs_path}/reducer.sh"
    done \
  | "${abs_path}/merger.sh" > "${output_dir}/result.idx"
#!/bin/bash
#
# Run the whole word-count job in a single process: every *.txt file in
# input_dir is processed and the merged result is written to
# ${output_dir}/result.idx.
#
# Environment (all optional):
#   input_dir   directory holding the input files (default: <script dir>/input.d)
#   output_dir  directory receiving result.idx    (default: <script dir>/output.d)

# Export the locale so that child processes (not only this shell) run with
# byte-wise, locale-independent collation and character classes.
export LANG=C
export LC_ALL=C
set -e

# Absolute directory of this script; helper scripts and the default
# input/output directories are resolved relative to it.
abs_path=$(cd "$(dirname "$0")" && pwd)
input_dir=${input_dir:-${abs_path}/input.d}
output_dir=${output_dir:-${abs_path}/output.d}

# The input directory must already exist; the output directory is created
# on demand.
if [ ! -d "${input_dir}" ]; then
  echo "no such directory: ${input_dir}" >&2
  exit 1
fi
mkdir -p "${output_dir}"
#######################################
# Print the base names of all entries in input_dir whose name ends in the
# literal suffix ".txt", one per line. Names that merely end in "txt"
# (for example "notes_txt") are not printed.
# Globals:   input_dir (read)
# Arguments: none
# Outputs:   matching file names on stdout, in glob (sorted) order
#######################################
list_file() {
  local entry
  for entry in "${input_dir}"/*.txt; do
    # Skip the literal pattern left behind when nothing matches; dangling
    # symlinks are kept because a directory listing would show them too.
    if [[ -e "${entry}" || -L "${entry}" ]]; then
      printf '%s\n' "${entry##*/}"
    fi
  done
}
# Count the input files and announce the size of the job on stdout,
# followed by a separator line.
total=$(list_file | wc -l)
printf 'total:%s\n----\n' "${total}"
# Run mapper | reducer over every input file in turn and merge all per-file
# results into result.idx.
# `IFS= read -r` keeps file names intact (no backslash processing, no
# trimming), and every path is quoted so names with spaces still work.
list_file \
  | while IFS= read -r text_file; do
      # Progress goes to stderr so it does not mix with the data stream.
      printf '... %s\n' "${text_file}" >&2
      "${abs_path}/mapper.sh" < "${input_dir}/${text_file}" \
        | "${abs_path}/reducer.sh"
    done \
  | "${abs_path}/merger.sh" > "${output_dir}/result.idx"
# Download the RFC tarball and unpack it into ./rfc.
# Requires GNU make (order-only prerequisites, .DELETE_ON_ERROR).

# `all` and `prepare` name actions, not files.
.PHONY: all prepare

# Remove a target whose recipe failed, so an interrupted download does not
# leave a truncated tarball that later runs would treat as up to date.
.DELETE_ON_ERROR:

all: prepare

prepare: RFC-all.tar.gz rfc/rfc1.txt

# rfc/rfc1.txt serves as the marker that the archive has been unpacked
# (assumes the archive contains rfc1.txt at its top level - TODO confirm).
# The tarball is an order-only prerequisite: it must exist before unpacking,
# also under `make -j`, but its timestamp must not trigger a re-extract,
# because tar restores the archived (older) modification times.
rfc/rfc1.txt: | RFC-all.tar.gz
	mkdir -p rfc
	cd rfc && tar zxvf ../RFC-all.tar.gz

RFC-all.tar.gz:
	curl --ftp-pasv ftp://ftp.rfc-editor.org/in-notes/tar/RFC-all.tar.gz -o $@
#!/bin/bash
#
# Word-splitting filter (presumably the mapper.sh invoked by the driver
# scripts): reads text on stdin and writes one word per line to stdout.

# Export the locale so tr and grep classify characters byte-wise regardless
# of the caller's environment.
export LANG=C
export LC_ALL=C

#######################################
# Split stdin into words, one per line.
# Every run of non-alphanumeric characters becomes a single line break;
# empty lines and lines consisting only of digits are dropped.
# Arguments: none
# Outputs:   words on stdout
# Returns:   exit status of grep (1 when no word was emitted)
#######################################
map_words() {
  tr -cs '[:alnum:]' '[\n*]' \
    | grep -v '^[0-9]*$'
}

map_words
#!/bin/bash
#
# Count-merging filter (presumably the merger.sh invoked by the driver
# scripts): reads "COUNT WORD" lines on stdin, adds up the counts per word
# and writes "TOTAL WORD" lines to stdout, highest total first.

# Export the locale so sort orders its input byte-wise regardless of the
# caller's environment.
export LANG=C
export LC_ALL=C

#######################################
# Sum the first field per distinct second field.
# Lines with fewer than two fields are ignored so that blank input lines do
# not produce an entry for an empty word.
# Arguments: none
# Outputs:   "TOTAL WORD" lines on stdout, sorted by total, descending
#######################################
merge_counts() {
  awk '
    NF >= 2 { word_map[$2] += $1 }
    END {
      # awk iterates in arbitrary order; the sort below fixes the order.
      for (word in word_map) {
        print word_map[word], word
      }
    }
  ' | sort -nr
}

merge_counts
#!/bin/bash
#
# Counting filter (presumably the reducer.sh invoked by the driver scripts):
# reads one item per line on stdin and writes each distinct line prefixed by
# its number of occurrences, most frequent first.

# Export the locale so sort and uniq compare lines byte-wise regardless of
# the caller's environment.
export LANG=C
export LC_ALL=C

#######################################
# Count identical lines on stdin.
# Arguments: none
# Outputs:   "COUNT LINE" in uniq -c format on stdout, sorted by count,
#            descending
#######################################
reduce_counts() {
  # uniq only collapses adjacent duplicates, hence the first sort.
  sort \
    | uniq -c \
    | sort -nr
}

reduce_counts
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment