SolomidHero/wc.sh

## wc.sh
#!/bin/bash
# Word count of file using map reduce
# Args:
#   - file
#   - n_workers

#######################################
# Map step: count the words found in one line range of a file.
# Punctuation and whitespace both separate words; matching is case-sensitive.
# Arguments:
#   $1 - path of the file to read
#   $2 - first line of the segment (1-based, inclusive)
#   $3 - last line of the segment (inclusive)
# Outputs:
#   one "word count" line per distinct word, sorted by word
#######################################
map() {
  local file=$1
  local from=$2
  local to=$3

  # An empty segment (to < from, e.g. "1 0" for an empty file) has no words;
  # returning early also keeps line address 0 away from sed, which rejects it.
  if (( to < from )); then
    return 0
  fi

  # - sed prints lines from..to and quits at `to` so the rest is never read.
  # - tr turns every punctuation/whitespace run into a single newline; tabs
  #   are included so that a tab never survives inside a token (awk would
  #   split such a token and silently drop part of it).
  # - A segment that starts with a separator yields one leading empty token;
  #   it is removed so that no count line without a word is emitted.
  sed -n "$from, $to p; $to q" -- "$file" \
  | tr -s '[:punct:][:space:]' '[\n*]' \
  | sed '/^$/d' \
  | sort \
  | uniq -c \
  | awk '{print $2" "$1}'
}
export -f map

#######################################
# Reduce step: sum the counts of identical words.
# Inputs:
#   stdin - "word count" lines
# Outputs:
#   one "word total" line per distinct word, in unspecified order
#######################################
reduce() {
  # Require both fields: a line with a lone field has no count to add and
  # would otherwise be reported as a spurious "<field> 0" entry.
  awk 'NF >= 2 { total[$1] += $2 } END { for (word in total) print word, total[word] }'
}
export -f reduce

# read arguments, define parameters
if (( $# < 2 )); then
  printf 'Usage: %s file n_workers\n' "${0##*/}" >&2
  exit 2
fi
input_file=$1
n_workers=$2

if [[ ! -f "$input_file" || ! -r "$input_file" ]]; then
  printf '%s: cannot read file: %s\n' "${0##*/}" "$input_file" >&2
  exit 1
fi
# a worker count of 0 (or a non-number) would break the division below
if ! [[ "$n_workers" =~ ^[1-9][0-9]*$ ]]; then
  printf '%s: n_workers must be a positive integer: %s\n' \
    "${0##*/}" "$n_workers" >&2
  exit 2
fi

# Count lines with awk rather than `wc -l`: wc counts newline characters, so
# a final line without a trailing newline would be left out of every segment.
n_lines=$(awk 'END { print NR }' < "$input_file")

# an empty file has no words to report
if (( n_lines == 0 )); then
  exit 0
fi
# never start more workers than there are lines (avoids empty segments)
if (( n_workers > n_lines )); then
  n_workers=$n_lines
fi

# create arguments for map function (to parallelize it)
segment_size=$(( n_lines / n_workers ))
remainder=$(( n_lines % n_workers ))
end=0
map_args=()

# fill array with (file, segment_left, segment_right)
for (( i = 1; i <= n_workers; i++ )); do
  # the first `remainder` segments take one extra line each
  if (( i <= remainder )); then
    addition=1
  else
    addition=0
  fi

  start=$(( end + 1 ))
  end=$(( end + segment_size + addition ))
  map_args+=("$input_file" "$start" "$end")
done

# Each worker writes its partial counts to its own file (named after the
# segment's first line). Letting parallel workers share one stdout pipe can
# interleave their output in the middle of a line and corrupt the counts.
tmp_dir=$(mktemp -d) || exit 1
trap 'rm -rf -- "$tmp_dir"' EXIT

# parallelize map and reduce; arguments are NUL-delimited so that file names
# containing spaces or quotes reach the workers intact
printf '%s\0' "${map_args[@]}" \
| xargs -0 -P "$n_workers" -n 3 \
    bash -c 'out_dir=$1; shift; map "$@" | reduce > "$out_dir/$2"' _ "$tmp_dir"

# then use single reduce with sorting (count descending, then word)
cat -- "$tmp_dir"/* \
| reduce \
| sort -k2nr -k1

rm -rf -- "$tmp_dir"
	#!/bin/bash
	# Word count of file using map reduce
	# Args:
	# - file
	# - n_workers

	#######################################
	# Map step: count the words found in one line range of a file.
	# Punctuation and whitespace both separate words; matching is case-sensitive.
	# Arguments:
	#   $1 - path of the file to read
	#   $2 - first line of the segment (1-based, inclusive)
	#   $3 - last line of the segment (inclusive)
	# Outputs:
	#   one "word count" line per distinct word, sorted by word
	#######################################
	map() {
	  local file=$1
	  local from=$2
	  local to=$3

	  # An empty segment (to < from, e.g. "1 0" for an empty file) has no words;
	  # returning early also keeps line address 0 away from sed, which rejects it.
	  if (( to < from )); then
	    return 0
	  fi

	  # The stages must be joined by real pipes: an escaped "\|" is passed to
	  # sed as a literal file-name argument and no pipeline is formed.
	  # - sed prints lines from..to and quits at `to` so the rest is never read.
	  # - tr turns every punctuation/whitespace run into a single newline; tabs
	  #   are included so that a tab never survives inside a token (awk would
	  #   split such a token and silently drop part of it).
	  # - A segment that starts with a separator yields one leading empty token;
	  #   it is removed so that no count line without a word is emitted.
	  sed -n "$from, $to p; $to q" -- "$file" \
	  | tr -s '[:punct:][:space:]' '[\n*]' \
	  | sed '/^$/d' \
	  | sort \
	  | uniq -c \
	  | awk '{print $2" "$1}'
	}
	export -f map

	#######################################
	# Reduce step: sum the counts of identical words.
	# Inputs:
	#   stdin - "word count" lines
	# Outputs:
	#   one "word total" line per distinct word, in unspecified order
	#######################################
	reduce() {
	  # Require both fields: a line with a lone field has no count to add and
	  # would otherwise be reported as a spurious "<field> 0" entry.
	  awk 'NF >= 2 { total[$1] += $2 } END { for (word in total) print word, total[word] }'
	}
	export -f reduce

	# read arguments, define parameters
	if (( $# < 2 )); then
	  printf 'Usage: %s file n_workers\n' "${0##*/}" >&2
	  exit 2
	fi
	input_file=$1
	n_workers=$2

	if [[ ! -f "$input_file" || ! -r "$input_file" ]]; then
	  printf '%s: cannot read file: %s\n' "${0##*/}" "$input_file" >&2
	  exit 1
	fi
	# a worker count of 0 (or a non-number) would break the division below
	if ! [[ "$n_workers" =~ ^[1-9][0-9]*$ ]]; then
	  printf '%s: n_workers must be a positive integer: %s\n' \
	    "${0##*/}" "$n_workers" >&2
	  exit 2
	fi

	# Count lines with awk rather than `wc -l`: wc counts newline characters, so
	# a final line without a trailing newline would be left out of every segment.
	n_lines=$(awk 'END { print NR }' < "$input_file")

	# an empty file has no words to report
	if (( n_lines == 0 )); then
	  exit 0
	fi
	# never start more workers than there are lines (avoids empty segments)
	if (( n_workers > n_lines )); then
	  n_workers=$n_lines
	fi

	# create arguments for map function (to parallelize it)
	segment_size=$(( n_lines / n_workers ))
	remainder=$(( n_lines % n_workers ))
	end=0
	map_args=()

	# fill array with (file, segment_left, segment_right)
	for (( i = 1; i <= n_workers; i++ )); do
	  # the first `remainder` segments take one extra line each
	  if (( i <= remainder )); then
	    addition=1
	  else
	    addition=0
	  fi

	  start=$(( end + 1 ))
	  end=$(( end + segment_size + addition ))
	  map_args+=("$input_file" "$start" "$end")
	done

	# Each worker writes its partial counts to its own file (named after the
	# segment's first line). Letting parallel workers share one stdout pipe can
	# interleave their output in the middle of a line and corrupt the counts.
	tmp_dir=$(mktemp -d) || exit 1
	trap 'rm -rf -- "$tmp_dir"' EXIT

	# parallelize map and reduce; the stages are joined by real pipes (an
	# escaped "\|" would be passed along as a literal argument), and arguments
	# are NUL-delimited so that file names containing spaces or quotes reach
	# the workers intact
	printf '%s\0' "${map_args[@]}" \
	| xargs -0 -P "$n_workers" -n 3 \
	    bash -c 'out_dir=$1; shift; map "$@" | reduce > "$out_dir/$2"' _ "$tmp_dir"

	# then use single reduce with sorting (count descending, then word)
	cat -- "$tmp_dir"/* \
	| reduce \
	| sort -k2nr -k1

	rm -rf -- "$tmp_dir"