Skip to content

Instantly share code, notes, and snippets.

@rmascarenhas
Created September 16, 2012 23:01
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rmascarenhas/3734767 to your computer and use it in GitHub Desktop.
Save rmascarenhas/3734767 to your computer and use it in GitHub Desktop.
Word-frequency calculator. Can be used standalone or in pipelines.
#!/usr/bin/env bash
# Word-frequency.
# Usage:
#
# $ wf [-n count] [-i] file.txt [file1.txt file2.txt ...]
#
# You can choose to display just the first [n] most frequent words.
#
# $ wf -n2 file.txt
#
# The -i switch enables case insensitivity.
#
# It can be used to count the words of regular files:
#
# $ wf -n2 -i file.txt
#
# or in pipelines:
#
# $ find -name '*.rb' | xargs cat | wf -i
#
# Print an error message on stderr and abort the script.
# Arguments: $1 - message to print
# Returns:   never; exits the shell with status 1
fail() {
  # printf rather than echo: a message such as "-n" or "-e" must be printed
  # verbatim instead of being consumed as an echo option.
  printf '%s\n' "$1" >&2
  exit 1
}
# Print the command synopsis on stderr and abort through fail (exit status 1).
# Reads $0 to derive the program name shown in the message.
usage() {
  # Strip the directory part with parameter expansion: no subprocess, and a
  # script path containing spaces or glob characters is handled correctly.
  local name=${0##*/}
  fail "Usage: $name [-ni] file1.txt [file2.txt file3.txt ...]"
}
# Print a string in bold yellow (ANSI SGR 1;33), followed by a reset and a
# newline.
# Arguments: $1 - text to print
# Outputs:   the coloured text on stdout
yellow() {
  # The escape sequences live in the format string only, so backslashes that
  # happen to be part of $1 (e.g. in a file name) are printed literally.
  printf '\033[1;33m%s\033[0m\n' "$1"
}
# Count the words arriving on standard input.
# Globals:  tmpfile  (read)  - path of the scratch file handed to process
#           downcase (read)  - when non-empty, input is folded to lower case
# Outputs:  whatever process prints for the captured input
process_stdin() {
  # Copy stdin in one go instead of a read/echo loop: this keeps backslashes
  # and a final line without trailing newline, and always creates the scratch
  # file even when stdin is empty.
  if [[ -n ${downcase:-} ]]; then
    # Honour the case-insensitive switch for piped input as well, the same
    # way process_files does for named files.
    tr A-Z a-z >"$tmpfile"
  else
    cat >"$tmpfile"
  fi
  process
}
# Count the words of every file given as argument, one report per file.
# Globals:   tmpfile  (read) - path of the scratch file handed to process
#            downcase (read) - when non-empty, text is folded to lower case
# Arguments: $@ - paths of the files to analyse
# Outputs:   for each readable file, its name in yellow followed by the
#            output of process; a diagnostic on stderr for unreadable ones
# Returns:   0 if every file was processed, 1 if at least one was skipped
process_files() {
  local file status=0
  # Iterate over all arguments, each kept as a single word, so that every
  # file is handled and names containing spaces survive.
  for file in "$@"; do
    # Skip what cannot be read; otherwise process would report whatever the
    # scratch file still holds from the previous iteration.
    if [[ ! -r $file || -d $file ]]; then
      printf '%s: cannot read %s\n' "${0##*/}" "$file" >&2
      status=1
      continue
    fi
    yellow "$file"
    if [[ -n ${downcase:-} ]]; then
      tr A-Z a-z <"$file" >"$tmpfile"
    else
      cat <"$file" >"$tmpfile"
    fi
    process
  done
  return "$status"
}
# Print the word-frequency table for the text stored in the scratch file.
# Globals: sed_src (read) - sed program that turns text into one word per line
#          tmpfile (read) - path of the file to analyse
#          limit   (read) - number of rows to keep; empty, unset or "-0"
#                           means "all rows"
# Outputs: "count word" rows, most frequent first, then one empty line
process() {
  local top=${limit:--0}
  sed -e "$sed_src" "$tmpfile" | sort | uniq -c | sort -k1nr |
    if [[ $top == -0 ]]; then
      # No limit requested: pass everything through instead of relying on
      # "head -n -0", which not every head implementation accepts.
      cat
    else
      head -n "$top"
    fi
  # Blank separator line after each report.
  echo
}
# ---- Script entry point -----------------------------------------------------
progname=${0##*/}

# "-0" is the "no limit" value handed to head by process (with GNU head,
# "head -n -0" prints every line).
limit=-0
# Non-empty when -i was given.
downcase=

while getopts 'n:i' option; do
  case "$option" in
    n)
      # Accept only a plain non-negative integer; the value is user input and
      # must not be evaluated as an arbitrary arithmetic expression.
      [[ $OPTARG =~ ^[0-9]+$ ]] ||
        fail "$progname: -n expects a non-negative integer, got '$OPTARG'"
      # Force base 10 so that a leading zero is not read as octal.
      limit=$((10#$OPTARG))
      ;;
    i)
      downcase=y
      ;;
    *)
      # Unknown option or missing argument (getopts already reported it).
      usage
      ;;
  esac
done
shift $((OPTIND - 1))

# sed program that turns text into one word per line:
#   1. replace punctuation with blanks
#   2. squeeze every run of whitespace (blanks, tabs, ...) into one blank
#   3. trim the blank left at either end of the line
#   4. drop lines that ended up empty
#   5. break the line at each remaining blank
# NOTE: step 5 relies on sed accepting "\n" in the replacement text, which
# GNU sed does.
sed_src='
s/[[:punct:]]/ /g
s/[[:space:]]\{1,\}/ /g
s/^ //
s/ $//
/^$/d
s/ /\n/g
'

# Private, unpredictable scratch file, removed on every exit path.
tmpfile=$(mktemp "${TMPDIR:-/tmp}/wf.XXXXXX") ||
  fail "$progname: cannot create temporary file"
trap 'rm -f -- "$tmpfile"' EXIT

if [[ $# -eq 0 ]]; then
  process_stdin
else
  # "$@" keeps each file name as a single word, even with spaces in it.
  process_files "$@"
fi
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment