Created
September 16, 2012 23:01
-
-
Save rmascarenhas/3734767 to your computer and use it in GitHub Desktop.
Word-frequency calculator. Can be used standalone or in pipelines.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
# Word-frequency.
# Usage:
#
#   $ wf [-n N] [-i] [file.txt ...]
#
# You can choose to display just the first N most frequent words.
#
#   $ wf -n2 file.txt
#
# The -i switch enables case insensitivity.
#
# It can be used to count simple files:
#
#   $ wf -n2 -i file.txt
#
# or in pipelines (with no file arguments, input is read from stdin):
#
#   $ find . -name '*.rb' | xargs cat | wf -i
#
#######################################
# Print an error message and abort the script.
# Arguments: $1 - message to print
# Outputs:   the message, followed by a newline, on stderr
# Returns:   never; exits the shell with status 1
#######################################
fail() {
  # printf instead of echo: a message such as "-n" or "-e" must be printed
  # verbatim rather than being consumed as an echo option.
  printf '%s\n' "$1" >&2
  exit 1
}
#######################################
# Print the usage line and abort the script (through fail).
# Arguments: none
# Outputs:   usage line on stderr
# Returns:   never; fail exits the shell
#######################################
usage() {
  # ${0##*/} keeps only the last path component. Unlike an unquoted
  # `basename $0`, it is not word-split, so a script path containing
  # spaces still yields the full program name.
  local name=${0##*/}
  fail "Usage: $name [-ni] file1.txt [file2.txt file3.txt ...]"
}
#######################################
# Print a string in bold yellow using ANSI escape sequences.
# Arguments: $1 - text to print
# Outputs:   the colored text, followed by a newline, on stdout
#######################################
yellow() {
  # The escape codes live in the printf format; the text is passed as a %s
  # argument so backslashes in it (e.g. in a file name) are printed literally
  # instead of being expanded the way `echo -e` would.
  printf '\033[1;33m%s\033[0m\n' "$1"
}
#######################################
# Count the words arriving on standard input.
# Globals:   tmpfile  (read)  - scratch file that process reads from
#            downcase (read)  - non-empty when -i was given
# Arguments: none
# Outputs:   whatever process prints
#######################################
process_stdin() {
  # Copy stdin to the scratch file in one go. This keeps backslashes, leading
  # blanks and a final line without a newline intact, all of which a plain
  # `read` loop would mangle or drop.
  if [[ -n ${downcase-} ]]; then
    # Honor -i in pipeline mode too, the same way process_files does.
    tr A-Z a-z >"$tmpfile"
  else
    cat >"$tmpfile"
  fi
  process
}
#######################################
# Count the words of each given file, printing the file name as a header.
# Globals:   tmpfile  (read)  - scratch file that process reads from
#            downcase (read)  - non-empty when -i was given
# Arguments: $@ - paths of the files to analyze
# Outputs:   per file: its name (via yellow) and whatever process prints;
#            a warning on stderr for each file that cannot be read
#######################################
process_files() {
  local file
  # Iterate over every argument, quoted, so all files are handled and names
  # containing spaces stay in one piece.
  for file in "$@"; do
    if [[ ! -r $file || -d $file ]]; then
      # Skip instead of running process on whatever the previous file left
      # in the scratch file.
      printf 'Cannot read file: %s\n' "$file" >&2
      continue
    fi
    yellow "$file"
    if [[ -n ${downcase-} ]]; then
      tr A-Z a-z <"$file" >"$tmpfile"
    else
      cat -- "$file" >"$tmpfile"
    fi
    process
  done
}
#######################################
# Print the word counts of the scratch file, most frequent first.
# Globals:   sed_src (read) - sed program that emits one word per line
#            tmpfile (read) - file holding the text to analyze
#            limit   (read) - maximum number of entries to print;
#                             "-0", empty or unset means no limit
# Arguments: none
# Outputs:   "count word" lines (uniq -c format) followed by a blank line
#######################################
process() {
  local max=${limit-}
  sed -e "$sed_src" "$tmpfile" | sort | uniq -c | sort -k1nr | {
    # `head -n-0` ("everything") is a GNU extension, and an empty count is an
    # error everywhere, so the unlimited case goes through cat instead.
    if [[ -z $max || $max == -0 ]]; then
      cat
    else
      head -n "$max"
    fi
  }
  echo
}
# Name of this script without its directory part.
progname=${0##*/}

# Maximum number of entries to print; "-0" stands for "no limit".
limit=-0

#######################################
# Parse the command-line switches.
# Globals:   limit    (written) - set from -n
#            downcase (written) - set to "y" by -i
#            OPTIND   (written) - left by getopts at the first non-option
# Arguments: $@ - the script's command-line arguments
# Returns:   0 on success; calls usage on an unknown switch or a -n value
#            that is not a non-negative integer
#######################################
parse_options() {
  local option
  while getopts 'n:i' option; do
    case "$option" in
      n)
        # Validate before use: feeding raw user input to arithmetic
        # evaluation (`let`) would expand variable names and command
        # substitutions embedded in it.
        [[ $OPTARG =~ ^[0-9]+$ ]] || usage
        # 10# forces base 10 so values such as "08" are not read as octal.
        limit=$((10#$OPTARG))
        ;;
      i)
        downcase=y
        ;;
      *)
        # getopts has already reported the offending switch on stderr.
        usage
        ;;
    esac
  done
}

parse_options "$@"
# Drop the parsed switches so only file names remain in "$@".
shift $((OPTIND - 1))
# sed program that turns text into one word per line:
#   1. every punctuation character becomes a space
#   2. every run of whitespace (spaces, tabs, ...) collapses to one space
#   3. the leading and trailing space, if any, is removed
#   4. lines left empty are dropped
#   5. each remaining space becomes a line break
# The whitespace pattern requires at least one character; a bare " *" would
# also match the empty string and push a space between every two letters.
# The line break is written as backslash-newline, which every POSIX sed
# accepts in a replacement (a "\n" there is a GNU extension).
sed_src='
s/[[:punct:]]/ /g
s/[[:space:]][[:space:]]*/ /g
s/^ //
s/ $//
/^$/d
s/ /\
/g
'
# Scratch file shared with the process_* functions through $tmpfile.
# mktemp creates it atomically under an unpredictable name, unlike a fixed
# /tmp/__wf_<pid> path that another user could create in advance.
tmpfile=$(mktemp "${TMPDIR:-/tmp}/wf.XXXXXX") || fail "Could not create temporary file"
# Remove the scratch file on every exit path, including early exits.
trap 'rm -f -- "$tmpfile"' EXIT

# No file arguments: count what arrives on stdin. Otherwise count each file;
# "$@" keeps file names containing spaces intact.
if (( $# == 0 )); then
  process_stdin
else
  process_files "$@"
fi
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment