Last active
March 12, 2017 03:46
-
-
Save noureddin/d9c3fd68fb07c19f1bb3b34b9fc14dce to your computer and use it in GitHub Desktop.
Letters and letter combinations frequency counter in POSIX shell
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/sh
## Introduction
# This script calculates the frequencies of letters and letter combinations (monograms, bigrams, trigrams) in a given text. Each frequency is given as a percentage of all occurrences counted for that n-gram size.
# This script is language- and script-agnostic; it should work with any writing system (it has been tested with Arabic). It is written in POSIX sh and uses standard Unix tools, so it should work on most OSes. Please note that it is case-sensitive.
# This script is licensed under the terms of CC0. Attribution is really appreciated but not required.
# Contact the author at noureddin@protonmail.com or noureddin95@gmail.com
# defining a function that changes the numbered output of `uniq -c | sort -nr` to percentage | |
numbered2percent() | |
{ | |
total=$(awk '{c+=$1}END{ print c}' "$1") | |
awk '{print $1*100/'$total', $2}' "$1" | |
} | |
## 0. Initialization
# The input must be a plain text file named 'q' in the current directory,
# with no punctuation or tabs at all: only the letters you want to count,
# spaces, and newlines.
if [ ! -r q ]; then
  printf '%s\n' "error: input file 'q' not found or not readable" >&2
  exit 1
fi
# Put every word on its own line and drop the empty lines; the word list is
# written to 'qq'. tr is used because POSIX leaves '\n' in a sed replacement
# unspecified, whereas tr defines the '\n' escape.
tr ' ' '\n' < q | grep -v '^$' > qq
## 1. Monograms
# Put every character of the word list 'qq' on its own line, then count the
# identical lines, most frequent first, into the scratch file 'monograms-'.
# The newline in the replacement is written as backslash-newline, the form
# POSIX defines; '\n' there is unspecified. The line after the backslash must
# start at column 0, otherwise its leading blanks would be inserted too.
sed 's/./&\
/g' qq | grep -v '^$' | sort | uniq -c | sort -nr > monograms-
# change the counts to percentages, then drop the scratch file
numbered2percent monograms- > monograms
rm monograms-
## 2. Bigrams
# 'qqx' is the word list with the first letter of every word (line) removed,
# i.e. the same words shifted by one character.
sed 's/.//' qq > qqx
# Cut both lists into consecutive pairs: 'qq' yields the pairs starting at
# offsets 0, 2, 4, ... of each word and 'qqx' those at offsets 1, 3, 5, ...,
# so together they cover every bigram. A single leftover character at the end
# of a word is not a bigram and is filtered out.
# The newline in the replacement is written as backslash-newline, the form
# POSIX defines; the continuation line must start at column 0.
for i in qq qqx; do
  sed 's/../&\
/g' "$i" | grep -v '^.$'
done | grep -v '^$' | sort | uniq -c | sort -nr > bigrams-
# change the counts to percentages, then drop the scratch file
numbered2percent bigrams- > bigrams
rm bigrams-
## 3. Trigrams
# 'qqxx' is the first-letter-truncated list 'qqx' with one more leading
# letter removed, i.e. the original words without their first two letters.
sed 's/.//' qqx > qqxx
# Cut the three lists into consecutive triples: the lists start at offsets
# 0, 1 and 2 of each word, so together they cover every trigram. Leftovers of
# one or two characters at the end of a word are filtered out.
# The newline in the replacement is written as backslash-newline, the form
# POSIX defines; the continuation line must start at column 0.
for i in qq qqx qqxx; do
  sed 's/.../&\
/g' "$i" | grep -v -e '^.$' -e '^..$'
done | grep -v '^$' | sort | uniq -c | sort -nr > trigrams-
# change the counts to percentages, then drop the scratch file
numbered2percent trigrams- > trigrams
rm trigrams-
# repeat the same pattern with one more shifted copy per extra letter if you
# need longer n-grams
## X. Cleaning
# Delete the intermediate word lists (the original words and the two shifted
# copies); only the result files are kept.
rm -- qq qqx qqxx
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment