noureddin/frequency-counter.sh

## frequency-counter.sh
#!/bin/sh

## Introduction
# This script calculates the frequencies of letters and letter combinations (monograms, bigrams, trigrams) in a given text. The frequency is given as the percentage of occurrences.
# This script is language- and script-agnostic; it should work with any script whatsoever (it's tested with Arabic). It's written in POSIX sh, and uses Unix tools, so it should work on most OSes. Please note it's case-sensitive.
# This script is licensed under the terms of CC0. Attribution is really appreciated but not required.
# Contact the author at noureddin@protonmail.com or noureddin95@gmail.com

# numbered2percent FILE
#
# Convert the counted output of `uniq -c | sort -nr` stored in FILE into
# percentages. Every input line is "COUNT ITEM"; every output line, written
# to stdout, is "PERCENT ITEM", where PERCENT = COUNT * 100 / (sum of all
# COUNTs in FILE). Only the first whitespace-delimited word after the count
# is kept as ITEM. An empty FILE produces no output.
numbered2percent()
{
  # FILE is handed to awk twice: during the first read (NR == FNR) the counts
  # are only added up, during the second read every line is printed scaled by
  # that sum. Keeping the sum inside awk avoids pasting a shell variable into
  # the awk program text, which turned the program into a syntax error
  # whenever FILE was empty.
  awk '
    NR == FNR { total += $1; next }
    total > 0 { print $1 * 100 / total, $2 }
  ' "$1" "$1"
}

## 0. Initialization
# you need to have your text in a plain text file with no punctuation or tabs at all -- only the letters you want to count, spaces, and newlines -- named 'q' and placed in the current directory

# put every word of 'q' on its own line in 'qq' and drop the empty lines
# (tr does the splitting because "\n" in a sed replacement is a GNU extension
# that POSIX leaves unspecified; once every space has become a newline there
# is no line containing a space left to filter out)
tr ' ' '\n' < q | grep -v '^$' > qq

## 1. Monograms
# put every character of the word list on its own line, then count how often
# each distinct character occurs, most frequent first
# (the sed replacement ends in a backslash followed by a real newline, which is
# the form POSIX defines for inserting a line break; "\n" there is a GNU
# extension)
sed 's/./&\
/g' qq | grep -v '^$' | sort | uniq -c | sort -nr > monograms-
# change to percentage
numbered2percent monograms- > monograms; rm monograms-

## 2. Bigrams
# make a copy of the word list with the first letter of every word (line) removed
sed 's/.//' qq > qqx
# cut both lists into consecutive pairs of letters: 'qq' yields the pairs that
# start at the 1st, 3rd, ... letter of a word and 'qqx' the pairs that start at
# the 2nd, 4th, ... letter, so together they give every bigram
# (backslash + real newline is the POSIX way to insert a line break with sed;
# "\n" in the replacement is a GNU extension)
for i in qq qqx; do
  # a single letter left over at the end of a word is not a bigram; drop it
  sed 's/../&\
/g' "$i" | grep -v '^.$'
done | grep -v '^$' | sort | uniq -c | sort -nr > bigrams-
# change to percentage
numbered2percent bigrams- > bigrams; rm bigrams-

## 3. Trigrams
# make a second copy from the (first-letter-truncated) word list with one more
# leading letter removed, i.e. the original words without their first two letters
sed 's/.//' qqx > qqxx
# cut all three lists into consecutive triples of letters; the three different
# starting offsets together give every trigram
# (backslash + real newline is the POSIX way to insert a line break with sed;
# "\n" in the replacement is a GNU extension)
for i in qq qqx qqxx; do
  # one or two letters left over at the end of a word are not a trigram; drop them
  sed 's/.../&\
/g' "$i" | grep -v -e '^.$' -e '^..$'
done | grep -v '^$' | sort | uniq -c | sort -nr > trigrams-
# change to percentage
numbered2percent trigrams- > trigrams; rm trigrams-

# repeat for more n-grams if you need

## X. Cleaning
# delete the intermediate word lists: the one-word-per-line copy and its two
# truncated variants
rm \
  qq \
  qqx \
  qqxx
	#!/bin/sh

	## Introduction
	# This script calculates the frequencies of letters and letter combinations (monograms, bigrams, trigrams) in a given text. The frequency is given as the percentage of occurrences.
	# This script is language- and script-agnostic; it should work with any script whatsoever (it's tested with Arabic). It's written in POSIX sh, and uses Unix tools, so it should work on most OSes. Please note it's case-sensitive.
	# This script is licensed under the terms of CC0. Attribution is really appreciated but not required.
	# Contact the author at noureddin@protonmail.com or noureddin95@gmail.com

	# defining a function that changes the numbered output of `uniq -c | sort -nr` to percentage
	numbered2percent()
	{
	total=$(awk '{c+=$1}END{ print c}' "$1")
	awk '{print $1*100/'$total', $2}' "$1"
	}

	## 0. Initialization
	# you need to have your text in a plain text file with no punctuation or tabs at all, only the letters you want to count, spaces, and newline, and rename it 'q'

	# have every word in a separate line and remove spaces
	sed 's/ /\n/g' q | grep -v ' ' | grep -v ^$ > qq

	# 1. Monograms
	sed 's/./&\n/g' qq | grep -v ^$ | sort | uniq -c | sort -nr > monograms-
	# change to percentage
	numbered2percent monograms- > monograms; rm monograms-

	## 2. Bigrams
	# make a copy of your text file and remove the first letter of every word (line)
	sed s/.// qq > qqx
	# generate all the bigrams from the two files
	for i in qq qqx; do
	sed 's/../&\n/g' $i | grep -v ^.$
	done | grep -v ^$ | sort | uniq -c | sort -nr > bigrams-
	# change to percentage
	numbered2percent bigrams- > bigrams; rm bigrams-

	## 3. Trigrams
	# make a second copy from your (first-letter-truncated) text file and remove the first letter of every word (line), effectively removing the first two letters from every word from the original file
	sed s/.// qqx > qqxx
	# generate all the trigrams of the three files
	for i in qq qqx qqxx; do
	sed 's/.../&\n/g' $i | grep -v ^.$ | grep -v ^..$
	done | grep -v ^$ | sort | uniq -c | sort -nr > trigrams-
	# change to percentage
	numbered2percent trigrams- > trigrams; rm trigrams-

	# repeat for more n-grams if you need

	## X. Cleaning
	rm qq qqx qqxx