Skip to content

Instantly share code, notes, and snippets.

@josepdecid
Last active June 9, 2020 19:17
Show Gist options
  • Save josepdecid/d428f915d4ae1fcd89b90616898fd3b6 to your computer and use it in GitHub Desktop.
Save josepdecid/d428f915d4ae1fcd89b90616898fd3b6 to your computer and use it in GitHub Desktop.
N-gram occurences counter with MLE and LAP
from typing import List, Set, Optional
from fractions import Fraction
from collections import Counter
import termtables as tt
def count_n_grams(l: List[str], n: int, alpha: float, vocab: Optional[Set[str]] = None):
n_grams = []
i = 0
while i + n <= len(l):
n_grams.append(''.join(l[i:i + n]))
i += 1
total_count = i
counter = Counter(n_grams)
most_common = counter.most_common()
vocab_size = len(set(l)) if vocab is None else len(vocab)
nb = total_count + vocab_size ** n
n0 = vocab_size ** n - len(counter)
for i in range(len(most_common)):
mle = Fraction(most_common[i][1], total_count)
mle = f'{str(mle)} ({float(mle):.5f})'
lap = Fraction(most_common[i][1] + 1, nb)
lap = f'{str(lap)} ({float(lap):.5f})'
ld = (1 - alpha) * most_common[i][1] / total_count if most_common[i][1] > 0 else alpha / n0
ld = f'{ld:.5f}'
most_common[i] = (*most_common[i], mle, lap, ld)
most_common.append(('UNOBS', 0, 0, Fraction(1, nb), f'{alpha / n0:.5f}'))
tt.print(most_common, header=[f'{n}-gram', 'Occurences', 'MLE', 'LAP', 'LD'])
if __name__ == '__main__':
sentence = 'pituka tuika tatuk ku pika tuki pikata katuki tukapa'
# (Un)comment if case sensitive
# sentence = sentence.lower()
# Tokenize sentence (word level, char level, etc.)
#tokenized = sentence.split()
tokenized = list(sentence)
count_n_grams(tokenized, 3, 0.05, vocab={'a', 'i', 'u', 'p', 't', 'k', ' '})
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment