Skip to content

Instantly share code, notes, and snippets.

@mumbleskates
Last active March 6, 2016 02:45
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mumbleskates/cb9c3433422bdea98f35 to your computer and use it in GitHub Desktop.
Save mumbleskates/cb9c3433422bdea98f35 to your computer and use it in GitHub Desktop.
# coding=utf-8
from collections import Counter, defaultdict
from functools import reduce
from itertools import tee
import re
# Probability substituted for any n-gram whose normalized count is zero
# (2**-20), so a single unseen n-gram does not force a sentence's product
# of probabilities to exactly zero.
LOW_PROBABILITY = 1.0 / (1 << 20)


class LanguageModel(object):
    """N-gram frequency model built from a sequence of words.

    ``table`` maps each leading (n-1)-gram (the n-gram minus its last item)
    to a ``Counter`` of the items observed immediately after it.
    """

    def __init__(self, words=(), ngram_length=3):
        """Create a model and train it on *words*.

        Args:
            words: Iterable of tokens to train on; may be empty.
            ngram_length: Number of tokens in each n-gram.
        """
        self.ngram_length = ngram_length
        self.table = defaultdict(Counter)
        self.add_words(words)

    @staticmethod
    def words_to_ngrams(words, ngram_length):
        """Yield every run of *ngram_length* consecutive words as a tuple.

        Nothing is yielded when *words* holds fewer than *ngram_length*
        items.
        """
        its = tee(words, ngram_length)
        # Stagger the copies so that copy k sits k items ahead of copy 0:
        # each pass advances every copy from index i onward by one item.
        try:
            for i in range(1, ngram_length):
                next(zip(*its[i:]))
        except StopIteration:
            # Input is shorter than one full n-gram.
            return
        yield from zip(*its)

    def add_ngram(self, gram):
        """Record one occurrence of *gram* (a tuple of tokens)."""
        self.table[gram[:-1]][gram[-1]] += 1

    def add_ngrams(self, ngrams):
        """Record one occurrence of every n-gram in *ngrams*."""
        for gram in ngrams:
            self.add_ngram(gram)

    def add_words(self, words):
        """Split *words* into n-grams of the model's length and record them."""
        self.add_ngrams(LanguageModel.words_to_ngrams(
            words, self.ngram_length
        ))

    def ngram_count(self, gram):
        """Return how many times *gram* has been recorded (0 if never)."""
        leading, last = gram[:-1], gram[-1]
        # Use .get() rather than indexing: indexing a defaultdict inserts an
        # empty Counter for every unseen context, growing the model on reads.
        followers = self.table.get(leading)
        return 0 if followers is None else followers[last]

    def leading_ngram_count(self, leading_ngram):
        """Return the total number of n-grams recorded after *leading_ngram*."""
        followers = self.table.get(leading_ngram)
        return sum(followers.values()) if followers else 0

    def normalized_ngram_count(self, gram):
        """Return the count of *gram* divided by the count of its context.

        Returns 0 when the leading (n-1)-gram has never been recorded.
        """
        frequency = self.ngram_count(gram)
        total = self.leading_ngram_count(gram[:-1])
        return 0 if total == 0 else frequency / total

    def sentence_probability(self, words):
        """Return the product of the normalized counts of *words*' n-grams.

        Any n-gram whose normalized count is zero contributes
        ``LOW_PROBABILITY`` instead. A sentence shorter than the model's
        n-gram length contains no n-grams and therefore yields 1.0.
        """
        probability = 1.0
        for gram in LanguageModel.words_to_ngrams(words, self.ngram_length):
            # Pass the n-gram as a single tuple; unpacking it would hand
            # normalized_ngram_count one positional argument per token.
            probability *= self.normalized_ngram_count(gram) or LOW_PROBABILITY
        return probability
def corpora_from_file(filename):
    """Read *filename* as UTF-8 and split its text into corpora.

    Corpora are separated by runs of 30 or more asterisks. The returned
    list holds the text between separators exactly as it appears,
    including any surrounding whitespace.
    """
    with open(filename, 'r', encoding='utf-8') as handle:
        contents = handle.read()
        return re.split(r"\*{30,}", contents)
def corpus_to_words(corpus, splitter):
    """Tokenize *corpus* by calling *splitter* on it and return the result."""
    tokens = splitter(corpus)
    return tokens
def load_corpora(filename, splitter=str.split):
    """Build one ``LanguageModel`` per corpus found in *filename*.

    Args:
        filename: Path passed to ``corpora_from_file``.
        splitter: Callable that turns a corpus string into words;
            defaults to ``str.split``.

    Returns:
        A list of models, in the order the corpora appear in the file.
    """
    models = []
    for corpus in corpora_from_file(filename):
        tokens = corpus_to_words(corpus, splitter)
        models.append(LanguageModel(tokens))
    return models
if __name__ == "__main__":
    # Score one sample sentence against every model loaded from corpora.txt
    # and print the resulting probabilities as a list.
    lms = load_corpora("corpora.txt")
    scores = []
    for lm in lms:
        scores.append(lm.sentence_probability(str.split("how do you do")))
    print(scores)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment