# mumbleskates/languagemodel.py

## languagemodel.py
# coding=utf-8
from collections import Counter, defaultdict
from functools import reduce
from itertools import tee
import re


# 2**-20: probability substituted for any n-gram the model has never seen.
LOW_PROBABILITY = 1.0 / (1 << 20)


class LanguageModel(object):
    """N-gram frequency model built from a sequence of words.

    ``table`` maps each leading (n-1)-gram, stored as a tuple, to a
    ``Counter`` of the words that were observed immediately after it.
    """

    def __init__(self, words=(), ngram_length=3):
        """Create a model and record the n-grams found in *words*.

        :param words: iterable of tokens to learn from (may be empty).
        :param ngram_length: number of tokens in each n-gram.
        """
        self.ngram_length = ngram_length
        self.table = defaultdict(Counter)
        self.add_words(words)

    @staticmethod
    def words_to_ngrams(words, ngram_length):
        """Yield each run of *ngram_length* consecutive words as a tuple.

        *words* may be any iterable.  Nothing is yielded when it holds
        fewer than *ngram_length* items.
        """
        its = tee(words, ngram_length)

        # Stagger the copies: the k-th iterator skips its first k items so
        # that zipping them lines up consecutive words.  ``next(it, None)``
        # makes running off the end harmless; zip then simply yields nothing.
        for offset, it in enumerate(its):
            for _ in range(offset):
                next(it, None)

        yield from zip(*its)

    def add_ngram(self, gram):
        """Record one occurrence of *gram* (a tuple of words)."""
        self.table[gram[:-1]][gram[-1]] += 1

    def add_ngrams(self, ngrams):
        """Record one occurrence of every n-gram in *ngrams*."""
        for gram in ngrams:
            self.add_ngram(gram)

    def add_words(self, words):
        """Cut *words* into n-grams of the model's length and record them."""
        self.add_ngrams(LanguageModel.words_to_ngrams(
            words, self.ngram_length
        ))

    def ngram_count(self, gram):
        """Return how often *gram* has been recorded (0 if never)."""
        prefix, last = gram[:-1], gram[-1]
        # Use .get() instead of indexing: indexing the defaultdict would
        # insert an empty Counter for every prefix that is merely queried.
        followers = self.table.get(prefix)
        return followers[last] if followers is not None else 0

    def leading_ngram_count(self, leading_ngram):
        """Return how many recorded n-grams begin with *leading_ngram*."""
        followers = self.table.get(leading_ngram)
        return sum(followers.values()) if followers is not None else 0

    def normalized_ngram_count(self, gram):
        """Return the share of *gram* among recorded n-grams with its prefix.

        The result is ``0.0`` when the prefix has never been recorded.
        """
        frequency = self.ngram_count(gram)
        total = self.leading_ngram_count(gram[:-1])
        return 0.0 if total == 0 else frequency / total

    def sentence_probability(self, words):
        """Return the product of the normalized counts of *words*' n-grams.

        N-grams whose normalized count is zero contribute
        ``LOW_PROBABILITY`` instead, so one unseen n-gram does not zero the
        whole product.  A sentence shorter than the n-gram length has no
        n-grams and yields ``1.0``.
        """
        probability = 1.0
        for gram in LanguageModel.words_to_ngrams(words, self.ngram_length):
            # The n-gram is passed as one tuple; unpacking it would hand
            # normalized_ngram_count several positional arguments.
            probability *= (
                self.normalized_ngram_count(gram) or LOW_PROBABILITY
            )
        return probability


def corpora_from_file(filename):
    """Read *filename* as UTF-8 text and split it into separate corpora.

    A run of 30 or more asterisks acts as the separator between corpora.
    Returns the list of text pieces between separators.
    """
    separator = re.compile(r"\*{30,}")
    with open(filename, 'r', encoding='utf-8') as handle:
        text = handle.read()
    return separator.split(text)


def corpus_to_words(corpus, splitter):
    """Tokenize *corpus* by calling *splitter* on it and return the result."""
    words = splitter(corpus)
    return words


def load_corpora(filename, splitter=str.split):
    """Build one LanguageModel for each corpus stored in *filename*.

    :param filename: path handed to ``corpora_from_file``.
    :param splitter: callable turning a corpus string into words;
        defaults to whitespace splitting.
    :return: list of models, in the order the corpora appear in the file.
    """
    models = []
    for corpus in corpora_from_file(filename):
        tokens = corpus_to_words(corpus, splitter)
        models.append(LanguageModel(tokens))
    return models


if __name__ == "__main__":
    # Score one sample sentence against every model loaded from the
    # corpora file and print the resulting list of probabilities.
    models = load_corpora("corpora.txt")
    sentence = "how do you do".split()
    print([model.sentence_probability(sentence) for model in models])
	# coding=utf-8
	from collections import Counter, defaultdict
	from functools import reduce
	from itertools import tee
	import re


	# 2**-20: probability substituted for any n-gram the model has never seen.
	LOW_PROBABILITY = 1.0 / (1 << 20)


	class LanguageModel(object):
	    """N-gram frequency model built from a sequence of words.

	    ``table`` maps each leading (n-1)-gram, stored as a tuple, to a
	    ``Counter`` of the words that were observed immediately after it.
	    """

	    def __init__(self, words=(), ngram_length=3):
	        """Create a model and record the n-grams found in *words*.

	        :param words: iterable of tokens to learn from (may be empty).
	        :param ngram_length: number of tokens in each n-gram.
	        """
	        self.ngram_length = ngram_length
	        self.table = defaultdict(Counter)
	        self.add_words(words)

	    @staticmethod
	    def words_to_ngrams(words, ngram_length):
	        """Yield each run of *ngram_length* consecutive words as a tuple.

	        *words* may be any iterable.  Nothing is yielded when it holds
	        fewer than *ngram_length* items.
	        """
	        its = tee(words, ngram_length)

	        # Stagger the copies: the k-th iterator skips its first k items
	        # so that zipping them lines up consecutive words.
	        # ``next(it, None)`` makes running off the end harmless; zip
	        # then simply yields nothing.
	        for offset, it in enumerate(its):
	            for _ in range(offset):
	                next(it, None)

	        yield from zip(*its)

	    def add_ngram(self, gram):
	        """Record one occurrence of *gram* (a tuple of words)."""
	        self.table[gram[:-1]][gram[-1]] += 1

	    def add_ngrams(self, ngrams):
	        """Record one occurrence of every n-gram in *ngrams*."""
	        for gram in ngrams:
	            self.add_ngram(gram)

	    def add_words(self, words):
	        """Cut *words* into n-grams of the model's length and record them."""
	        self.add_ngrams(LanguageModel.words_to_ngrams(
	            words, self.ngram_length
	        ))

	    def ngram_count(self, gram):
	        """Return how often *gram* has been recorded (0 if never)."""
	        prefix, last = gram[:-1], gram[-1]
	        # Use .get() instead of indexing: indexing the defaultdict would
	        # insert an empty Counter for every prefix that is merely queried.
	        followers = self.table.get(prefix)
	        return followers[last] if followers is not None else 0

	    def leading_ngram_count(self, leading_ngram):
	        """Return how many recorded n-grams begin with *leading_ngram*."""
	        followers = self.table.get(leading_ngram)
	        return sum(followers.values()) if followers is not None else 0

	    def normalized_ngram_count(self, gram):
	        """Return the share of *gram* among recorded n-grams with its prefix.

	        The result is ``0.0`` when the prefix has never been recorded.
	        """
	        frequency = self.ngram_count(gram)
	        total = self.leading_ngram_count(gram[:-1])
	        return 0.0 if total == 0 else frequency / total

	    def sentence_probability(self, words):
	        """Return the product of the normalized counts of *words*' n-grams.

	        N-grams whose normalized count is zero contribute
	        ``LOW_PROBABILITY`` instead, so one unseen n-gram does not zero
	        the whole product.  A sentence shorter than the n-gram length
	        has no n-grams and yields ``1.0``.
	        """
	        probability = 1.0
	        for gram in LanguageModel.words_to_ngrams(words, self.ngram_length):
	            # The n-gram is passed as one tuple; unpacking it would hand
	            # normalized_ngram_count several positional arguments.
	            probability *= (
	                self.normalized_ngram_count(gram) or LOW_PROBABILITY
	            )
	        return probability


	def corpora_from_file(filename):
	    """Read *filename* as UTF-8 text and split it into separate corpora.

	    A run of 30 or more asterisks acts as the separator between corpora.
	    Returns the list of text pieces between separators.
	    """
	    with open(filename, 'r', encoding='utf-8') as file:
	        return re.split(r"\*{30,}", file.read())


	def corpus_to_words(corpus, splitter):
	    """Tokenize *corpus* by calling *splitter* on it and return the result."""
	    return splitter(corpus)


	def load_corpora(filename, splitter=str.split):
	    """Build one LanguageModel for each corpus stored in *filename*.

	    :param filename: path handed to ``corpora_from_file``.
	    :param splitter: callable turning a corpus string into words;
	        defaults to whitespace splitting.
	    :return: list of models, in the order the corpora appear in the file.
	    """
	    return [
	        LanguageModel(corpus_to_words(words, splitter))
	        for words in corpora_from_file(filename)
	    ]


	if __name__ == "__main__":
	    # Score one sample sentence against every model loaded from the
	    # corpora file and print the resulting list of probabilities.
	    lms = load_corpora("corpora.txt")
	    sentence = "how do you do".split()
	    print([lm.sentence_probability(sentence) for lm in lms])