Last active
March 2, 2017 19:28
-
-
Save gatlin/0da9e4cb2c6cf92d971e7cc0eafc4b95 to your computer and use it in GitHub Desktop.
By request: a simple Python Markov chain builder and walker.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
### | |
# Markov models! | |
# | |
# A Markov model of some process is a model of different states you can be in, | |
# where each state links to other states. These links are weighted by some | |
# probability score. Markov models are designed to model systems where future | |
# states depend only on the present state. | |
# | |
# In the case of words you may define "present state" to be "the current word," | |
# or perhaps "the last 3 words." | |
# | |
# The implementations here are focused on legibility, not efficiency. Also you | |
# shouldn't need anything outside the standard library to run this. | |
import random | |
# Sentinel tokens marking where a sentence begins and ends in the model.
SENTENCE_START = "__START__"
SENTENCE_END = "__END__"


class MarkovBuilder(object):
    """
    Builds and samples a Markov model from an iterable of sentences.

    Each sentence is expected to be a pre-processed list of word strings
    (tokenized, with no punctuation hanging off the ends, etc).
    """

    def __init__(self, sentences, ngram_size):
        # `model` maps ngram tuple -> {successor word: observed count};
        # it stays None until build() runs.
        self.ngram_size = ngram_size
        self.sentences = sentences
        self.model = None

    def build(self):
        """
        Populate self.model by sliding an ngram-sized window over every
        sentence. Returns self so calls can be chained.
        """
        self.model = {}
        padding = [SENTENCE_START] * self.ngram_size
        for sentence in self.sentences:
            # Pad the front with START markers and the back with a single
            # END marker so both the first real word and the end of the
            # sentence are reachable transitions. With ngram_size = 2 this
            # records (START, START) -> word as well as (START, word) -> ...
            tokens = padding + sentence + [SENTENCE_END]
            for offset in range(len(sentence) + 1):
                window = tuple(tokens[offset:offset + self.ngram_size])
                successor = tokens[offset + self.ngram_size]
                # Record one more observation of `successor` after `window`.
                counts = self.model.setdefault(window, {})
                counts[successor] = counts.get(successor, 0) + 1
        return self

    def next(self, ngram):
        """
        Make a weighted random choice of the word that follows `ngram`.
        """
        followers = self.model[ngram]
        words = list(followers.keys())
        # Running totals of the weights: weights 4, 2, 3 -> [4, 6, 9].
        running_totals = []
        total = 0
        for weight in followers.values():
            total += weight
            running_totals.append(total)
        # Scale a [0, 1) draw up to the full weight range...
        threshold = random.random() * running_totals[-1]
        # ...and pick the first word whose running total exceeds the draw;
        # its index in `running_totals` matches its index in `words`.
        for index, ceiling in enumerate(running_totals):
            if ceiling > threshold:
                return words[index]
        # Unreachable in practice (threshold < the last running total),
        # kept as the same fallback the scan would otherwise default to.
        return words[0]

    def walk(self, start=None):
        """
        Lazily yield the words of one randomly generated sentence.

        Starts from `start` if given, otherwise from the all-START ngram
        that build() guarantees exists. Stops once SENTENCE_END is drawn.
        Pass the result to list() or ' '.join() to materialize it.
        """
        state = start or (SENTENCE_START,) * self.ngram_size
        while True:
            word = self.next(state)
            if word == SENTENCE_END:
                return
            yield word
            # Slide the window: drop the oldest word, append the new one.
            state = tuple(state[1:]) + (word,)

    def generate(self, start=None):
        """Run one walk and join the resulting words into a sentence."""
        return ' '.join(self.walk(start))
def make_corpus(text):
    """
    Split pre-processed text into a corpus suitable for our Markov model.

    Sentences are delimited by '.'. Whitespace-only fragments (such as the
    empty string that split() leaves after a trailing period) are dropped,
    and words are split on any whitespace run so the stray space after a
    period doesn't become an empty-string token in the model.

    Returns a list of sentences, each a list of word strings. (A list, not
    a lazy map object, so the corpus can be iterated more than once.)
    """
    fragments = text.split('.')
    # str.split() with no argument discards empty fields, unlike split(' ').
    return [fragment.split() for fragment in fragments if fragment.strip()]
def readfile_simple(path):
    """
    Given a filepath, get a quick and dirty corpus out of it suitable for
    use with our model.
    """
    # `with` guarantees the file handle is closed even if read() raises,
    # instead of leaking it until garbage collection.
    with open(path) as handle:
        contents = handle.read()
    return make_corpus(contents)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment