Last active
March 2, 2017 19:28
-
-
Save gatlin/0da9e4cb2c6cf92d971e7cc0eafc4b95 to your computer and use it in GitHub Desktop.
By request: a simple Python Markov chain builder and walker.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
### | |
# Markov models! | |
# | |
# A Markov model of some process is a model of different states you can be in, | |
# where each state links to other states. These links are weighted by some | |
# probability score. Markov models are designed to model systems where future | |
# states depend only on the present state. | |
# | |
# In the case of words you may define "present state" to be "the current word," | |
# or perhaps "the last 3 words." | |
# | |
# The implementations here are focused on legibility, not efficiency. Also you | |
# shouldn't need anything outside the standard library to run this. | |
import random | |
# Sentinel tokens marking where a sentence begins and ends in the model.
SENTENCE_START = "__START__"
SENTENCE_END = "__END__"


class MarkovBuilder(object):
    """
    Builds and samples a Markov model from an iterable of sentences.

    Each sentence is expected to be a pre-processed list of word strings
    (tokenized, with no punctuation hanging off the ends, etc).
    """

    def __init__(self, sentences, ngram_size):
        # `model` maps ngram tuple -> {successor word: observed count};
        # it stays None until build() runs.
        self.ngram_size = ngram_size
        self.sentences = sentences
        self.model = None

    def build(self):
        """
        Populate self.model by sliding an ngram-sized window over every
        sentence. Returns self so calls can be chained.
        """
        self.model = {}
        padding = [SENTENCE_START] * self.ngram_size
        for sentence in self.sentences:
            # Pad the front with START markers and the back with a single
            # END marker so both the first real word and the end of the
            # sentence are reachable transitions. With ngram_size = 2 this
            # records (START, START) -> word as well as (START, word) -> ...
            tokens = padding + sentence + [SENTENCE_END]
            for offset in range(len(sentence) + 1):
                window = tuple(tokens[offset:offset + self.ngram_size])
                successor = tokens[offset + self.ngram_size]
                # Record one more observation of `successor` after `window`.
                counts = self.model.setdefault(window, {})
                counts[successor] = counts.get(successor, 0) + 1
        return self

    def next(self, ngram):
        """
        Make a weighted random choice of the word that follows `ngram`.
        """
        followers = self.model[ngram]
        words = list(followers.keys())
        # Running totals of the weights: weights 4, 2, 3 -> [4, 6, 9].
        running_totals = []
        total = 0
        for weight in followers.values():
            total += weight
            running_totals.append(total)
        # Scale a [0, 1) draw up to the full weight range...
        threshold = random.random() * running_totals[-1]
        # ...and pick the first word whose running total exceeds the draw;
        # its index in `running_totals` matches its index in `words`.
        for index, ceiling in enumerate(running_totals):
            if ceiling > threshold:
                return words[index]
        # Unreachable in practice (threshold < the last running total),
        # kept as the same fallback the scan would otherwise default to.
        return words[0]

    def walk(self, start=None):
        """
        Lazily yield the words of one randomly generated sentence.

        Starts from `start` if given, otherwise from the all-START ngram
        that build() guarantees exists. Stops once SENTENCE_END is drawn.
        Pass the result to list() or ' '.join() to materialize it.
        """
        state = start or (SENTENCE_START,) * self.ngram_size
        while True:
            word = self.next(state)
            if word == SENTENCE_END:
                return
            yield word
            # Slide the window: drop the oldest word, append the new one.
            state = tuple(state[1:]) + (word,)

    def generate(self, start=None):
        """Run one walk and join the resulting words into a sentence."""
        return ' '.join(self.walk(start))
def make_corpus(text):
    """
    Split pre-processed text into a corpus suitable for our Markov model.

    Sentences are delimited by '.'. Whitespace-only fragments (such as the
    empty string that split() leaves after a trailing period) are dropped,
    and words are split on any whitespace run so the stray space after a
    period doesn't become an empty-string token in the model.

    Returns a list of sentences, each a list of word strings. (A list, not
    a lazy map object, so the corpus can be iterated more than once.)
    """
    fragments = text.split('.')
    # str.split() with no argument discards empty fields, unlike split(' ').
    return [fragment.split() for fragment in fragments if fragment.strip()]
def readfile_simple(path):
    """
    Given a filepath, get a quick and dirty corpus out of it suitable for
    use with our model.
    """
    # `with` guarantees the file handle is closed even if read() raises,
    # instead of leaking it until garbage collection.
    with open(path) as handle:
        contents = handle.read()
    return make_corpus(contents)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment