Skip to content

Instantly share code, notes, and snippets.

@gatlin
Last active March 2, 2017 19:28
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save gatlin/0da9e4cb2c6cf92d971e7cc0eafc4b95 to your computer and use it in GitHub Desktop.
Save gatlin/0da9e4cb2c6cf92d971e7cc0eafc4b95 to your computer and use it in GitHub Desktop.
by request, a simple python markov chain builder and walker.
###
# Markov models!
#
# A Markov model of some process is a model of different states you can be in,
# where each state links to other states. These links are weighted by some
# probability score. Markov models are designed to model systems where future
# states depend only on the present state.
#
# In the case of words you may define "present state" to be "the current word,"
# or perhaps "the last 3 words."
#
# The implementations here are focused on legibility, not efficiency. Also you
# shouldn't need anything outside the standard library to run this.
import random
# Arbitrary tokens to let us mark the beginning and ending of a sentence.
SENTENCE_START = "__START__"
SENTENCE_END = "__END__"
class MarkovBuilder(object):
"""
Builds a Markov model out of a stream of sentences.
Note here we assume the sentences have been pre-processed in some way (no
punctuation hanging out at the end, etc).
"""
def __init__(self, sentences, ngram_size):
self.ngram_size = ngram_size
self.sentences = sentences
self.model = None
def build(self):
"""
For each sentence that comes in, create ngrams of the correct size out
of the words and add them to our model.
"""
self.model = {}
for sentence in self.sentences:
# the "words" are padded with `self.ngram_size` copies of
# `SENTENCE_START`.
words = ([ SENTENCE_START ] * self.ngram_size) + sentence + [ SENTENCE_END ]
for i in range(len(sentence) + 1):
# we're storing tuples consisting of `self.ngram_size`
# contiguous words from the sentence. This means, if
# `self.ngram_size` = 2, that we'll store (SENTENCE_START,
# SENTENCE_START) along with (SENTENCE_START, _some word).
ngram = tuple(words[i:i+self.ngram_size])
# we also grab the word after it
follow = words[i+self.ngram_size]
if ngram not in self.model:
self.model[ngram] = {}
if follow not in self.model[ngram]:
self.model[ngram][follow] = 0
# at long last we record that we saw this word follow that
# ngram.
self.model[ngram][follow] += 1
return self
def next(self, ngram):
"""
Given an ngram, make a weighted random choice of the next ngram.
"""
# the list of possible successors to the current state
successors = []
# if the weights are 4, 2, and 3 then cumulative_dist = [ 4, 6, 9 ]
cumulative_dist = []
for (s, w) in self.model[ngram].items():
# jot down our successor
successors.append(s)
# put it in the cumulative distribution
if len(cumulative_dist) > 0:
cumulative_dist.append(w +cumulative_dist[-1])
else:
cumulative_dist.append(w)
# random.random() picks a random number between 0 and 1. So multiply
# the result by the actual range you want. In this case, the last of
# the cumulative distribution (the total).
random_number = random.random() * cumulative_dist[-1]
# Now, then, find the index of the first element of the cumulative
# distribution greater than the random number. The successor has the
# same index in the `successors` list.
selection = 0
for i in range(len(cumulative_dist)):
if cumulative_dist[i] > random_number:
selection = i
break
return successors[selection]
def walk(self, start=None):
"""
Start a Markov walk. Returns a generator, which you can pass to the
`list()` function to get result sentence.
"""
# if we weren't given a starting point, we know one exists as a tuple
# with self.ngram_size copies of SENTENCE_START.
ngram = start or (SENTENCE_START,) * self.ngram_size
while True:
next_ngram = self.next(ngram)
if next_ngram == SENTENCE_END:
break
yield next_ngram
ngram = tuple(ngram[1:]) + (next_ngram,)
def generate(self, start=None):
return ' '.join(list(self.walk(start)))
def make_corpus(text):
"""
Given some (probably pre-processed, formatted) text generate a corpus
suitable for our Markov model.
"""
sentences = text.split('.')
corpus = map(lambda s: s.split(' '), sentences)
return corpus
def readfile_simple(path):
"""
Give a filepath, get a quick and dirty corpus out of it suitable for use
with our model.
"""
contents = open(path).read()
return make_corpus(contents)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment