Skip to content

Instantly share code, notes, and snippets.

@markrwilliams
Created August 23, 2013 02:21
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save markrwilliams/6314922 to your computer and use it in GitHub Desktop.
Save markrwilliams/6314922 to your computer and use it in GitHub Desktop.
from __future__ import division
import random
from itertools import tee, izip_longest
from collections import defaultdict
class Occurrences(object):
    """Occurrence statistics for a single word of the chain.

    Attributes are plain counters:
      * personal_count  -- how many times the word itself was recorded.
      * successor_count -- defaultdict(int) mapping a following word to
        the number of times it was recorded directly after this word.
    """

    def __init__(self, personal_count=0, successor_count=None):
        """Create a record, optionally seeded with previously saved counts.

        Args:
            personal_count: Number of times the word has been seen.
            successor_count: Optional mapping of following word -> count.
                Its entries are copied into a fresh ``defaultdict(int)``,
                so the caller's mapping is never aliased.
        """
        # Honour the argument. Hard-coding 0 here discards the count when a
        # record is rebuilt from asdict() output, which in turn makes
        # nextrandom draw from the degenerate range [0, 0].
        self.personal_count = personal_count
        self.successor_count = defaultdict(int)
        if successor_count:
            self.successor_count.update(successor_count)

    @property
    def nextrandom(self):
        """Pick a successor at random, weighted by its recorded count.

        A point is drawn uniformly from ``[0, personal_count]`` and the
        successors are walked while accumulating their counts; the first
        successor whose cumulative count exceeds the point is returned.

        Returns:
            A successor word, or ``None`` when the drawn point lies beyond
            the cumulative successor counts (this includes the case of a
            word with no successors at all).
        """
        upper = random.uniform(0, self.personal_count)
        until = 0
        # items() rather than iteritems(): valid on both Python 2 and 3.
        for word, count in self.successor_count.items():
            if until + count > upper:
                return word
            until += count
        return None

    def asdict(self):
        """Return the counters as a dict usable as ``Occurrences(**d)``."""
        return {'personal_count': self.personal_count,
                'successor_count': self.successor_count}

    def __repr__(self):
        return ('Occurrences(personal_count={}, '
                'successor_count={})'.format(self.personal_count,
                                             self.successor_count))
class Markov(object):
    """Word-level Markov chain: each next word depends only on the current one.

    Attributes:
      * probabilities -- defaultdict mapping a word to its Occurrences record.
      * start_words   -- set of words that began a text passed to update().
    """

    def __init__(self, probabilities=None, start_words=None):
        """Create an empty model, or restore one from asdict() output.

        Args:
            probabilities: Optional mapping of word -> dict of keyword
                arguments for ``Occurrences``.
            start_words: Optional iterable of words a generated sentence
                may start with.
        """
        self.probabilities = defaultdict(Occurrences)
        self.start_words = set()
        if probabilities:
            # items() rather than iteritems(): valid on Python 2 and 3.
            for word, data in probabilities.items():
                self.probabilities[word] = Occurrences(**data)
        if start_words:
            self.start_words.update(start_words)

    def update(self, sentence):
        """Record the word transitions found in *sentence*.

        The text is split on whitespace. Its first word is remembered as a
        start word, and every word's personal count is incremented, together
        with the successor count for the word that follows it (the last
        word has no follower). Empty or whitespace-only input is ignored.
        """
        words = sentence.split()
        if not words:
            # Nothing to learn; previously this raised StopIteration.
            return
        self.start_words.add(words[0])
        # Pair every word with its follower; the final word pairs with None.
        followers = words[1:] + [None]
        for cur, follow in zip(words, followers):
            o = self.probabilities[cur]
            o.personal_count += 1
            if follow is not None:
                o.successor_count[follow] += 1

    def generate(self):
        """Return a randomly generated sentence as a single string.

        Starts from a random start word and appends randomly chosen
        successors until either the chosen step budget is used up or a
        word yields no successor.

        Returns:
            The generated words joined by single spaces, or the string
            ``"no data yet :("`` when the model has no start words.
        """
        firsts = list(self.start_words)
        if not firsts:
            # Checked before any random draw so an empty model reports
            # "no data" instead of failing in randint below.
            return "no data yet :("
        # randint needs upper >= lower; clamp so that models with fewer
        # than four known words do not raise ValueError.
        upper = max(2, len(self.probabilities) // 2)
        length = random.randint(2, upper)
        word = random.choice(firsts)
        sentence = [word]
        for _ in range(length):
            # .get() avoids inserting empty records into the defaultdict
            # while merely reading from the model.
            occurrences = self.probabilities.get(word)
            if occurrences is None:
                break
            word = occurrences.nextrandom
            if word is None:
                break
            sentence.append(word)
        return ' '.join(sentence)

    def asdict(self):
        """Return the model as plain dicts/lists usable as ``Markov(**d)``."""
        return {'probabilities': {w: c.asdict()
                                  for w, c in self.probabilities.items()},
                'start_words': list(self.start_words)}
if __name__ == '__main__':
    # Demo entry point: train on the text file named by the first
    # command-line argument, print one generated sentence, then print the
    # model serialized as JSON and a model rebuilt from that saved form.
    import json
    import sys

    if len(sys.argv) < 2:
        # Exit with a usage message instead of an IndexError traceback.
        sys.exit('usage: {} CORPUS_FILE'.format(sys.argv[0]))

    m = Markov()
    with open(sys.argv[1]) as f:
        # The whole file is fed as one text, so only its very first word
        # becomes a start word.
        m.update(f.read())
    # Single-argument print(...) produces the same output under the
    # Python 2 print statement and the Python 3 print function.
    print(m.generate())
    saved = m.asdict()
    print(json.dumps(saved))
    # Rebuild from the saved form to check the constructor accepts it.
    print(Markov(**saved))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment