Skip to content

Instantly share code, notes, and snippets.

@snowraptor
Forked from agiliq/gist:131679
Created June 23, 2012 12:53
Show Gist options
  • Save snowraptor/2978200 to your computer and use it in GitHub Desktop.
Save snowraptor/2978200 to your computer and use it in GitHub Desktop.
Generate text of a given length based on markov chains of existing data.
#!/usr/bin/python
import random
from sys import argv, stdin, stderr,exit
# Usage/help text for the script. It is bound to __doc__ explicitly because a
# string literal placed after the import statements is not picked up as the
# module docstring. The command-line code further down prints this text to
# stderr when the input file cannot be opened.
__doc__ ="""Generate text of a given length based on markov chains of existing data.
Usage: markovgen.py [filename [depth [size]]]
All parameters are optional. If no parameters are given or filename = '-'
the standard input is read as data source.
depth is the number of words to use as current state in the Markov chain:
the larger it is, less random gibberish and more similar to the original data
the output will be. DEFAULT = 2
size is the number of words to be generated. DEFAULT = 100
Output is written to standard output.
"""
class Markov(object):
    """Word-level Markov chain text generator.

    The chain state is the last ``depth`` words. ``self.cache`` maps every
    state (a tuple of ``depth`` words) to the list of words that followed it
    in the source data; repeated followers are kept, so ``random.choice``
    picks a successor proportionally to how often it occurred.
    """

    def __init__(self, open_file, depth=2):
        """Read all words from *open_file* and build the chain.

        open_file -- object with a ``read()`` method returning text; it is
                     rewound with ``seek(0)`` first when that is supported.
        depth     -- number of words forming one chain state (>= 1).

        Raises ValueError if depth is smaller than 1.
        """
        if depth < 1:
            raise ValueError("depth must be at least 1")
        self.cache = {}
        self.depth = depth
        self.open_file = open_file
        self.words = self.file_to_words()
        self.word_size = len(self.words)
        self.database()

    def file_to_words(self):
        """Read the whole file and return its whitespace-separated words.

        Line breaks count as ordinary whitespace and are not preserved.
        """
        try:
            self.open_file.seek(0)
        except (AttributeError, IOError, OSError, ValueError):
            # Pipes and terminals (e.g. stdin) cannot be rewound; just read
            # from the current position instead of failing.
            pass
        return self.open_file.read().split()

    def nuples(self):
        """Yield every run of ``depth + 1`` consecutive words as a tuple.

        So if our string were "What a lovely day" and depth = 2,
        we'd generate (What, a, lovely) and then (a, lovely, day).
        Nothing is yielded when there are fewer than ``depth + 1`` words.
        """
        width = self.depth + 1
        # range() is empty when the data is too short, so no guard is needed.
        for start in range(len(self.words) - self.depth):
            yield tuple(self.words[start:start + width])

    def database(self):
        """Build the markov chain.

        self.cache is a dictionary with keys being tuples of length
        self.depth and values being the list of words that follow the key
        sequence in the data.
        """
        for nuple in self.nuples():
            self.cache.setdefault(nuple[:-1], []).append(nuple[-1])

    def generate_markov_text(self, size=100):
        """Generate roughly *size* words of text, avoiding telltale signs.

        The text starts at a capitalized word when the data contains one.
        After *size* words have been produced, generation continues until
        the pending state ends a sentence ('.', '!' or '?'), but for no
        more than 20% extra words. The words of the final state are then
        appended, so the result may exceed *size* by a few words. It stops
        early if a state with no known follower is reached.

        Returns the words joined by single spaces, or an empty string when
        *size* is not positive or the data has fewer than ``depth + 1``
        words (in which case the chain is empty).
        """
        tolerance = 0.2
        punctuation = ('.', '!', '?')

        # Indices 0 .. starts-1 are exactly the positions where an n-uple
        # begins, so a state taken from there is always a key of self.cache.
        starts = self.word_size - self.depth
        if size <= 0 or starts < 1:
            return ""
        limit = size * (1 + tolerance)

        # Make sure that the first word is capitalized: scan forward from a
        # random position, wrapping around, and keep the random position if
        # no capitalized word exists at all.
        seed = random.randint(0, starts - 1)
        for offset in range(starts):
            candidate = (seed + offset) % starts
            if self.words[candidate].istitle():
                seed = candidate
                break

        state = list(self.words[seed:seed + self.depth])
        gen_words = []
        head_emitted = False
        while True:
            produced = len(gen_words)
            # Once the quota is met, stop as soon as the pending state ends
            # a sentence, or when the tolerance is used up.
            if produced >= size and (state[-1][-1] in punctuation
                                     or produced >= limit):
                break
            gen_words.append(state[0])
            followers = self.cache.get(tuple(state))
            if not followers:
                # Dead end: this state only occurs at the very end of the
                # data. Its first word has already been written out.
                head_emitted = True
                break
            state = state[1:] + [random.choice(followers)]

        # Flush the words of the final state that were not written yet.
        gen_words.extend(state[1:] if head_emitted else state)
        return " ".join(gen_words)
def parse_args(args):
    """Return ``(filename, depth, size)`` taken from the argument list *args*.

    *args* has the layout of ``sys.argv``: the program name followed by the
    optional filename, depth and size. Missing values fall back to the
    defaults given in the usage text: '-' (standard input), 2 and 100.

    Raises ValueError when depth or size is not an integer.
    """
    filename = args[1] if len(args) > 1 else '-'
    depth = int(args[2]) if len(args) > 2 else 2
    size = int(args[3]) if len(args) > 3 else 100
    return filename, depth, size


def main(args=None):
    """Run the generator from the command line and print the result.

    args -- argument list in ``sys.argv`` layout; defaults to ``argv``.

    Prints the usage text to stderr and exits with status 1 when the
    arguments are invalid or the input file cannot be opened.
    """
    if args is None:
        args = argv
    try:
        filename, depth, size = parse_args(args)
    except ValueError:
        stderr.write("{}\n".format(__doc__))
        stderr.write("depth and size must be integers. Bailing out.\n")
        exit(1)

    if filename == '-':
        m = Markov(stdin, depth)
    else:
        try:
            open_file = open(filename, 'r')
        except IOError:
            stderr.write("{}\n".format(__doc__))
            stderr.write(
                "File {} could not be opened. Bailing out.\n".format(filename))
            exit(1)
        # The whole file is read while the chain is built, so the handle can
        # be closed right afterwards, even if building fails.
        try:
            m = Markov(open_file, depth)
        finally:
            open_file.close()
    # Parenthesized single-argument print works on Python 2 and Python 3.
    print(m.generate_markov_text(size))


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment