-
-
Save snowraptor/2978200 to your computer and use it in GitHub Desktop.
Generate text of a given length based on markov chains of existing data.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
import random | |
from sys import argv, stdin, stderr,exit | |
__doc__ ="""Generate text of a given length based on Markov chains of existing data.
Usage: markovgen.py [filename [depth [size]]]
All parameters are optional. If no parameters are given or filename = '-'
the standard input is read as data source.
depth is the number of words to use as current state in the Markov chain:
the larger it is, the less random and the more similar to the original data
the output will be. DEFAULT = 2
size is the number of words to be generated. DEFAULT = 100
Output is written to standard output.
"""
class Markov(object):
    """Word-level Markov chain text generator.

    The input is read once and split on whitespace. Every run of ``depth``
    consecutive words is a state, mapped to the list of words that followed
    it in the data. Repeated followers are kept in that list, so a follower
    that occurs more often is proportionally more likely to be chosen.

    Attributes set by the constructor:
        cache: dict mapping a ``depth``-tuple of words to the list of words
            that follow that tuple in the data.
        depth: number of words used as the current state.
        open_file: the file-like object the words were read from.
        words: list of all whitespace-separated words of the data.
        word_size: ``len(words)``.
    """

    def __init__(self, open_file, depth=2):
        """Read ``open_file`` and build the chain.

        Args:
            open_file: object providing ``read()`` (and ideally ``seek()``).
            depth: number of words forming one state of the chain.
        """
        self.cache = {}
        self.depth = depth
        self.open_file = open_file
        self.words = self.file_to_words()
        self.word_size = len(self.words)
        self.database()

    def file_to_words(self):
        """Read the whole file and return its whitespace-separated words.

        Line breaks count as ordinary whitespace and are not preserved.
        """
        try:
            self.open_file.seek(0)
        except (IOError, OSError):
            # Pipes and terminals (e.g. sys.stdin) cannot be rewound; in that
            # case simply read from the current position.
            pass
        return self.open_file.read().split()

    def nuples(self):
        """Yield every run of ``depth + 1`` consecutive words as a tuple.

        So if our string were "What a lovely day" and depth = 2,
        we'd generate (What, a, lovely) and then (a, lovely, day).
        Nothing is yielded when there are fewer than ``depth + 1`` words
        (the range below is then empty).
        """
        for i in range(len(self.words) - self.depth):
            yield tuple(self.words[i:i + self.depth + 1])

    def database(self):
        """Build the Markov chain in ``self.cache``.

        Keys are tuples of length ``self.depth``; each value is the list of
        words that follow the key sequence in the data, in order of
        appearance and with repetitions.
        """
        for nuple in self.nuples():
            self.cache.setdefault(nuple[:-1], []).append(nuple[-1])

    def generate_markov_text(self, size=100):
        """Generate roughly ``size`` words of text from the chain.

        The text starts at a capitalised word when the data contains one at
        a usable position. Once ``size`` words have been produced,
        generation continues for at most 20% more words in an attempt to
        stop on a word ending with '.', '!' or '?'. Generation also stops
        early if the current state has no known follower.

        Args:
            size: target number of words. The result always contains at
                least ``depth`` words (the initial state) when ``size`` is
                positive.

        Returns:
            The generated words joined by single spaces; an empty string
            when ``size`` is not positive.

        Raises:
            ValueError: if the data holds fewer than ``depth + 1`` words, so
                that no state with a follower exists.
        """
        tolerance = 0.2
        punctuation = ('.', '!', '?')
        if size <= 0:
            return ""

        # Number of positions at which a full state *with* a follower starts.
        start_count = self.word_size - self.depth
        if start_count < 1:
            raise ValueError(
                "not enough data: need at least {} words, got {}".format(
                    self.depth + 1, self.word_size))

        # Pick a random start, then scan forward (wrapping around, at most
        # one full lap) for a capitalised word. If there is none, the random
        # start is kept instead of looping forever.
        seed = random.randint(0, start_count - 1)
        for offset in range(start_count):
            candidate = (seed + offset) % start_count
            if self.words[candidate].istitle():
                seed = candidate
                break

        output = self.words[seed:seed + self.depth]
        hard_limit = size * (1 + tolerance)
        while True:
            length = len(output)
            if length >= size and (length >= hard_limit
                                   or output[-1][-1] in punctuation):
                break
            # The current state is the last ``depth`` words emitted so far.
            followers = self.cache.get(tuple(output[length - self.depth:]))
            if not followers:
                # Dead end: this state only occurs at the very end of the data.
                break
            output.append(random.choice(followers))
        return " ".join(output)
# ---- Command-line entry point ----------------------------------------------
# Usage: markovgen.py [filename [depth [size]]]   (see the module docstring)
# Every argument is optional; missing ones fall back to the documented
# defaults: filename '-' (standard input), depth 2, size 100.
filename = argv[1] if len(argv) > 1 else '-'
try:
    depth = int(argv[2]) if len(argv) > 2 else 2
    size = int(argv[3]) if len(argv) > 3 else 100
except ValueError:
    stderr.write(__doc__ + "\n")
    stderr.write("depth and size must be integers. Bailing out.\n")
    exit(1)

if filename == '-':
    open_file = stdin
    m = Markov(open_file, depth)
else:
    try:
        open_file = open(filename, 'r')
    except IOError:
        stderr.write(__doc__ + "\n")
        stderr.write("File {} could not be opened. Bailing out.".format(filename) + "\n")
        exit(1)
    # Close the file even if building the chain fails.
    try:
        m = Markov(open_file, depth)
    finally:
        open_file.close()

# Too little input for the requested depth surfaces as ValueError; report it
# as a plain message rather than a traceback.
try:
    text = m.generate_markov_text(size)
except ValueError as e:
    stderr.write("Could not generate text: {}\n".format(e))
    exit(1)
# Single-argument print(...) behaves identically on Python 2 and Python 3.
print(text)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment