snowraptor/markovgen.py

## markovgen.py
#!/usr/bin/python
import random
from sys import argv, stdin, stderr,exit


# Usage text for the script. It is assigned to __doc__ explicitly because the
# string comes after the import statements, so Python does not treat it as the
# module docstring on its own. The same text is printed to stderr when the
# input file cannot be opened.
__doc__ ="""Generate text of a given length based on markov chains of existing data.

Usage: markovgen.py [filename [depth [size]]]

All parameters are optional. If no parameters are given or filename = '-'
the standard input is read as data source.

depth is the number of words to use as current state in the Markov chain:
the larger it is, less random gibberish and more similar to the original data
the output will be. DEFAULT = 2

size is the number of words to be generated. DEFAULT = 100

Output is written to standard output.
"""

class Markov(object):
    """Word-level Markov chain text generator.

    The source text is read once from ``open_file`` and split into words.
    Every run of ``depth`` consecutive words is a chain state, mapped in
    ``self.cache`` to the list of words seen right after it in the source.
    Duplicates are kept in that list, so frequent followers are chosen
    more often.
    """

    def __init__(self, open_file, depth=2):
        """Read ``open_file`` and build the chain.

        open_file -- file-like object providing read(); it is rewound with
                     seek(0) first when the stream supports it.
        depth     -- number of words forming one chain state (at least 1).

        Raises ValueError if depth is smaller than 1.
        """
        if depth < 1:
            raise ValueError("depth must be at least 1")
        self.cache = {}
        self.depth = depth
        self.open_file = open_file
        self.words = self.file_to_words()
        self.word_size = len(self.words)
        self.database()

    def file_to_words(self):
        """Return the whole input as a list of whitespace-separated words.

        Line breaks count as ordinary whitespace and are not preserved.
        """
        try:
            self.open_file.seek(0)
        except (AttributeError, IOError, OSError):
            # Pipes and terminals (the usual stdin) cannot be rewound;
            # read from the current position instead of failing.
            pass
        return self.open_file.read().split()

    def nuples(self):
        """Yield every run of depth + 1 consecutive words as a tuple.

        So if our string were "What a lovely day" and depth = 2,
        we'd generate (What, a, lovely) and then (a, lovely, day).
        Nothing is yielded when there are fewer than depth + 1 words.
        """
        width = self.depth + 1
        # range() over a non-positive count is empty, which covers short input.
        for start in range(len(self.words) - width + 1):
            yield tuple(self.words[start:start + width])

    def database(self):
        """Build the markov chain.

        self.cache is a dictionary whose keys are tuples of length
        self.depth and whose values are lists of the words that follow
        the key sequence in the data.
        """
        for nuple in self.nuples():
            self.cache.setdefault(nuple[:-1], []).append(nuple[-1])

    def _seed_index(self):
        """Pick the index of the first word of the starting state.

        A random valid start is chosen, then the search moves forward
        (wrapping around) to the first title-cased word so the text begins
        like a sentence. If no start position is title-cased, the random
        choice is kept so the search always terminates.
        """
        # Only positions that begin a key of self.cache are valid starts.
        starts = self.word_size - self.depth
        first = random.randint(0, starts - 1)
        for offset in range(starts):
            candidate = (first + offset) % starts
            if self.words[candidate].istitle():
                return candidate
        return first

    def generate_markov_text(self, size=100):
        """Generate about ``size`` words of text, avoiding telltale signs.

        The text starts at a capitalized word when one is available. Once
        ``size`` words have been produced, generation continues until a
        word ends with '.', '!' or '?', but by no more than 20% extra
        words. The starting state is always emitted in full, so the result
        has at least ``depth`` words. Generation stops early when the
        current state has no known follower.

        Returns the words joined by single spaces, or "" when size <= 0.
        Raises ValueError when the source has fewer than depth + 1 words.
        """
        tolerance = 0.2
        punctuation = ('.', '!', '?')

        if size <= 0:
            return ""
        if self.word_size - self.depth < 1:
            raise ValueError(
                "not enough words in the source text for depth {}".format(
                    self.depth))

        seed = self._seed_index()
        gen_words = list(self.words[seed:seed + self.depth])
        max_words = max(size, int(size * (1 + tolerance)))

        while len(gen_words) < max_words:
            # Past the requested size we only keep going to finish a sentence.
            if len(gen_words) >= size and gen_words[-1].endswith(punctuation):
                break
            followers = self.cache.get(tuple(gen_words[-self.depth:]))
            if not followers:
                # Reached a state that only occurs at the very end of the data.
                break
            gen_words.append(random.choice(followers))

        return " ".join(gen_words)


# Command-line handling: markovgen.py [filename [depth [size]]]
# Every argument is optional; a missing filename (or '-') means standard input.
filename = argv[1] if len(argv) > 1 else '-'

try:
    # Fall back to the defaults announced in the usage text.
    depth = int(argv[2]) if len(argv) > 2 else 2
    size = int(argv[3]) if len(argv) > 3 else 100
except ValueError:
    stderr.write(__doc__ + "\n")
    stderr.write("depth and size must be integers. Bailing out.\n")
    exit(1)

if filename == '-':
    open_file = stdin
    m = Markov(open_file, depth)
else:
    try:
        open_file = open(filename,'r')
    except IOError:
        stderr.write(__doc__ + "\n")
        stderr.write("File {} could not be opened. Bailing out.".format(filename) + "\n")
        exit(1)
    else:
        # Close the file even if building the chain fails.
        try:
            m = Markov(open_file, depth)
        finally:
            open_file.close()

try:
    text = m.generate_markov_text(size)
except ValueError:
    # Raised when the input holds too few words to form a single chain state.
    stderr.write("Not enough input data for depth {}. Bailing out.\n".format(depth))
    exit(1)

# Single-argument print(...) behaves the same under Python 2 and Python 3.
print(text)
	#!/usr/bin/python
	import random
	from sys import argv, stdin, stderr,exit


	# Usage text for the script. It is assigned to __doc__ explicitly because the
	# string comes after the import statements, so Python does not treat it as the
	# module docstring on its own. The same text is printed to stderr when the
	# input file cannot be opened.
	__doc__ ="""Generate text of a given length based on markov chains of existing data.

	Usage: markovgen.py [filename [depth [size]]]

	All parameters are optional. If no parameters are given or filename = '-'
	the standard input is read as data source.

	depth is the number of words to use as current state in the Markov chain:
	the larger it is, less random gibberish and more similar to the original data
	the output will be. DEFAULT = 2

	size is the number of words to be generated. DEFAULT = 100

	Output is written to standard output.
	"""

	class Markov(object):
	    """Word-level Markov chain text generator.

	    The source text is read once from ``open_file`` and split into words.
	    Every run of ``depth`` consecutive words is a chain state, mapped in
	    ``self.cache`` to the list of words seen right after it in the source.
	    Duplicates are kept in that list, so frequent followers are chosen
	    more often.
	    """

	    def __init__(self, open_file, depth=2):
	        """Read ``open_file`` and build the chain.

	        open_file -- file-like object providing read(); it is rewound with
	                     seek(0) first when the stream supports it.
	        depth     -- number of words forming one chain state (at least 1).

	        Raises ValueError if depth is smaller than 1.
	        """
	        if depth < 1:
	            raise ValueError("depth must be at least 1")
	        self.cache = {}
	        self.depth = depth
	        self.open_file = open_file
	        self.words = self.file_to_words()
	        self.word_size = len(self.words)
	        self.database()

	    def file_to_words(self):
	        """Return the whole input as a list of whitespace-separated words.

	        Line breaks count as ordinary whitespace and are not preserved.
	        """
	        try:
	            self.open_file.seek(0)
	        except (AttributeError, IOError, OSError):
	            # Pipes and terminals (the usual stdin) cannot be rewound;
	            # read from the current position instead of failing.
	            pass
	        return self.open_file.read().split()

	    def nuples(self):
	        """Yield every run of depth + 1 consecutive words as a tuple.

	        So if our string were "What a lovely day" and depth = 2,
	        we'd generate (What, a, lovely) and then (a, lovely, day).
	        Nothing is yielded when there are fewer than depth + 1 words.
	        """
	        width = self.depth + 1
	        # range() over a non-positive count is empty, which covers short input.
	        for start in range(len(self.words) - width + 1):
	            yield tuple(self.words[start:start + width])

	    def database(self):
	        """Build the markov chain.

	        self.cache is a dictionary whose keys are tuples of length
	        self.depth and whose values are lists of the words that follow
	        the key sequence in the data.
	        """
	        for nuple in self.nuples():
	            self.cache.setdefault(nuple[:-1], []).append(nuple[-1])

	    def _seed_index(self):
	        """Pick the index of the first word of the starting state.

	        A random valid start is chosen, then the search moves forward
	        (wrapping around) to the first title-cased word so the text begins
	        like a sentence. If no start position is title-cased, the random
	        choice is kept so the search always terminates.
	        """
	        # Only positions that begin a key of self.cache are valid starts.
	        starts = self.word_size - self.depth
	        first = random.randint(0, starts - 1)
	        for offset in range(starts):
	            candidate = (first + offset) % starts
	            if self.words[candidate].istitle():
	                return candidate
	        return first

	    def generate_markov_text(self, size=100):
	        """Generate about ``size`` words of text, avoiding telltale signs.

	        The text starts at a capitalized word when one is available. Once
	        ``size`` words have been produced, generation continues until a
	        word ends with '.', '!' or '?', but by no more than 20% extra
	        words. The starting state is always emitted in full, so the result
	        has at least ``depth`` words. Generation stops early when the
	        current state has no known follower.

	        Returns the words joined by single spaces, or "" when size <= 0.
	        Raises ValueError when the source has fewer than depth + 1 words.
	        """
	        tolerance = 0.2
	        punctuation = ('.', '!', '?')

	        if size <= 0:
	            return ""
	        if self.word_size - self.depth < 1:
	            raise ValueError(
	                "not enough words in the source text for depth {}".format(
	                    self.depth))

	        seed = self._seed_index()
	        gen_words = list(self.words[seed:seed + self.depth])
	        max_words = max(size, int(size * (1 + tolerance)))

	        while len(gen_words) < max_words:
	            # Past the requested size we only keep going to finish a sentence.
	            if len(gen_words) >= size and gen_words[-1].endswith(punctuation):
	                break
	            followers = self.cache.get(tuple(gen_words[-self.depth:]))
	            if not followers:
	                # Reached a state that only occurs at the very end of the data.
	                break
	            gen_words.append(random.choice(followers))

	        return " ".join(gen_words)



	# Command-line handling: markovgen.py [filename [depth [size]]]
	# Every argument is optional; a missing filename (or '-') means standard input.
	filename = argv[1] if len(argv) > 1 else '-'

	try:
	    # Fall back to the defaults announced in the usage text.
	    depth = int(argv[2]) if len(argv) > 2 else 2
	    size = int(argv[3]) if len(argv) > 3 else 100
	except ValueError:
	    stderr.write(__doc__ + "\n")
	    stderr.write("depth and size must be integers. Bailing out.\n")
	    exit(1)

	if filename == '-':
	    open_file = stdin
	    m = Markov(open_file, depth)
	else:
	    try:
	        open_file = open(filename,'r')
	    except IOError:
	        stderr.write(__doc__ + "\n")
	        stderr.write("File {} could not be opened. Bailing out.".format(filename) + "\n")
	        exit(1)
	    else:
	        # Close the file even if building the chain fails.
	        try:
	            m = Markov(open_file, depth)
	        finally:
	            open_file.close()

	try:
	    text = m.generate_markov_text(size)
	except ValueError:
	    # Raised when the input holds too few words to form a single chain state.
	    stderr.write("Not enough input data for depth {}. Bailing out.\n".format(depth))
	    exit(1)

	# Single-argument print(...) behaves the same under Python 2 and Python 3.
	print(text)