@evanmiltenburg
Forked from dellis23/markov.py
Last active August 29, 2015 14:07
# modified from: https://gist.github.com/dellis23/6174914/
# - Added NLTK, which simplifies the chain and ngram logic.
# To use this script, you need to have downloaded the punkt
# data like this:
#
# import nltk
# nltk.download('punkt')
#
# - No more occasional KeyErrors.
# - Produces sentences rather than a string of N words.
# - Sentences now always start with a capital letter.
# - Changed I/O.
# The model now takes raw text as its input.
# Input text is separated from model initialization.
# --> As a consequence, training is also separated.
#
# Example (run this in a separate file, or in the Python interpreter, from the same directory as markov.py):
#
# from markov import Markov
# m = Markov()
# text = 'This is a text. It is just a string.\n It can be as long as you want.'
# # if you have a text file in the same directory, you could also do this:
# # with open('some_file.txt', encoding='utf-8') as f:
# #     text = f.read()
# m.add_text(text)
# m.train()
# generated_sentence = m.generate_markov_sentence()
import random

import nltk


class Markov(object):
    def __init__(self, chain_size=3):
        # Increase the chain size to get more "typical" sentences that stay
        # closer to the original source material. Larger chain sizes require
        # more input text.
        self.chain_size = chain_size
        self.cache = {}
        self.trained = False
        self.contains_text = False
        self.words = []
        self.word_size = 0
        print('Module loaded. Please add text using model.add_text(), '
              'and then train the model using model.train().')

    def tokenize_text(self, text):
        # Split the text into sentences, tokenize each sentence into words,
        # and flatten the result into a single list of tokens.
        return [word for sentence in nltk.sent_tokenize(text)
                     for word in nltk.word_tokenize(sentence)]
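    # Illustrative (assumes the punkt data is installed):
    #   m = Markov()
    #   m.tokenize_text('This is a text. It is short.')
    #   -> ['This', 'is', 'a', 'text', '.', 'It', 'is', 'short', '.']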

    def add_text(self, text):
        self.words += self.tokenize_text(text)
        self.word_size = len(self.words)
        self.contains_text = True

    def remove_text(self):
        self.words = []
        self.word_size = 0
        self.contains_text = False
        self.trained = False
        print('The word list is empty, and the model is again untrained.')

    def train(self):
        self.cache = {}
        if not self.contains_text:
            return 'Please add text using model.add_text()'
        # Map each (chain_size - 1)-word prefix to the list of words that
        # follow it somewhere in the source text.
        for chain_set in nltk.ngrams(self.words, self.chain_size):
            key = chain_set[:-1]
            next_word = chain_set[-1]
            if key in self.cache:
                self.cache[key].append(next_word)
            else:
                self.cache[key] = [next_word]
        self.trained = True
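    # Illustrative sketch (not in the original gist): training on
    # 'The cat sat. The cat ate.' with chain_size=3 yields a cache like
    #   {('The', 'cat'): ['sat', 'ate'],
    #    ('cat', 'sat'): ['.'],
    #    ('sat', '.'): ['The'],
    #    ('.', 'The'): ['cat'],
    #    ('cat', 'ate'): ['.']}
    # Duplicate next-words are kept, so frequent continuations are sampled
    # proportionally more often by random.choice().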

    def initial_candidates(self):
        # Sentence seeds: the (chain_size - 1)-word prefixes of all ngrams
        # whose first token starts with a capital letter.
        return [gram[:self.chain_size - 1]
                for gram in nltk.ngrams(self.words, self.chain_size)
                if gram[0][0].isupper()]
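    # Illustrative: for the same sample text and chain_size=3, this returns
    # [('The', 'cat'), ('The', 'cat')] -- only prefixes of ngrams whose
    # first token is capitalized, so sentences start with a capital letter.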

    def generate_markov_sentence(self, limit=50):
        if not self.contains_text:
            return 'Please add text, and then train the model.'
        if not self.trained:
            return 'Please train the model first.'
        gen_words = []
        seed_words = random.choice(self.initial_candidates())
        gen_words.extend(seed_words)
        while True:
            # Look up the last (chain_size - 1) words and sample a successor.
            last_words = gen_words[-(self.chain_size - 1):]
            next_word = random.choice(self.cache[tuple(last_words)])
            gen_words.append(next_word)
            # Stop at the end of a sentence, or bail out at the word limit.
            if next_word == '.' or len(gen_words) > limit:
                return ' '.join(gen_words)
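

# A minimal end-to-end demo (not part of the original gist; the sample text
# is purely illustrative and the punkt data must be installed):
if __name__ == '__main__':
    sample = ('The cat sat on the mat. The cat ate the fish. '
              'The dog sat on the mat. The dog chased the cat.')
    m = Markov()
    m.add_text(sample)
    m.train()
    print(m.generate_markov_sentence())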