@evanmiltenburg
Forked from dellis23/markov.py
Last active August 29, 2015 14:07
# modified from: https://gist.github.com/dellis23/6174914/
# - Added NLTK, which simplifies the chain and ngram logic.
# To use this script, you need to have downloaded the punkt
# data like this:
#
# import nltk
# nltk.download('punkt')
#
# - No more occasional KeyErrors.
# - Produces sentences rather than a string of N words.
# - Sentences now always start with a capital letter.
# - Changed I/O.
# The model now takes raw text as its input.
# Input text is separated from model initialization.
# --> As a consequence, training is also separated.
#
# Example (run this in a separate file, or in the Python interpreter, from the same directory as markov.py):
#
# from markov import Markov
# m = Markov()
# text = 'This is a text. It is just a string.\n It can be as long as you want.'
# # if you have a text file in the same directory, you could also do this:
# # with open('some_file.txt', encoding='utf-8') as f:
# #     text = f.read()
# m.add_text(text)
# m.train()
# generated_sentence = m.generate_markov_sentence()
import random

import nltk


class Markov(object):
    def __init__(self, chain_size=3):
        # Increase the chain size to get more "typical" sentences that stay
        # closer to the original source material. Larger chain sizes require
        # more input text.
        self.chain_size = chain_size
        self.cache = {}
        self.trained = False
        self.contains_text = False
        self.words = []
        self.word_size = 0
        print('Module loaded. Please add text using model.add_text(), '
              'and then train the model using model.train().')

    def tokenize_text(self, text):
        # Split the text into sentences, tokenize each sentence into words,
        # and flatten the result into a single list of tokens.
        return [word for sentence in nltk.sent_tokenize(text)
                     for word in nltk.word_tokenize(sentence)]
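    # Illustrative (assumes the punkt data is installed):
    #   m = Markov()
    #   m.tokenize_text('This is a text. It is short.')
    #   -> ['This', 'is', 'a', 'text', '.', 'It', 'is', 'short', '.']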

    def add_text(self, text):
        self.words += self.tokenize_text(text)
        self.word_size = len(self.words)
        self.contains_text = True

    def remove_text(self):
        self.words = []
        self.word_size = 0
        self.contains_text = False
        self.trained = False
        print('The word list is empty, and the model is again untrained.')

    def train(self):
        self.cache = {}
        if not self.contains_text:
            return 'Please add text using model.add_text()'
        # Map each (chain_size - 1)-word prefix to the list of words that
        # follow it somewhere in the source text.
        for chain_set in nltk.ngrams(self.words, self.chain_size):
            key = chain_set[:-1]
            next_word = chain_set[-1]
            if key in self.cache:
                self.cache[key].append(next_word)
            else:
                self.cache[key] = [next_word]
        self.trained = True
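    # Illustrative sketch (not in the original gist): training on
    # 'The cat sat. The cat ate.' with chain_size=3 yields a cache like
    #   {('The', 'cat'): ['sat', 'ate'],
    #    ('cat', 'sat'): ['.'],
    #    ('sat', '.'): ['The'],
    #    ('.', 'The'): ['cat'],
    #    ('cat', 'ate'): ['.']}
    # Duplicate next-words are kept, so frequent continuations are sampled
    # proportionally more often by random.choice().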

    def initial_candidates(self):
        # Sentence seeds: the (chain_size - 1)-word prefixes of all ngrams
        # whose first token starts with a capital letter.
        return [gram[:self.chain_size - 1]
                for gram in nltk.ngrams(self.words, self.chain_size)
                if gram[0][0].isupper()]
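    # Illustrative: for the same sample text and chain_size=3, this returns
    # [('The', 'cat'), ('The', 'cat')] -- only prefixes of ngrams whose
    # first token is capitalized, so sentences start with a capital letter.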

    def generate_markov_sentence(self, limit=50):
        if not self.contains_text:
            return 'Please add text, and then train the model.'
        if not self.trained:
            return 'Please train the model first.'
        gen_words = []
        seed_words = random.choice(self.initial_candidates())
        gen_words.extend(seed_words)
        while True:
            # Look up the last (chain_size - 1) words and sample a successor.
            last_words = gen_words[-(self.chain_size - 1):]
            next_word = random.choice(self.cache[tuple(last_words)])
            gen_words.append(next_word)
            # Stop at the end of a sentence, or bail out at the word limit.
            if next_word == '.' or len(gen_words) > limit:
                return ' '.join(gen_words)
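

# A minimal end-to-end demo (not part of the original gist; the sample text
# is purely illustrative and the punkt data must be installed):
if __name__ == '__main__':
    sample = ('The cat sat on the mat. The cat ate the fish. '
              'The dog sat on the mat. The dog chased the cat.')
    m = Markov()
    m.add_text(sample)
    m.train()
    print(m.generate_markov_sentence())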