Example Markov-chain-from-gutenberg-textfile implementation
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
"""
markov_generator_pipeline
~~~~~~~~~~~~~~~~~~~~~~~~~

An example Markov generator that reads a corpus one line at a time
and uses numpy for storing / drawing word likelihoods.

Example source text:
http://www.gutenberg.org/cache/epub/28339/pg28339.txt

Some fun example phrases:

'That election of the advocates have, until the sovereign grace,
and we honour their humour; who are compatible with active links
or mahometanism, who can commit.'

'We must comply with the sequestered cottage of the system of party,
intriguing for a charitable project gutenberg license included with
churchmen were present discussion.'

'To be misunderstood, the particular state of interminable forests.'
"""
from collections import Counter, deque

import numpy as np
def clean(token):
    """Drop characters with code points >= 180 and lowercase the result."""
    kept = [ch for ch in token if ord(ch) < 180]
    return ''.join(kept).lower()
def tokenize(line):
    """Split *line* on whitespace and return a deque of cleaned tokens."""
    return deque(clean(word) for word in line.split())
def generate_tokens(line_generator):
    """Yield (previous_token, current_token) bigram pairs over a corpus.

    The very first pair carries '' as its previous token, and pairs
    continue seamlessly across line boundaries.
    """
    previous = ''
    for line in line_generator:
        for current in tokenize(line):
            yield (previous, current)
            previous = current
def assemble_counts(token_generator):
    """Tally bigram and prefix-word counts and collect sentence starters.

    A word counts as a sentence starter when the token preceding it
    ends with '.'.  Returns (bigram_counts, word_counts, start_words).
    """
    bigram_counts = Counter()
    word_counts = Counter()
    start_words = set()
    for previous, current in token_generator:
        bigram_counts[(previous, current)] += 1
        word_counts[previous] += 1
        if previous.endswith('.'):
            start_words.add(current)
    return bigram_counts, word_counts, start_words
class MarkovGenerator(object):
    """Draws sentences from a first-order Markov chain over words.

    Parameters
    ----------
    bigram_counts : Counter mapping (w1, w2) -> occurrence count
    word_counts : Counter mapping w1 -> count of w1 as a bigram prefix
    start_words : set of words that may begin a sentence
    """

    def __init__(self, bigram_counts, word_counts, start_words):
        self.bigram_counts = bigram_counts
        self.word_counts = word_counts
        # BUG FIX: in Python 3 dict.keys() returns a non-indexable view,
        # but draw() indexes self.word_list — materialize it as a list.
        self.word_list = list(word_counts.keys())
        self.start_words = start_words
        self._do_math()

    def draw(self):
        """Generate one sentence, stopping at the first word ending in '.'."""
        tokens = []
        # Sample the start word from the initial-state distribution.
        word_idx = int(np.random.multinomial(
            1, self.initial_state_prob_vec
        ).argmax())
        word = self.word_list[word_idx]
        while not word.endswith('.'):
            tokens.append(word)
            # Sample the next word from the current word's transition row.
            word_idx = int(np.random.multinomial(
                1, self.transition_matrix[word_idx, :]
            ).argmax())
            word = self.word_list[word_idx]
        tokens.append(word)
        return ' '.join(tokens).capitalize()

    def _do_math(self):
        """Precompute the initial-state vector and the transition matrix."""
        # Total occurrences of all sentence-start words, used to normalize
        # the initial-state distribution.
        # BUG FIX: Counter.iteritems() does not exist in Python 3 — use items().
        self.total_start_words = float(sum(
            count
            for word, count in self.word_counts.items()
            if word in self.start_words))
        self.initial_state_prob_vec = np.array([
            (self.word_counts[w] / self.total_start_words
             if w in self.start_words else 0.0)
            for w in self.word_list])
        # Row i holds P(next word | word_list[i]); rows are normalized by
        # the prefix word's total count.
        self.transition_matrix = np.vstack([
            np.array([
                self.bigram_counts[(w1, w2)] / float(self.word_counts[w1])
                for w2 in self.word_list])
            for w1 in self.word_list])
def get_chain_from_file(filename):
    """Build and return a MarkovGenerator from the corpus in *filename*.

    BUG FIX: the original constructed the generator but never returned it,
    so callers always received None.
    """
    with open(filename, 'r') as f:
        return MarkovGenerator(*assemble_counts(generate_tokens(f)))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment