Instantly share code, notes, and snippets.

Embed
What would you like to do?
Generate random real-sounding words using an input text file as a basis; uses Markov chains of 3-grams to internally build the new random words.
#!/usr/local/bin/python3
#
# markov.py
#
# Reads in a text file and produces a list of random
# words with similar 3-grams to those found in the input file.
#
# Usage: ./markov.py <text_filename>
#
import bisect
import random
import sys
# Globals.
words = []
trigrams = {}
# Functions.
def add_word(word):
grams = get_grams(word)
from_gram = None
for to_gram in grams:
if from_gram: add_transition(from_gram, to_gram)
from_gram = to_gram
def get_grams(word):
if len(word) < 3: return []
grams = ['<start>']
for i in range(len(word) - 2):
grams.append(word[i:i + 3])
grams.append('<end>')
return grams
# Inverse of get_grams; expects the list to start with '<start>'
# to end with '<end>', and to have at least one trigram in between.
def get_word(grams):
chars = list(grams[1])
for i in range(2, len(grams) - 1): chars.append(grams[i][-1])
return ''.join(chars)
def add_transition(from_gram, to_gram):
global trigrams
if from_gram not in trigrams:
trigrams[from_gram] = [[[to_gram, 1]], 1]
return
# The from_gram is already a key in trigrams.
from_data = trigrams[from_gram]
from_data[1] += 1
insert_gram(from_data[0], to_gram)
# Adds new_gram to the weighted, sorted list of grams.
def insert_gram(grams, new_gram):
i = bisect.bisect_left(grams, [new_gram])
if i == len(grams) or grams[i][0] != new_gram:
# New gram; insert it.
grams.insert(i, [new_gram, 1])
else:
# Repeat gram; increment its weight.
grams[i][1] += 1
# Returns a gram chosen randomly, using weights, from the sorted grams list.
def random_gram(grams, total_weight):
#print('total_weight=', total_weight)
#print('grams=', grams)
exceed_sum = random.randrange(total_weight)
i = 0
left_sum = grams[i][1]
while True:
if left_sum > exceed_sum: return grams[i][0]
i += 1
left_sum += grams[i][1]
def random_word():
global trigrams
grams = ['<start>']
while grams[-1] != '<end>':
grams.append(random_gram(*trigrams[grams[-1]]))
return get_word(grams)
# Main.
if len(sys.argv) < 2:
print('Usage: %s <text_filename>' % sys.argv[0])
exit(2)
# Read in the words to consider.
f = open(sys.argv[1])
words = []
lines_done = 0
for line in f:
if line[0] == '#': continue
words.extend(line.strip().lower().split(' '))
lines_done += 1
# Uncomment the next line if you're reading in a ridiculously
# large file and want to see progress reports as the file is processed.
#if lines_done % 100 == 0: print('%d lines processed' % lines_done)
f.close()
# Build a markov chain of 3-grams.
#
# Here and below, 'gram' refers to a trigram = 3-gram.
# trigrams[from_gram] = [[[to_gram, weight], ..], total_weight]
#
# I'll use the strings <start> and <end> as special grams to
# denote the start and end of a word.
#
# This structure allows for almost O(log n) insertion time of a new gram,
# and O(n) lookup time to go from one gram to the next. The 'almost'
# is due to the fact that arbitrary inserts in Python lists are, as far
# as I know, linear-time operations.
for word in words: add_word(word)
for i in range(100):
print(random_word())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment