Created
January 31, 2014 00:27
-
-
Save tylerneylon/8723087 to your computer and use it in GitHub Desktop.
Generate random real-sounding words using an input text file as a basis; uses Markov chains of 3-grams to internally build the new random words.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/local/bin/python3 | |
# | |
# markov.py | |
# | |
# Reads in a text file and produces a list of random | |
# words with similar 3-grams to those found in the input file. | |
# | |
# Usage: ./markov.py <text_filename> | |
# | |
import bisect | |
import random | |
import sys | |
# Globals. | |
words = [] | |
trigrams = {} | |
# Functions. | |
def add_word(word): | |
grams = get_grams(word) | |
from_gram = None | |
for to_gram in grams: | |
if from_gram: add_transition(from_gram, to_gram) | |
from_gram = to_gram | |
def get_grams(word): | |
if len(word) < 3: return [] | |
grams = ['<start>'] | |
for i in range(len(word) - 2): | |
grams.append(word[i:i + 3]) | |
grams.append('<end>') | |
return grams | |
# Inverse of get_grams; expects the list to start with '<start>' | |
# to end with '<end>', and to have at least one trigram in between. | |
def get_word(grams): | |
chars = list(grams[1]) | |
for i in range(2, len(grams) - 1): chars.append(grams[i][-1]) | |
return ''.join(chars) | |
def add_transition(from_gram, to_gram): | |
global trigrams | |
if from_gram not in trigrams: | |
trigrams[from_gram] = [[[to_gram, 1]], 1] | |
return | |
# The from_gram is already a key in trigrams. | |
from_data = trigrams[from_gram] | |
from_data[1] += 1 | |
insert_gram(from_data[0], to_gram) | |
# Adds new_gram to the weighted, sorted list of grams. | |
def insert_gram(grams, new_gram): | |
i = bisect.bisect_left(grams, [new_gram]) | |
if i == len(grams) or grams[i][0] != new_gram: | |
# New gram; insert it. | |
grams.insert(i, [new_gram, 1]) | |
else: | |
# Repeat gram; increment its weight. | |
grams[i][1] += 1 | |
# Returns a gram chosen randomly, using weights, from the sorted grams list. | |
def random_gram(grams, total_weight): | |
#print('total_weight=', total_weight) | |
#print('grams=', grams) | |
exceed_sum = random.randrange(total_weight) | |
i = 0 | |
left_sum = grams[i][1] | |
while True: | |
if left_sum > exceed_sum: return grams[i][0] | |
i += 1 | |
left_sum += grams[i][1] | |
def random_word(): | |
global trigrams | |
grams = ['<start>'] | |
while grams[-1] != '<end>': | |
grams.append(random_gram(*trigrams[grams[-1]])) | |
return get_word(grams) | |
# Main. | |
if len(sys.argv) < 2: | |
print('Usage: %s <text_filename>' % sys.argv[0]) | |
exit(2) | |
# Read in the words to consider. | |
f = open(sys.argv[1]) | |
words = [] | |
lines_done = 0 | |
for line in f: | |
if line[0] == '#': continue | |
words.extend(line.strip().lower().split(' ')) | |
lines_done += 1 | |
# Uncomment the next line if you're reading in a ridiculously | |
# large file and want to see progress reports as the file is processed. | |
#if lines_done % 100 == 0: print('%d lines processed' % lines_done) | |
f.close() | |
# Build a markov chain of 3-grams. | |
# | |
# Here and below, 'gram' refers to a trigram = 3-gram. | |
# trigrams[from_gram] = [[[to_gram, weight], ..], total_weight] | |
# | |
# I'll use the strings <start> and <end> as special grams to | |
# denote the start and end of a word. | |
# | |
# This structure allows for almost O(log n) insertion time of a new gram, | |
# and O(n) lookup time to go from one gram to the next. The 'almost' | |
# is due to the fact that arbitrary inserts in Python lists are, as far | |
# as I know, linear-time operations. | |
for word in words: add_word(word) | |
for i in range(100): | |
print(random_word()) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment