tylerneylon/markov.py

## markov.py
#!/usr/local/bin/python3
#
# markov.py
#
# Reads in a text file and produces a list of random
# words with similar 3-grams to those found in the input file.
#
# Usage: ./markov.py <text_filename>
#

import bisect
import random
import sys

# Globals.
words = []
trigrams = {}

# Functions.
def add_word(word):
  grams = get_grams(word)
  from_gram = None
  for to_gram in grams:
    if from_gram: add_transition(from_gram, to_gram)
    from_gram = to_gram

def get_grams(word):
  if len(word) < 3: return []
  grams = ['<start>']
  for i in range(len(word) - 2):
    grams.append(word[i:i + 3])
  grams.append('<end>')
  return grams

# Inverse of get_grams; expects the list to start with '<start>'
# to end with '<end>', and to have at least one trigram in between.
def get_word(grams):
  chars = list(grams[1])
  for i in range(2, len(grams) - 1): chars.append(grams[i][-1])
  return ''.join(chars)

def add_transition(from_gram, to_gram):
  global trigrams
  if from_gram not in trigrams:
    trigrams[from_gram] = [[[to_gram, 1]], 1]
    return
  # The from_gram is already a key in trigrams.
  from_data = trigrams[from_gram]
  from_data[1] += 1
  insert_gram(from_data[0], to_gram)

# Adds new_gram to the weighted, sorted list of grams.
def insert_gram(grams, new_gram):
  i = bisect.bisect_left(grams, [new_gram])
  if i == len(grams) or grams[i][0] != new_gram:
    # New gram; insert it.
    grams.insert(i, [new_gram, 1])
  else:
    # Repeat gram; increment its weight.
    grams[i][1] += 1

# Returns a gram chosen randomly, using weights, from the sorted grams list.
def random_gram(grams, total_weight):
  #print('total_weight=', total_weight)
  #print('grams=', grams)
  exceed_sum = random.randrange(total_weight)
  i = 0
  left_sum = grams[i][1]
  while True:
    if left_sum > exceed_sum: return grams[i][0]
    i += 1
    left_sum += grams[i][1]

def random_word():
  global trigrams
  grams = ['<start>']
  while grams[-1] != '<end>':
    grams.append(random_gram(*trigrams[grams[-1]]))
  return get_word(grams)

# Main.
if len(sys.argv) < 2:
  print('Usage: %s <text_filename>' % sys.argv[0])
  exit(2)

# Read in the words to consider.
f = open(sys.argv[1])
words = []
lines_done = 0
for line in f:
  if line[0] == '#': continue
  words.extend(line.strip().lower().split(' '))
  lines_done += 1
  # Uncomment the next line if you're reading in a ridiculously
  # large file and want to see progress reports as the file is processed.
  #if lines_done % 100 == 0: print('%d lines processed' % lines_done)
f.close()

# Build a markov chain of 3-grams.
#
# Here and below, 'gram' refers to a trigram = 3-gram.
# trigrams[from_gram] = [[[to_gram, weight], ..], total_weight]
#
# I'll use the strings <start> and <end> as special grams to
# denote the start and end of a word.
#
# This structure allows for almost O(log n) insertion time of a new gram,
# and O(n) lookup time to go from one gram to the next. The 'almost'
# is due to the fact that arbitrary inserts in Python lists are, as far
# as I know, linear-time operations.
for word in words: add_word(word)

for i in range(100):
  print(random_word())
	#!/usr/local/bin/python3
	#
	# markov.py
	#
	# Reads in a text file and produces a list of random
	# words with similar 3-grams to those found in the input file.
	#
	# Usage: ./markov.py <text_filename>
	#

	import bisect
	import random
	import sys

	# Globals.
	words = []
	trigrams = {}

	# Functions.
	def add_word(word):
	grams = get_grams(word)
	from_gram = None
	for to_gram in grams:
	if from_gram: add_transition(from_gram, to_gram)
	from_gram = to_gram

	def get_grams(word):
	if len(word) < 3: return []
	grams = ['<start>']
	for i in range(len(word) - 2):
	grams.append(word[i:i + 3])
	grams.append('<end>')
	return grams

	# Inverse of get_grams; expects the list to start with '<start>'
	# to end with '<end>', and to have at least one trigram in between.
	def get_word(grams):
	chars = list(grams[1])
	for i in range(2, len(grams) - 1): chars.append(grams[i][-1])
	return ''.join(chars)

	def add_transition(from_gram, to_gram):
	global trigrams
	if from_gram not in trigrams:
	trigrams[from_gram] = [[[to_gram, 1]], 1]
	return
	# The from_gram is already a key in trigrams.
	from_data = trigrams[from_gram]
	from_data[1] += 1
	insert_gram(from_data[0], to_gram)

	# Adds new_gram to the weighted, sorted list of grams.
	def insert_gram(grams, new_gram):
	i = bisect.bisect_left(grams, [new_gram])
	if i == len(grams) or grams[i][0] != new_gram:
	# New gram; insert it.
	grams.insert(i, [new_gram, 1])
	else:
	# Repeat gram; increment its weight.
	grams[i][1] += 1

	# Returns a gram chosen randomly, using weights, from the sorted grams list.
	def random_gram(grams, total_weight):
	#print('total_weight=', total_weight)
	#print('grams=', grams)
	exceed_sum = random.randrange(total_weight)
	i = 0
	left_sum = grams[i][1]
	while True:
	if left_sum > exceed_sum: return grams[i][0]
	i += 1
	left_sum += grams[i][1]

	def random_word():
	global trigrams
	grams = ['<start>']
	while grams[-1] != '<end>':
	grams.append(random_gram(*trigrams[grams[-1]]))
	return get_word(grams)

	# Main.
	if len(sys.argv) < 2:
	print('Usage: %s <text_filename>' % sys.argv[0])
	exit(2)

	# Read in the words to consider.
	f = open(sys.argv[1])
	words = []
	lines_done = 0
	for line in f:
	if line[0] == '#': continue
	words.extend(line.strip().lower().split(' '))
	lines_done += 1
	# Uncomment the next line if you're reading in a ridiculously
	# large file and want to see progress reports as the file is processed.
	#if lines_done % 100 == 0: print('%d lines processed' % lines_done)
	f.close()

	# Build a markov chain of 3-grams.
	#
	# Here and below, 'gram' refers to a trigram = 3-gram.
	# trigrams[from_gram] = [[[to_gram, weight], ..], total_weight]
	#
	# I'll use the strings <start> and <end> as special grams to
	# denote the start and end of a word.
	#
	# This structure allows for almost O(log n) insertion time of a new gram,
	# and O(n) lookup time to go from one gram to the next. The 'almost'
	# is due to the fact that arbitrary inserts in Python lists are, as far
	# as I know, linear-time operations.
	for word in words: add_word(word)

	for i in range(100):
	print(random_word())