# ssophwang/Poet.py

## Poet.py
import re, random
import numpy as np
import json
# Plain-text poem files, opened relative to the current working directory.
poem_files = ['pg17192.txt', 'Where the Sidewalk Ends by Shel Silverstein_djvu.txt']
# Tokenized corpus: one list of word tokens per retained line.
poem_lines = []
# First-word tallies and the probabilities derived from them.
firstword_count_dict = {}
firstword_prob_dict = {}

for poem_file in poem_files:
    # `with` closes the handle even if reading or tokenizing raises.
    with open(poem_file, 'r') as f:
        for line in f:
            # \w+ keeps runs of word characters and drops punctuation
            # and whitespace.
            words = re.findall(r'\w+', line)
            # Lines with fewer than two tokens (blank lines included) are
            # left out of the corpus.
            if len(words) > 1:
                poem_lines.append(words)

#print poem_lines

# Every token of the corpus, flattened in reading order.
words = [token for verse in poem_lines for token in verse]
# The distinct tokens seen anywhere in the corpus.
unicorns = set(words)
# How many lines the corpus holds.
lines = len(poem_lines)
# How many lines end with the exact token 'weary'.
last = sum(1 for verse in poem_lines if len(verse) >= 1 and verse[-1] == 'weary')

# firstword_counts = {}
#
# for line in poem_lines:
#     if len(line) > 2:
#         first_word = line[0]
#
#         if first_word in firstword_counts:
#             firstword_counts[first_word] += 1
#         else:
#             firstword_counts[first_word] = 1
#
# print firstword_counts
# words = words.replace(',', '').replace('.', ' ')
#
# word_states = re.findall('\w+', words)
#
# print word_states
# print len(set(word_states))
#
# counts_dict = {}
#
# for i in range(len(word_states)-1):
#     first_word = word_states[i]
#     next_word = word_states[i+1]
#
#     if (first_word, next_word) in counts_dict:
#         counts_dict[(first_word,next_word)] += 1
#     else:
#         counts_dict[(first_word,next_word)] = 1
#
# transition_probabilities = {}
# s = sum(counts_dict.values())
#
#
# for key in counts_dict:
#     transition_probabilities[key] = float(counts_dict[key])/s
# print transition_probabilities
# #    float(counts_dict.keys())/s
# #for x in range(10):
# #    sentence = ''
# #    for i in range(10):
# #        word = results[random.randint(0, len(results) - 1)]
# #        sentence += ' ' + word
# #    print sentence

# next_word_counts_dict = {}
# for word_list in poem_lines:
#     word_list.append('\n')
#     for i in range(len(word_list) - 1):
#         thisword = word_list[i]
#         nextword = word_list[i+1]
#
#         if thisword not in next_word_counts_dict:
#             next_word_counts_dict[thisword] = {nextword:1}
#         else:
#             if nextword not in next_word_counts_dict[thisword]:
#                 next_word_counts_dict[thisword][nextword] = 1
#             else:
#                 next_word_counts_dict[thisword][nextword] += 1

# Tally how often each word opens a line.
for verse in poem_lines:
    if not verse:
        continue
    opener = verse[0]
    firstword_count_dict[opener] = firstword_count_dict.get(opener, 0) + 1

# Total number of counted line openings.
count_sum = sum(firstword_count_dict.values())

# Turn each tally into a relative frequency; float() forces true division.
for first_word, tally in firstword_count_dict.items():
    firstword_prob_dict[first_word] = tally / float(count_sum)

print(firstword_prob_dict)

# Transition table: word -> {following token -> weight}.
# It holds raw counts after the first loop and, despite its name,
# probabilities after the second loop.
next_word_counts_dict = {}
for word_list in poem_lines:
    # Pair every word with its successor; the final word of a line is
    # paired with '\n', which acts as the end-of-line marker.
    for thisword, nextword in zip(word_list, word_list[1:] + ['\n']):
        followers = next_word_counts_dict.setdefault(thisword, {})
        followers[nextword] = followers.get(nextword, 0) + 1

# Normalize each word's follower counts so that they sum to 1.
# Only existing keys are reassigned, so mutating while iterating is safe.
for followers in next_word_counts_dict.values():
    count_sum = sum(followers.values())
    for nextword in followers:
        followers[nextword] = float(followers[nextword]) / count_sum

# Parallel lists in the form np.random.choice expects:
# first_states[k] is a candidate opening word and pp[k] its probability.
first_states = list(firstword_prob_dict)
pp = [firstword_prob_dict[opener] for opener in first_states]
print(first_states)
print(pp)

#for i in range(len(next_word_counts_dict.items)):
#print next_word_counts_dict.items()

# Generate and print 1000 lines, each at most 10 words long.
for x in range(1000):
    # Draw the opening word from the first-word distribution.
    state = np.random.choice(first_states, p = pp)
    sentence = [state]
    for i in range(9):
        # Only the '\n' end-of-line marker has no entry in the table, so
        # once it has been drawn there is nothing left to sample.
        if state not in next_word_counts_dict:
            break
        successors = next_word_counts_dict[state]
        # keys() and values() iterate in matching order on an unmodified dict.
        next_states = list(successors.keys())
        p = list(successors.values())
        state = np.random.choice(next_states, p = p)
        sentence.append(state)
    # The end-of-line marker is bookkeeping only; keep it out of the output.
    print(' '.join([w for w in sentence if w != '\n']))


#for word in next_word_counts_dict:
#    print next_word_counts_dict[word].items()
 #   np.random.choice(word, p=)
# word, ':', next_word_counts_dict[word]
# Number of distinct words that have an entry in the transition table.
print(len(next_word_counts_dict))

# Save the transition table as JSON; `with` closes the file even if
# serializing or writing raises.
with open('poem_model.json', "w") as output_file:
    output_file.write(json.dumps(next_word_counts_dict))

# Save the first-word probabilities the same way.
with open('poem_model_firstword.json', "w") as output_file2:
    output_file2.write(json.dumps(firstword_prob_dict))
	import re, random
	import numpy as np
	import json
	# Plain-text poem files, opened relative to the current working directory.
	poem_files = ['pg17192.txt', 'Where the Sidewalk Ends by Shel Silverstein_djvu.txt']
	# Tokenized corpus: one list of word tokens per retained line.
	poem_lines = []
	# First-word tallies and the probabilities derived from them.
	firstword_count_dict = {}
	firstword_prob_dict = {}

	for poem_file in poem_files:
	    # `with` closes the handle even if reading or tokenizing raises.
	    with open(poem_file, 'r') as f:
	        for line in f:
	            # \w+ keeps runs of word characters and drops punctuation
	            # and whitespace.
	            words = re.findall(r'\w+', line)
	            # Lines with fewer than two tokens (blank lines included)
	            # are left out of the corpus.
	            if len(words) > 1:
	                poem_lines.append(words)

	#print poem_lines

	# Distinct tokens seen anywhere in the corpus.
	unicorns = set()
	# Every token of the corpus, flattened in reading order.
	words = []
	# How many lines end with the exact token 'weary'.
	last = 0
	# How many lines the corpus holds.
	lines = 0
	for line in poem_lines:
	    lines += 1
	    if line and line[-1] == 'weary':
	        last += 1
	    unicorns.update(line)
	    words.extend(line)

	# firstword_counts = {}
	#
	# for line in poem_lines:
	# if len(line) > 2:
	# first_word = line[0]
	#
	# if first_word in firstword_counts:
	# firstword_counts[first_word] += 1
	# else:
	# firstword_counts[first_word] = 1
	#
	# print firstword_counts
	# words = words.replace(',', '').replace('.', ' ')
	#
	# word_states = re.findall('\w+', words)
	#
	# print word_states
	# print len(set(word_states))
	#
	# counts_dict = {}
	#
	# for i in range(len(word_states)-1):
	# first_word = word_states[i]
	# next_word = word_states[i+1]
	#
	# if (first_word, next_word) in counts_dict:
	# counts_dict[(first_word,next_word)] += 1
	# else:
	# counts_dict[(first_word,next_word)] = 1
	#
	# transition_probabilities = {}
	# s = sum(counts_dict.values())
	#
	#
	# for key in counts_dict:
	# transition_probabilities[key] = float(counts_dict[key])/s
	# print transition_probabilities
	# # float(counts_dict.keys())/s
	# #for x in range(10):
	# # sentence = ''
	# # for i in range(10):
	# # word = results[random.randint(0, len(results) - 1)]
	# # sentence += ' ' + word
	# # print sentence

	# next_word_counts_dict = {}
	# for word_list in poem_lines:
	# word_list.append('\n')
	# for i in range(len(word_list) - 1):
	# thisword = word_list[i]
	# nextword = word_list[i+1]
	#
	# if thisword not in next_word_counts_dict:
	# next_word_counts_dict[thisword] = {nextword:1}
	# else:
	# if nextword not in next_word_counts_dict[thisword]:
	# next_word_counts_dict[thisword][nextword] = 1
	# else:
	# next_word_counts_dict[thisword][nextword] += 1

	# Tally how often each word opens a line.
	for line in poem_lines:
	    if len(line) > 0:
	        first_word = line[0]
	        firstword_count_dict[first_word] = firstword_count_dict.get(first_word, 0) + 1

	# Total number of counted line openings.
	count_sum = sum(firstword_count_dict.values())

	# Turn each tally into a relative frequency; float() forces true division.
	for first_word in firstword_count_dict:
	    firstword_prob_dict[first_word] = firstword_count_dict[first_word]/float(count_sum)

	print(firstword_prob_dict)

	# Transition table: word -> {following token -> weight}.
	# It holds raw counts after the first loop and, despite its name,
	# probabilities after the second loop.
	next_word_counts_dict = {}
	for word_list in poem_lines:
	    # Pair every word with its successor; the final word of a line is
	    # paired with '\n', which acts as the end-of-line marker.
	    for thisword, nextword in zip(word_list, word_list[1:] + ['\n']):
	        followers = next_word_counts_dict.setdefault(thisword, {})
	        followers[nextword] = followers.get(nextword, 0) + 1

	# Normalize each word's follower counts so that they sum to 1.
	# Only existing keys are reassigned, so mutating while iterating is safe.
	for followers in next_word_counts_dict.values():
	    count_sum = sum(followers.values())
	    for nextword in followers:
	        followers[nextword] = float(followers[nextword]) / count_sum

	# Parallel lists in the form np.random.choice expects:
	# first_states[k] is a candidate opening word and pp[k] its probability.
	first_states = []
	pp = []

	for key in firstword_prob_dict:
	    first_states.append(key)
	    pp.append(firstword_prob_dict[key])
	print(first_states)
	print(pp)

	#for i in range(len(next_word_counts_dict.items)):
	#print next_word_counts_dict.items()

	# Generate and print 1000 lines, each at most 10 words long.
	for x in range(1000):
	    # Draw the opening word from the first-word distribution.
	    state = np.random.choice(first_states, p = pp)
	    sentence = [state]
	    for i in range(9):
	        # Only the '\n' end-of-line marker has no entry in the table, so
	        # once it has been drawn there is nothing left to sample.
	        if state not in next_word_counts_dict:
	            break
	        successors = next_word_counts_dict[state]
	        # keys() and values() iterate in matching order on an unmodified dict.
	        next_states = list(successors.keys())
	        p = list(successors.values())
	        state = np.random.choice(next_states, p = p)
	        sentence.append(state)
	    # The end-of-line marker is bookkeeping only; keep it out of the output.
	    print(' '.join([w for w in sentence if w != '\n']))



	#for word in next_word_counts_dict:
	# print next_word_counts_dict[word].items()
	# np.random.choice(word, p=)
	# word, ':', next_word_counts_dict[word]
	# Number of distinct words that have an entry in the transition table.
	print(len(next_word_counts_dict))

	# Save the transition table as JSON; `with` closes the file even if
	# serializing or writing raises.
	with open('poem_model.json', "w") as output_file:
	    output_file.write(json.dumps(next_word_counts_dict))

	# Save the first-word probabilities the same way.
	with open('poem_model_firstword.json', "w") as output_file2:
	    output_file2.write(json.dumps(firstword_prob_dict))