Skip to content

Instantly share code, notes, and snippets.

@ssophwang
Created March 25, 2017 15:24
Show Gist options
  • Save ssophwang/3ee0c8ce3e0fc5f74443dcbcd87386a7 to your computer and use it in GitHub Desktop.
Save ssophwang/3ee0c8ce3e0fc5f74443dcbcd87386a7 to your computer and use it in GitHub Desktop.
import re, random
import numpy as np
import json
# Plain-text poem sources the model is trained on.
poem_files = ['pg17192.txt', 'Where the Sidewalk Ends by Shel Silverstein_djvu.txt']
# Every kept source line, stored as its list of word tokens.
poem_lines = []
# First word of a line -> occurrence count / probability (populated later in the script).
firstword_count_dict = {}
firstword_prob_dict = {}

# Raw string so that \w reaches the regex engine untouched; compiled once
# instead of being looked up again for every line.
word_pattern = re.compile(r'\w+')

for poem_file in poem_files:
    # `with` guarantees the handle is closed even if reading fails part-way.
    with open(poem_file, 'r') as f:
        for line in f:
            # Skip lines that hold at most a single character (e.g. a bare newline).
            if len(line) > 1:
                words = word_pattern.findall(line)
                # Only lines with at least two words are kept.
                if len(words) > 1:
                    poem_lines.append(words)
#print poem_lines
# Exploratory corpus statistics over the tokenised lines.
# Every token in corpus order (duplicates kept).
words = [token for tokens in poem_lines for token in tokens]
# Distinct tokens.
unicorns = set(words)
# Number of kept lines.
lines = len(poem_lines)
# Number of lines whose final word is 'weary'.
last = sum(1 for tokens in poem_lines if tokens and tokens[-1] == 'weary')
# firstword_counts = {}
#
# for line in poem_lines:
# if len(line) > 2:
# first_word = line[0]
#
# if first_word in firstword_counts:
# firstword_counts[first_word] += 1
# else:
# firstword_counts[first_word] = 1
#
# print firstword_counts
# words = words.replace(',', '').replace('.', ' ')
#
# word_states = re.findall('\w+', words)
#
# print word_states
# print len(set(word_states))
#
# counts_dict = {}
#
# for i in range(len(word_states)-1):
# first_word = word_states[i]
# next_word = word_states[i+1]
#
# if (first_word, next_word) in counts_dict:
# counts_dict[(first_word,next_word)] += 1
# else:
# counts_dict[(first_word,next_word)] = 1
#
# transition_probabilities = {}
# s = sum(counts_dict.values())
#
#
# for key in counts_dict:
# transition_probabilities[key] = float(counts_dict[key])/s
# print transition_probabilities
# # float(counts_dict.keys())/s
# #for x in range(10):
# # sentence = ''
# # for i in range(10):
# # word = results[random.randint(0, len(results) - 1)]
# # sentence += ' ' + word
# # print sentence
# next_word_counts_dict = {}
# for word_list in poem_lines:
# word_list.append('\n')
# for i in range(len(word_list) - 1):
# thisword = word_list[i]
# nextword = word_list[i+1]
#
# if thisword not in next_word_counts_dict:
# next_word_counts_dict[thisword] = {nextword:1}
# else:
# if nextword not in next_word_counts_dict[thisword]:
# next_word_counts_dict[thisword][nextword] = 1
# else:
# next_word_counts_dict[thisword][nextword] += 1
# Tally how often each word opens a line.
for tokens in poem_lines:
    if not tokens:
        continue
    opener = tokens[0]
    firstword_count_dict[opener] = firstword_count_dict.get(opener, 0) + 1

# Convert the tallies into relative frequencies (each count divided by the
# total number of counted line openers).
count_sum = sum(firstword_count_dict.values())
for opener, tally in firstword_count_dict.items():
    firstword_prob_dict[opener] = tally / float(count_sum)

# Single-argument print(...) produces the same output as the print statement.
print(firstword_prob_dict)
# Transition table: word -> {following word -> count}, later normalised in
# place to word -> {following word -> probability}.
next_word_counts_dict = {}
for word_list in poem_lines:
    # Pair every word with its successor; the last word of a line is paired
    # with '\n', which acts as the end-of-line marker.
    for thisword, nextword in zip(word_list, word_list[1:] + ['\n']):
        followers = next_word_counts_dict.setdefault(thisword, {})
        followers[nextword] = followers.get(nextword, 0) + 1

# Normalise each word's follower counts so that they sum to 1.
for followers in next_word_counts_dict.values():
    count_sum = float(sum(followers.values()))
    # Only values are reassigned (no keys added or removed), so updating the
    # dict while iterating over its keys is safe.
    for nextword in followers:
        followers[nextword] /= count_sum
# Split the first-word distribution into two parallel lists: the candidate
# words and, at the same index, the probability of each one. This is the
# form passed to np.random.choice further down.
first_states = list(firstword_prob_dict)
pp = [firstword_prob_dict[candidate] for candidate in first_states]
print(first_states)
print(pp)
#for i in range(len(next_word_counts_dict.items)):
#print next_word_counts_dict.items()
# Generate 1000 lines of at most 10 words each by walking the Markov chain.
for x in range(1000):
    # Start from a word drawn from the line-opening distribution.
    state = np.random.choice(first_states, p=pp)
    sentence = [state]
    for i in range(9):
        # A state without outgoing transitions (the '\n' end-of-line marker)
        # ends the line. Stop here rather than sampling from empty
        # candidate lists.
        if state not in next_word_counts_dict:
            break
        transitions = next_word_counts_dict[state]
        # keys() and values() of an unmodified dict are in matching order.
        next_states = list(transitions.keys())
        p = list(transitions.values())
        state = np.random.choice(next_states, p=p)
        sentence.append(state)
    # Drop the end-of-line marker before printing the generated line.
    print(' '.join([w for w in sentence if w != '\n']))
#for word in next_word_counts_dict:
# print next_word_counts_dict[word].items()
# np.random.choice(word, p=)
# word, ':', next_word_counts_dict[word]
# Report how many distinct words have an entry in the transition table.
print(len(next_word_counts_dict))

# Persist both parts of the model as JSON. `with` closes each file even if
# serialisation or the write raises.
with open('poem_model.json', "w") as output_file:
    output_file.write(json.dumps(next_word_counts_dict))

with open('poem_model_firstword.json', "w") as output_file2:
    output_file2.write(json.dumps(firstword_prob_dict))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment