@laurenrace
Created April 27, 2018 03:15
rwetfinal
# coding: utf-8
# In[1]:
lines = [item.strip() for item in open('letranger.txt').readlines()]
# In[2]:
no_camus_lines = [item for item in lines if not(item.startswith("Albert Camus"))]
# In[3]:
import re
# In[4]:
no_weird_brackets_lines = [re.sub(r"\[\d+\]", "", item) for item in no_camus_lines]
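# In[ ]:
# Quick check (illustrative only, not from the original notebook): the pattern
# above strips bracketed footnote markers such as "[12]" and leaves the rest of
# the line untouched.
re.sub(r"\[\d+\]", "", "Aujourd'hui, maman est morte.[1]")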
# In[5]:
no_weird_brackets_lines
# In[6]:
with open('letranger_clean.txt', 'w') as fh:
    for item in no_weird_brackets_lines:
        fh.write(item + "\n")
# In[7]:
with open('letranger.txt', 'r') as f:
    src = f.read()
# src = "Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum."
src += "$" # to indicate the end of the string
model = {}
for i in range(len(src)-2):
    ngram = tuple(src[i:i+2]) # get a slice of length 2 from current position
    next_item = src[i+2] # next item is current index plus two (i.e., right after the slice)
    if ngram not in model: # check if we've already seen this ngram; if not...
        model[ngram] = [] # value for this key is an empty list
    model[ngram].append(next_item) # append this next item to the list for this ngram
# In[8]:
model
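# In[ ]:
# Illustration (added, not from the original notebook): the same bigram-to-next-
# character structure built from a tiny hand-made string, so the shape of the
# dictionary above is easy to see at a glance.
demo_src = "banana$"
demo_model = {}
for i in range(len(demo_src)-2):
    demo_ngram = tuple(demo_src[i:i+2])
    if demo_ngram not in demo_model:
        demo_model[demo_ngram] = []
    demo_model[demo_ngram].append(demo_src[i+2])
demo_model
# -> {('b', 'a'): ['n'], ('a', 'n'): ['a', 'a'], ('n', 'a'): ['n', '$']}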
# In[9]:
import random
def gen_from_model(n, model, start=None, max_gen=100):
    # pick a random starting n-gram if none was given
    if start is None:
        start = random.choice(list(model.keys()))
    output = list(start)
    for i in range(max_gen):
        start = tuple(output[-n:])
        if start not in model:
            # no recorded continuation for this n-gram (e.g. we reached the end marker)
            break
        next_item = random.choice(model[start])
        output.append(next_item)
    return output
# In[10]:
generated = ''.join(gen_from_model(2, model, ('s', 'p'), 400))
print(generated)
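# In[ ]:
# Variation (illustrative): let gen_from_model pick its own random seed bigram
# instead of starting from ('s', 'p').
print(''.join(gen_from_model(2, model, max_gen=200)))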
# In[11]:
text = open("letranger.txt").read()
words = text.split()
# In[12]:
words.append("$")
model2 = {}
for i in range(len(words)-2):
    ngram = tuple(words[i:i+2]) # get a slice of length 2 from current position
    next_item = words[i+2] # next item is current index plus two (i.e., right after the slice)
    if ngram not in model2: # check if we've already seen this ngram; if not...
        model2[ngram] = [] # value for this key is an empty list
    model2[ngram].append(next_item) # append this next item to the list for this ngram
# In[13]:
model2
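# In[ ]:
# Illustrative lookup (added): the continuations recorded for one word bigram.
# ('je', 'ne') is the seed used further down; .get() returns [] if that exact
# pair happens not to occur in this text.
model2.get(('je', 'ne'), [])[:10]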
# In[14]:
pairs = [(words[i], words[i+1]) for i in range(len(words)-1)]
# In[15]:
pairs = []
for i in range(len(words)-1):
    this_pair = (words[i], words[i+1])
    pairs.append(this_pair)
# In[16]:
pairs[:10]
# In[17]:
from collections import Counter
# In[18]:
pair_counts = Counter(pairs)
# In[19]:
pair_counts.most_common(20)
# In[20]:
' '.join(gen_from_model(2, model2, ('je', 'ne'), 100))
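# In[ ]:
# Variation (illustrative): the same word-level generation, but letting the
# function choose a random seed pair.
' '.join(gen_from_model(2, model2, max_gen=60))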
# In[21]:
get_ipython().system('conda install -y keras')
# In[22]:
get_ipython().system('pip install textgenrnn')
# In[23]:
from textgenrnn import textgenrnn
# In[24]:
textgen = textgenrnn('letranger_weights.hdf5')
# In[ ]:
textgen.generate()
# In[ ]:
textgen.train_on_texts(open("letranger.txt").readlines(), num_epochs=20)
# In[ ]:
poem = textgen.generate(8, temperature=0.4, return_as_list=True)
for line in poem:
    print(line.strip())
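# In[ ]:
# Optional sketch (not part of the original run): sample the retrained model at
# a low and a higher temperature to compare how conservative vs. varied the
# lines get; generate(), temperature and return_as_list are the same arguments
# used above.
for temp in (0.2, 0.8):
    print("temperature:", temp)
    for line in textgen.generate(3, temperature=temp, return_as_list=True):
        print("  " + line.strip())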
# In[ ]:
textgen.save('letranger_weights.hdf5')