This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# For every file in the corpus, print three ratios followed by the file id:
# characters per word, words per sentence, and words per distinct
# lowercased word.
# NOTE(review): `gutenberg` is not defined in this snippet -- presumably
# nltk.corpus.gutenberg, imported elsewhere; confirm.
for fileid in gutenberg.fileids():
    words = gutenberg.words(fileid)  # fetched once, reused for both counts
    num_chars = len(gutenberg.raw(fileid))
    num_words = len(words)
    num_sents = len(gutenberg.sents(fileid))
    # Lowercase before de-duplicating so 'The' and 'the' count as one item.
    num_vocab = len({w.lower() for w in words})
    print(int(num_chars / num_words), int(num_words / num_sents), int(num_words / num_vocab), fileid)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def sample(preds, temperature=1.0):
    """Draw one index from ``preds`` after reweighting it by ``temperature``.

    The log of each entry is divided by ``temperature``, the result is
    exponentiated and renormalised to sum to 1, and a single multinomial
    draw is taken from that distribution.

    Args:
        preds: Array-like of non-negative weights, one per candidate index.
        temperature: Divisor applied to the log-weights. Values below 1
            sharpen the distribution towards the largest entry; values
            above 1 flatten it.

    Returns:
        The index (NumPy integer) selected by the draw.
    """
    # float64 keeps the renormalised probabilities precise enough for the
    # sum check that np.random.multinomial performs on its pvals argument.
    preds = np.asarray(preds).astype('float64')
    logits = np.log(preds) / temperature
    # Shift so the largest logit is 0. The normalised result is mathematically
    # unchanged, but at small temperatures this stops every exp() from
    # underflowing to 0, which would make the division below 0/0 (NaN).
    logits = logits - np.max(logits)
    exp_preds = np.exp(logits)
    preds = exp_preds / np.sum(exp_preds)
    # One trial, one experiment: a (1, n) one-hot row marking the drawn index.
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)
# Print the epoch number for epochs 1 through 59 (range's end is exclusive).
for epoch in range(1, 60):
    print('epoch', epoch)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from keras import layers

# Build and compile a two-layer sequential network: an LSTM with 128 units
# followed by a softmax Dense layer with one output per entry of `chars`.
# NOTE(review): `keras`, `maxlen` and `chars` are not defined in this snippet
# and must already be in scope when it runs.
model = keras.models.Sequential()
model.add(layers.LSTM(128, input_shape=(maxlen, len(chars))))
model.add(layers.Dense(len(chars), activation='softmax'))
# `learning_rate` is the current keyword; the older `lr` alias is deprecated
# and rejected by recent Keras releases.
optimizer = keras.optimizers.RMSprop(learning_rate=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Slice `text` into overlapping fixed-length windows and record, for each
# window, the single character that immediately follows it.
# NOTE(review): `text` is not defined in this snippet; it must already be in
# scope when this runs.
maxlen = 60  # length of every window, in characters
step = 3  # distance between the starts of consecutive windows
sentences = []  # the extracted windows
next_chars = []  # next_chars[k] is the character right after sentences[k]
# Stopping at len(text) - maxlen guarantees text[i + maxlen] is a valid index.
for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i: i + maxlen])
    next_chars.append(text[i + maxlen])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import keras
import numpy as np

# Fetch the corpus file via keras.utils.get_file and load it lowercased.
path = keras.utils.get_file('nietzsche.txt', origin='https://s3.amazonaws.com/text-datasets/nietzsche.txt')
# The context manager closes the handle deterministically; the explicit
# encoding keeps decoding independent of the platform's default locale.
with open(path, encoding='utf-8') as f:
    text = f.read().lower()
print('Corpus length:', len(text))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
babelize_shell()
Babel> The pig that John found looked happy
Babel> german
Babel> run
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
babelize_shell()
NLTK Babelizer: type 'help' for a list of commands.
Babel> how long before the next flight to Alice Springs?
Babel> german
Babel> run
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Label each token of `sent1` by its casing: all-lowercase, titlecase, or
# (anything else) punctuation.
# NOTE(review): `sent1` is not defined in this snippet; it must already be in
# scope when this runs.
for token in sent1:
    if token.islower():
        print(token, 'is a lowercase word')
    elif token.istitle():
        print(token, 'is a titlecase word')
    else:
        # Fallback branch: everything that is neither lowercase nor titlecase
        # is reported as punctuation.
        print(token, 'is punctuation')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Print every token of the sentence that ends with the letter 'l'.
sent1 = ['Call', 'me', 'Ishmael', '.']
for xyzzy in sent1:
    if xyzzy.endswith('l'):
        print(xyzzy)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Count the distinct alphabetic tokens of `text1`, ignoring case.
# NOTE(review): `text1` is not defined in this snippet. The trailing number is
# the original annotation, presumably the expected result; it was written as
# `//16948`, which Python evaluates as floor division rather than a comment.
len({word.lower() for word in text1 if word.isalpha()})  # 16948
NewerOlder