Created
July 17, 2018 03:24
-
-
Save afhuertass/fad97fa9f45de9ea2c3ec8c2e5b0ad43 to your computer and use it in GitHub Desktop.
petrobot trainer
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
"""petro_trainer.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1CMt3EPhTu2wXFWUfYCxoV0ZGjA7ElJyM
"""
!apt-get install -y -qq software-properties-common python-software-properties module-init-tools | |
!add-apt-repository -y ppa:alessandro-strada/ppa 2>&1 > /dev/null | |
!apt-get update -qq 2>&1 > /dev/null | |
!apt-get -y install -qq google-drive-ocamlfuse fuse | |
from google.colab import auth | |
auth.authenticate_user() | |
from oauth2client.client import GoogleCredentials | |
creds = GoogleCredentials.get_application_default() | |
import getpass | |
!google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret} < /dev/null 2>&1 | grep URL | |
vcode = getpass.getpass() | |
!echo {vcode} | google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret} | |
!mkdir -p drive | |
!google-drive-ocamlfuse drive | |
!pip install -q keras | |
!pip install -q unidecode | |
import pandas as pd | |
import numpy as np | |
import re | |
import unidecode | |
from keras.models import Sequential | |
from keras.layers import LSTM, Dropout, Activation, Dense , CuDNNLSTM | |
import string | |
from keras.callbacks import ModelCheckpoint, EarlyStopping | |
from tensorflow.python.client import device_lib | |
from keras import backend as K | |
# Ask the Keras TensorFlow backend which GPUs it can see. The return value is
# not stored, so it is only visible as notebook cell output.
# NOTE(review): this is a private backend helper - confirm it exists in the
# installed Keras version.
K.tensorflow_backend._get_available_gpus()

# Matches an http:// or https:// link anywhere in the text, up to the next
# whitespace. The previous patterns were anchored with "^" and therefore only
# removed links at the very start of a line (together with the rest of that
# line), leaving links in the middle or at the end of a tweet untouched.
URL_PATTERN = re.compile(r'https?://\S+')


def clean_text(text):
    """Normalise one tweet: lower-case it, drop URLs, transliterate via unidecode.

    Args:
        text: the raw tweet as a string.

    Returns:
        The cleaned string produced by ``unidecode.unidecode``.
    """
    text = text.lower()
    text = URL_PATTERN.sub('', text)
    return unidecode.unidecode(text)


df = pd.read_csv("./drive/petrobot/datasets/petrogustavo.csv", encoding="utf-8")
df["text"] = df["text"].apply(clean_text)

# Vocabulary: every character of string.printable except vertical tab (\x0b)
# and form feed (\x0c). The list position of a character is its one-hot index.
characters = list(string.printable)
characters.remove('\x0b')
characters.remove('\x0c')
VOCABULARY_SIZE = len(characters)
characters_to_ix = {c: i for i, c in enumerate(characters)}
print("vocabulary len = %d" % VOCABULARY_SIZE)
print(characters)

# Hyper-parameters shared by the model builder, batch generator and training.
N_GPU = 1  # you can experiment with more GPUs, it gets interesting with a high SEQUENCE_LEN
SEQUENCE_LEN = 100  # characters per input window
BATCH_SIZE = 512  # samples per generated batch
EPOCHS = 20  # upper bound on training epochs
HIDDEN_LAYERS_DIM = 256  # units in each recurrent layer
LAYER_COUNT = 4  # number of stacked recurrent layers
DROPOUT = 0.2  # rate passed to each Dropout layer
def build_model(layer_count, sequence_len, vocab_size, dropout):
    """Build and compile the stacked recurrent character-level model.

    The network is ``layer_count`` CuDNNLSTM layers of ``HIDDEN_LAYERS_DIM``
    units, each followed by Dropout, then a Dense layer with one output per
    vocabulary entry and a softmax activation.

    Args:
        layer_count: number of CuDNNLSTM layers to stack.
        sequence_len: number of time steps in every input window.
        vocab_size: width of the one-hot character encoding; used for both
            the input shape and the size of the output layer.
        dropout: rate passed to each Dropout layer.

    Returns:
        The Sequential model, compiled with categorical cross-entropy loss
        and the Adam optimizer.
    """
    hidden_layers_dim = HIDDEN_LAYERS_DIM
    last_layer = layer_count - 1

    model = Sequential()
    for i in range(layer_count):
        model.add(
            CuDNNLSTM(
                hidden_layers_dim,
                # Only the last recurrent layer collapses the sequence to a
                # single vector; the others pass the full sequence on.
                return_sequences=(i != last_layer),
                input_shape=(sequence_len, vocab_size),
            )
        )
        # NOTE(review): the original indentation was lost; dropout is applied
        # after every recurrent layer here - confirm this matches the intent.
        model.add(Dropout(dropout))

    # Size the output from the parameter rather than the module-level
    # VOCABULARY_SIZE, so the model always agrees with its declared input.
    model.add(Dense(vocab_size))
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy', optimizer="adam")
    return model
def describe_batch(X, y, samples=3):
    """Print a human-readable preview of a few samples from a batch.

    For each previewed sample, the last 20 input characters and the target
    character are decoded through the module-level ``characters`` list and
    printed on one line.

    Args:
        X: one-hot inputs, indexed as ``X[sample, step, character]``.
        y: one-hot targets, indexed as ``y[sample, character]``.
        samples: maximum number of samples to print; it is capped at the
            number of samples actually present in ``X``.
    """
    # Cap at the batch size so an over-large `samples` cannot index past the end.
    for i in range(min(samples, len(X))):
        # Only the trailing 20 steps are ever printed, so only those are decoded.
        tail_indices = X[i, -20:].argmax(axis=1)
        sentence_tail = "".join(characters[ix] for ix in tail_indices)
        next_char = characters[y[i].argmax()]
        print("sample #%d: ...%s -> '%s'" % (i, sentence_tail, next_char))
def batch_generator(text, count):
    """Yield one-hot encoded ``(X, y)`` batches forever.

    Batch ``b`` (``0 <= b < count``) holds ``BATCH_SIZE`` samples. Sample ``j``
    of that batch is the ``SEQUENCE_LEN`` characters of ``text`` starting at
    offset ``BATCH_SIZE * b + j``, and its target is the single character that
    follows the window. After ``count`` batches the generator starts again
    from the beginning of ``text``.

    NOTE(review): the original indentation was lost; the target is taken to be
    set once per sample, after the window is filled - confirm.

    Args:
        text: string to draw samples from; it must contain at least
            ``count * BATCH_SIZE + SEQUENCE_LEN`` characters.
        count: number of distinct batches per pass; must be at least 1.

    Yields:
        ``X`` of shape ``(BATCH_SIZE, SEQUENCE_LEN, VOCABULARY_SIZE)`` and
        ``y`` of shape ``(BATCH_SIZE, VOCABULARY_SIZE)``, both holding 0/1.

    Raises:
        ValueError: if ``count`` is smaller than 1 (the endless loop would
            otherwise spin forever without yielding anything).
        KeyError: if a character that would be read is missing from
            ``characters_to_ix``; raised when the first batch is requested.
        IndexError: if ``text`` is too short for ``count`` batches.
    """
    if count < 1:
        raise ValueError("count must be at least 1, got %r" % (count,))

    # Encode every character the batches can touch exactly once, instead of
    # repeating the dictionary lookups on every epoch. The highest index read
    # is BATCH_SIZE * count - 1 + SEQUENCE_LEN.
    needed = count * BATCH_SIZE + SEQUENCE_LEN
    encoded = np.array(
        [characters_to_ix[ch] for ch in text[:needed]], dtype=np.intp
    )

    sample_ix = np.arange(BATCH_SIZE)
    rows = sample_ix[:, None]  # column vector: sample position in the batch
    steps = np.arange(SEQUENCE_LEN)[None, :]  # row vector: step in the window

    while True:  # keras wants an endless generator
        for batch_ix in range(count):
            batch_offset = BATCH_SIZE * batch_ix
            X = np.zeros((BATCH_SIZE, SEQUENCE_LEN, VOCABULARY_SIZE))
            y = np.zeros((BATCH_SIZE, VOCABULARY_SIZE))

            # rows + steps broadcasts to the text offset of every
            # (sample, step) pair relative to the start of the batch.
            X[rows, steps, encoded[batch_offset + rows + steps]] = 1
            # Target: the character immediately after each sample's window.
            y[sample_ix, encoded[batch_offset + SEQUENCE_LEN + sample_ix]] = 1

            yield X, y
# Shuffle the tweets, then use the first 12000 for training and the remainder
# for validation.
df_full = df["text"].sample(frac=1).reset_index(drop=True)
df_train = df_full[:12000]
df_test = df_full[12000:]

# Concatenate each split into one long string; a single backslash marks the
# end of every tweet.
full_train_text = "\\".join(df_train)
full_test_text = "\\".join(df_test)
train_len = len(full_train_text)
test_len = len(full_test_text)

# Number of full batches each text can supply. SEQUENCE_LEN characters are
# held back so the last window and its target stay inside the text.
train_batch_count = (train_len - SEQUENCE_LEN) // BATCH_SIZE
test_batch_count = (test_len - SEQUENCE_LEN) // BATCH_SIZE
print(train_batch_count)
print(test_batch_count)

# Fail early: with no batches available the generators would never yield and
# training would stall instead of reporting the real problem.
if train_batch_count < 1 or test_batch_count < 1:
    raise ValueError(
        "not enough text for one full batch (train batches: %d, "
        "validation batches: %d)" % (train_batch_count, test_batch_count)
    )

model = build_model(LAYER_COUNT, SEQUENCE_LEN, VOCABULARY_SIZE, DROPOUT)
filepath = "./drive/petrobot/models/petro_model"

# Callbacks: write the weights to `filepath`, and stop once val_loss has not
# improved for 5 epochs.
checkpoint = ModelCheckpoint(
    filepath,
    save_weights_only=True,
)
early_stopping = EarlyStopping(monitor='val_loss', patience=5)
callbacks_list = [checkpoint, early_stopping]

history = model.fit_generator(
    batch_generator(full_train_text, count=train_batch_count),
    # One epoch is one full pass over the training batches.
    steps_per_epoch=train_batch_count,
    epochs=EPOCHS,
    callbacks=callbacks_list,
    validation_data=batch_generator(full_test_text, count=test_batch_count),
    validation_steps=test_batch_count,
    initial_epoch=0,
    verbose=1,
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment