Skip to content

Instantly share code, notes, and snippets.

@afhuertass
Created July 17, 2018 03:24
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save afhuertass/fad97fa9f45de9ea2c3ec8c2e5b0ad43 to your computer and use it in GitHub Desktop.
Save afhuertass/fad97fa9f45de9ea2c3ec8c2e5b0ad43 to your computer and use it in GitHub Desktop.
petrobot trainer
# -*- coding: utf-8 -*-
"""petro_trainer.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1CMt3EPhTu2wXFWUfYCxoV0ZGjA7ElJyM
"""
# --- Colab environment setup -------------------------------------------------
# These cells use IPython "!" shell magics, so they only run inside a
# notebook (e.g. Colaboratory), not as a plain Python script.

# Install the system packages needed to add a PPA, then install
# google-drive-ocamlfuse and fuse from the alessandro-strada PPA.
!apt-get install -y -qq software-properties-common python-software-properties module-init-tools
!add-apt-repository -y ppa:alessandro-strada/ppa 2>&1 > /dev/null
!apt-get update -qq 2>&1 > /dev/null
!apt-get -y install -qq google-drive-ocamlfuse fuse
# Authenticate the Colab user and fetch the application-default credentials;
# their client id/secret are passed to google-drive-ocamlfuse below.
from google.colab import auth
auth.authenticate_user()
from oauth2client.client import GoogleCredentials
creds = GoogleCredentials.get_application_default()
import getpass
# First headless run: only the line containing "URL" is shown (presumably the
# authorization URL the user has to visit -- confirm against the tool's docs).
!google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret} < /dev/null 2>&1 | grep URL
# Prompt for the verification code without echoing it, then pipe it into a
# second headless run.
vcode = getpass.getpass()
!echo {vcode} | google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret}
# Create the ./drive directory and hand it to google-drive-ocamlfuse as the
# mount point; the dataset and model paths used later live under ./drive.
!mkdir -p drive
!google-drive-ocamlfuse drive
# Python dependencies used by the rest of the notebook.
!pip install -q keras
!pip install -q unidecode
import pandas as pd
import numpy as np
import re
import unidecode
from keras.models import Sequential
from keras.layers import LSTM, Dropout, Activation, Dense , CuDNNLSTM
import string
from keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.python.client import device_lib
from keras import backend as K
# Ask the Keras TensorFlow backend which GPUs it can see. In a notebook the
# returned value is shown as the cell output; the result is not stored.
K.tensorflow_backend._get_available_gpus()

# Load the dataset; only its "text" column is used below.
df = pd.read_csv("./drive/petrobot/datasets/petrogustavo.csv", encoding="utf-8")

# Matches an http:// or https:// URL anywhere in the text.
# The previous patterns were anchored with "^", so they only removed URLs that
# started a line (and dropped the rest of that line with them); URLs in the
# middle of a text were left in. The second pattern ("http?") was also
# redundant with the first ("https?").
URL_PATTERN = re.compile(r"https?://\S+")


def _normalize_text(text):
    """Lower-case ``text``, strip URLs and transliterate the result to ASCII.

    The order (lower-case, remove URLs, unidecode) matches the original
    pipeline.
    """
    without_urls = URL_PATTERN.sub("", text.lower())
    return unidecode.unidecode(without_urls)


df["text"] = df["text"].apply(_normalize_text)
# Character vocabulary: every character of string.printable, in its original
# order, except the vertical tab (\x0b) and the form feed (\x0c).
characters = [ch for ch in string.printable if ch not in ('\x0b', '\x0c')]
# Reverse lookup: character -> position in `characters`.
characters_to_ix = dict(zip(characters, range(len(characters))))
VOCABULARY_SIZE = len(characters)
print("vocabulary len = %d" % VOCABULARY_SIZE)
print(characters)
# --- Training hyperparameters -------------------------------------------------

# Data shape
SEQUENCE_LEN = 100   # characters per input window
BATCH_SIZE = 512     # samples per generated batch

# Network shape
LAYER_COUNT = 4          # number of stacked recurrent layers
HIDDEN_LAYERS_DIM = 256  # units in each recurrent layer
DROPOUT = 0.2            # rate handed to the Dropout layers

# Training schedule
EPOCHS = 20

# Number of GPUs; more than one may be worth trying with a high SEQUENCE_LEN.
N_GPU = 1
def build_model(layer_count, sequence_len, vocab_size, dropout,
                hidden_layers_dim=None):
    """Build and compile the stacked-LSTM next-character classifier.

    Args:
        layer_count: Number of CuDNNLSTM layers to stack.
        sequence_len: Number of time steps in each input sample.
        vocab_size: Size of the one-hot character vocabulary; used both as
            the input feature dimension and as the number of output classes.
        dropout: Rate passed to the Dropout layers.
        hidden_layers_dim: Units per recurrent layer. Defaults to the
            module-level ``HIDDEN_LAYERS_DIM`` when omitted.

    Returns:
        A compiled ``Sequential`` model (categorical cross-entropy, Adam)
        ending in a softmax over ``vocab_size`` classes.
    """
    if hidden_layers_dim is None:
        hidden_layers_dim = HIDDEN_LAYERS_DIM

    model = Sequential()
    for i in range(layer_count):
        is_last_layer = i == layer_count - 1
        model.add(
            CuDNNLSTM(
                hidden_layers_dim,
                # Every layer but the last must emit the full sequence so the
                # next recurrent layer has a time axis to consume.
                return_sequences=not is_last_layer,
                input_shape=(sequence_len, vocab_size),
            )
        )
        # NOTE(review): the source's indentation was lost; Dropout is assumed
        # to follow each recurrent layer -- confirm against the notebook.
        model.add(Dropout(dropout))

    # Size the output layer from the parameter rather than the global
    # VOCABULARY_SIZE so it always agrees with the declared input shape.
    model.add(Dense(vocab_size))
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy', optimizer="adam")
    return model
def describe_batch(X, y, samples=3):
    """Print a readable preview of the first ``samples`` entries of a batch.

    For each previewed entry the one-hot rows of ``X`` are decoded back to
    characters via ``characters`` (taking the argmax of every time step), and
    the last 20 decoded characters are printed together with the decoded
    target character from ``y``.
    """
    for sample_no in range(samples):
        decoded = "".join(
            characters[X[sample_no, step, :].argmax()]
            for step in range(SEQUENCE_LEN)
        )
        target = characters[y[sample_no, :].argmax()]
        print("sample #%d: ...%s -> '%s'" % (sample_no, decoded[-20:], target))
def batch_generator(text, count):
    """Yield one-hot encoded (X, y) training batches from ``text`` forever.

    Batch ``b`` covers the samples starting at offsets
    ``b * BATCH_SIZE .. b * BATCH_SIZE + BATCH_SIZE - 1``; consecutive samples
    are shifted by one character. Each sample is a window of ``SEQUENCE_LEN``
    characters and its target is the character right after the window.
    After ``count`` batches the generator starts over from the beginning.

    Args:
        text: The string to sample from. Every character used must be a key
            of ``characters_to_ix``.
        count: Number of distinct batches per pass over ``text``.

    Yields:
        ``(X, y)`` where ``X`` has shape
        ``(BATCH_SIZE, SEQUENCE_LEN, VOCABULARY_SIZE)`` and ``y`` has shape
        ``(BATCH_SIZE, VOCABULARY_SIZE)``, both one-hot encoded.

    Raises:
        ValueError: If ``count`` is not positive. Without this check the
            endless loop below would spin forever without yielding anything.
        KeyError: If ``text`` contains a character missing from
            ``characters_to_ix``.
        IndexError: If ``text`` is too short for the requested ``count``.
    """
    if count <= 0:
        raise ValueError(
            "batch_generator needs count > 0 (got %d); the text is too short "
            "for a single batch" % count
        )

    while True:  # endless: the training loop pulls as many batches as it needs
        for batch_ix in range(count):
            X = np.zeros((BATCH_SIZE, SEQUENCE_LEN, VOCABULARY_SIZE))
            y = np.zeros((BATCH_SIZE, VOCABULARY_SIZE))
            batch_offset = BATCH_SIZE * batch_ix
            for sample_ix in range(BATCH_SIZE):
                sample_start = batch_offset + sample_ix
                for s in range(SEQUENCE_LEN):
                    X[sample_ix, s, characters_to_ix[text[sample_start + s]]] = 1
                # Target = the character right after the window. Indexed
                # explicitly instead of relying on the leaked loop variable.
                target_char = text[sample_start + SEQUENCE_LEN]
                y[sample_ix, characters_to_ix[target_char]] = 1
            yield X, y
# Shuffle all texts, then split: the first 12000 rows are used for training
# and everything after them for validation.
df_full = df["text"].sample(frac=1).reset_index(drop=True)
df_train, df_test = df_full[:12000], df_full[12000:]

# Concatenate each split into one long string, using a single backslash as
# the end-of-tweet marker between consecutive texts.
full_train_text = "\\".join(df_train)
full_test_text = "\\".join(df_test)
train_len, test_len = len(full_train_text), len(full_test_text)

# Number of whole batches each text can supply.
train_batch_count = (train_len - SEQUENCE_LEN) // BATCH_SIZE
test_batch_count = (test_len - SEQUENCE_LEN) // BATCH_SIZE
print(train_batch_count)
print( test_batch_count )
# Build the network from the hyperparameters defined above.
model = build_model(LAYER_COUNT, SEQUENCE_LEN, VOCABULARY_SIZE, DROPOUT)

# Where the checkpoint callback writes the model weights.
filepath = "./drive/petrobot/models/petro_model"

# Callbacks: save the weights (weights only, not the full model) to
# `filepath`, and stop training once val_loss has not improved for 5 epochs.
checkpoint = ModelCheckpoint(filepath, save_weights_only=True)
early_stopping = EarlyStopping(monitor='val_loss', patience=5)
callbacks_list = [checkpoint, early_stopping]

# Train from the endless batch generators. One epoch is one full pass over
# the training batches, and validation uses one full pass over the test
# batches. The max_queue_size=1 override stays disabled, so the generator
# queue keeps the library's default size.
history = model.fit_generator(
    batch_generator(full_train_text, count=train_batch_count),
    steps_per_epoch=train_batch_count,
    epochs=EPOCHS,
    verbose=1,
    callbacks=callbacks_list,
    validation_data=batch_generator(full_test_text, count=test_batch_count),
    validation_steps=test_batch_count,
    initial_epoch=0,
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment