afhuertass/medum1_03.py

## medum1_03.py
# -*- coding: utf-8 -*-
"""petro_trainer.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1CMt3EPhTu2wXFWUfYCxoV0ZGjA7ElJyM
"""

# Colab-only setup: lines starting with "!" are IPython shell escapes, so this
# section only runs inside a notebook, not under a plain Python interpreter.
# Install the google-drive-ocamlfuse FUSE client from the alessandro-strada PPA.
# NOTE(review): "2>&1 > /dev/null" discards stdout only; stderr is duplicated
# onto the original stdout first, so error output still reaches the cell.
!apt-get install -y -qq software-properties-common python-software-properties module-init-tools
!add-apt-repository -y ppa:alessandro-strada/ppa 2>&1 > /dev/null
!apt-get update -qq 2>&1 > /dev/null
!apt-get -y install -qq google-drive-ocamlfuse fuse
# Authenticate the Colab user and fetch the application-default credentials
# whose client id/secret are handed to google-drive-ocamlfuse below.
from google.colab import auth
auth.authenticate_user()
from oauth2client.client import GoogleCredentials
creds = GoogleCredentials.get_application_default()
import getpass
# First headless run gets no stdin; only output lines containing "URL" are
# shown (presumably the authorization link to open in a browser).
# NOTE(review): the client secret (and the code below) are interpolated into
# shell command lines.
!google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret} < /dev/null 2>&1 | grep URL
# Prompt for the verification code without echoing it, then pipe it into a
# second headless run.
vcode = getpass.getpass()
!echo {vcode} | google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret}

# Mount point used by the "./drive/..." paths later in the script.
!mkdir -p drive
!google-drive-ocamlfuse drive

# Python packages imported below.
!pip install -q keras

!pip install -q unidecode


import pandas as pd
import numpy as np
import re
import unidecode

from keras.models import Sequential
from keras.layers import LSTM, Dropout, Activation, Dense , CuDNNLSTM

import string
from keras.callbacks import ModelCheckpoint, EarlyStopping

from tensorflow.python.client import device_lib
from keras import backend as K
# List the GPUs visible to the TensorFlow backend; per the Keras docs the
# CuDNNLSTM layer used by build_model only runs on a GPU.
K.tensorflow_backend._get_available_gpus()

# Character vocabulary: all of string.printable except the vertical tab
# ('\x0b') and the form feed ('\x0c').
characters = list(string.printable)
characters.remove('\x0b')
characters.remove('\x0c')

VOCABULARY_SIZE = len(characters)
# Maps every vocabulary character to its one-hot index.
characters_to_ix = {c: i for i, c in enumerate(characters)}

# "http://" or "https://" plus the rest of the whitespace-delimited token.
# The previous patterns were anchored with "^", so they only removed URLs that
# sat at the very start of a line and left every other URL in the text.
_URL_PATTERN = re.compile(r'https?://\S+')


def _clean_text(text):
    """Return ``text`` lower-cased, without URLs, restricted to the vocabulary."""
    text = _URL_PATTERN.sub('', text.lower())
    # Transliterate accented / non-ASCII characters to plain ASCII.
    text = unidecode.unidecode(text)
    # Drop anything still outside the vocabulary (e.g. '\x0b', '\x0c' or other
    # control characters) so lookups in characters_to_ix cannot raise KeyError.
    return "".join(c for c in text if c in characters_to_ix)


df = pd.read_csv("./drive/petrobot/datasets/petrogustavo.csv", encoding="utf-8")
df["text"] = df["text"].apply(_clean_text)

print("vocabulary len = %d" % VOCABULARY_SIZE)
print(characters)


# Training hyperparameters shared by the model builder, the batch generator
# and the training call below.
# NOTE(review): N_GPU is not referenced anywhere else in the visible source.
N_GPU = 1 # you can experiment with more GPUs, it gets interesting with a high SEQUENCE_LEN
# Number of characters in each input window fed to the network.
SEQUENCE_LEN = 100
# Number of (window, next character) samples per batch.
BATCH_SIZE = 512
# Upper bound on training epochs passed to fit_generator.
EPOCHS = 20
# Units in every CuDNNLSTM layer.
HIDDEN_LAYERS_DIM = 256
# Number of stacked CuDNNLSTM layers.
LAYER_COUNT = 4
# Rate of the Dropout layer added after each CuDNNLSTM layer.
DROPOUT = 0.2


def build_model(layer_count, sequence_len, vocab_size, dropout):
    """Build and compile a stacked CuDNNLSTM next-character classifier.

    The network is ``layer_count`` CuDNNLSTM layers of ``HIDDEN_LAYERS_DIM``
    units, each followed by a Dropout layer, and a softmax Dense layer over
    the vocabulary on top.

    Args:
        layer_count: Number of CuDNNLSTM layers to stack.
        sequence_len: Number of time steps (characters) per input sample.
        vocab_size: Size of the one-hot character encoding; used for both the
            input width and the output layer width.
        dropout: Rate passed to the Dropout layer after each recurrent layer.

    Returns:
        The Sequential model, compiled with categorical cross-entropy loss
        and the Adam optimizer.
    """
    hidden_layers_dim = HIDDEN_LAYERS_DIM
    model = Sequential()
    last_layer = layer_count - 1
    for i in range(layer_count):
        # Only the first layer of a Sequential model needs the input shape;
        # later layers infer theirs from the layer below.
        layer_kwargs = {}
        if i == 0:
            layer_kwargs["input_shape"] = (sequence_len, vocab_size)

        model.add(
            CuDNNLSTM(
                hidden_layers_dim,
                # Every layer but the last hands its full output sequence to
                # the next recurrent layer; the last one emits a single vector.
                return_sequences=(i != last_layer),
                **layer_kwargs
            )
        )
        model.add(Dropout(dropout))

    # Size the output from the argument (previously the module-level
    # VOCABULARY_SIZE), so the model stays consistent with its own input.
    model.add(Dense(vocab_size))
    model.add(Activation('softmax'))

    model.compile(loss='categorical_crossentropy', optimizer="adam")
    return model


def describe_batch(X, y, samples=3):
    """Print a readable preview of the first ``samples`` entries of a batch.

    Each input window in ``X`` is decoded back to text by taking the arg-max
    over the last axis at each of the ``SEQUENCE_LEN`` steps; the target row
    of ``y`` is decoded the same way. Only the last 20 decoded characters of
    the window are printed, followed by the target character.
    """
    for sample_no in range(samples):
        decoded = "".join(
            characters[X[sample_no, step, :].argmax()]
            for step in range(SEQUENCE_LEN)
        )
        target = characters[y[sample_no, :].argmax()]
        tail = decoded[-20:]

        print("sample #%d: ...%s -> '%s'" % (sample_no, tail, target))

def batch_generator(text, count, *, batch_size=None, sequence_len=None,
                    char_to_ix=None, vocab_size=None):
    """Yield one-hot ``(X, y)`` next-character batches, cycling forever.

    Sample ``k`` (numbered across batches) uses ``text[k:k + sequence_len]``
    as its input window and ``text[k + sequence_len]`` as its target. After
    ``count`` batches the generator starts again from the beginning of
    ``text``.

    Args:
        text: The corpus as a single string.
        count: Number of distinct batches per pass; must be at least 1.
        batch_size: Samples per batch. Defaults to ``BATCH_SIZE``.
        sequence_len: Characters per input window. Defaults to
            ``SEQUENCE_LEN``.
        char_to_ix: Mapping from character to one-hot index. Defaults to
            ``characters_to_ix``.
        vocab_size: Width of the one-hot encoding. Defaults to
            ``VOCABULARY_SIZE``.

    Yields:
        ``(X, y)`` where ``X`` is a float array of shape
        ``(batch_size, sequence_len, vocab_size)`` and ``y`` a float array of
        shape ``(batch_size, vocab_size)``.

    Raises:
        ValueError: If ``count`` is smaller than 1. The old loop would spin
            forever without ever yielding in that case.
        KeyError: If the consumed part of ``text`` holds a character missing
            from ``char_to_ix``.
        IndexError: If ``text`` is shorter than
            ``count * batch_size + sequence_len`` characters.

    All errors surface on the first ``next()`` call, as for any generator.
    """
    if batch_size is None:
        batch_size = BATCH_SIZE
    if sequence_len is None:
        sequence_len = SEQUENCE_LEN
    if char_to_ix is None:
        char_to_ix = characters_to_ix
    if vocab_size is None:
        vocab_size = VOCABULARY_SIZE

    if count < 1:
        raise ValueError(
            "count must be >= 1 (got %r); the text is too short for a single "
            "batch" % (count,)
        )

    # Encode once, and only the prefix the batches actually read, instead of
    # doing a dictionary lookup per character per sample on every pass.
    consumed = batch_size * count + sequence_len
    encoded = np.array(
        [char_to_ix[ch] for ch in text[:consumed]], dtype=np.int64
    )

    rows = np.arange(batch_size)
    steps = np.arange(sequence_len)

    while True:  # Keras expects the generator to loop over its data indefinitely
        for batch_ix in range(count):
            starts = batch_size * batch_ix + rows
            # windows[r, s] is the vocabulary index of character s of sample r.
            windows = encoded[starts[:, None] + steps[None, :]]

            X = np.zeros((batch_size, sequence_len, vocab_size))
            X[rows[:, None], steps[None, :], windows] = 1

            # The target is the character right after each window.
            y = np.zeros((batch_size, vocab_size))
            y[rows, encoded[starts + sequence_len]] = 1

            yield X, y

# Number of (shuffled) tweets used for training; the remainder is held out
# for validation.
TRAIN_TWEET_COUNT = 12000

# Shuffle the tweets, then split them into training and validation sets.
df_full = df["text"].sample(frac=1).reset_index(drop=True)

df_train = df_full[:TRAIN_TWEET_COUNT]
df_test = df_full[TRAIN_TWEET_COUNT:]

# A single backslash is used as the end-of-tweet marker in the joined text.
full_train_text = "\\".join(df_train)
full_test_text = "\\".join(df_test)

train_len = len(full_train_text)
test_len = len(full_test_text)

# Number of whole batches each text can provide: every sample needs
# SEQUENCE_LEN input characters plus one target character.
train_batch_count = (train_len - SEQUENCE_LEN) // BATCH_SIZE
test_batch_count = (test_len - SEQUENCE_LEN) // BATCH_SIZE

print(train_batch_count)
print(test_batch_count)

# With a non-positive batch count the generators have nothing to yield and
# training would stall, so fail early with a clear message instead.
if train_batch_count < 1 or test_batch_count < 1:
    raise ValueError(
        "not enough text for one batch: train_batch_count=%d, "
        "test_batch_count=%d" % (train_batch_count, test_batch_count)
    )

model = build_model(LAYER_COUNT, SEQUENCE_LEN, VOCABULARY_SIZE, DROPOUT)

filepath = "./drive/petrobot/models/petro_model"
# Callbacks: save the weights after every epoch, and stop once val_loss has
# not improved for 5 consecutive epochs.
checkpoint = ModelCheckpoint(
    filepath,
    save_weights_only=True
)
early_stopping = EarlyStopping(monitor='val_loss', patience=5)

callbacks_list = [checkpoint, early_stopping]

# max_queue_size is left at the Keras default; pass max_queue_size=1 to keep
# no more than one prepared batch queued in RAM.
history = model.fit_generator(
    batch_generator(full_train_text, count=train_batch_count),
    steps_per_epoch=train_batch_count,  # one epoch == one pass over the text
    epochs=EPOCHS,
    callbacks=callbacks_list,
    validation_data=batch_generator(full_test_text, count=test_batch_count),
    validation_steps=test_batch_count,
    initial_epoch=0,
    verbose=1
)
	# -*- coding: utf-8 -*-
	"""petro_trainer.ipynb

	Automatically generated by Colaboratory.

	Original file is located at
	https://colab.research.google.com/drive/1CMt3EPhTu2wXFWUfYCxoV0ZGjA7ElJyM
	"""

	!apt-get install -y -qq software-properties-common python-software-properties module-init-tools
	!add-apt-repository -y ppa:alessandro-strada/ppa 2>&1 > /dev/null
	!apt-get update -qq 2>&1 > /dev/null
	!apt-get -y install -qq google-drive-ocamlfuse fuse
	from google.colab import auth
	auth.authenticate_user()
	from oauth2client.client import GoogleCredentials
	creds = GoogleCredentials.get_application_default()
	import getpass
	!google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret} < /dev/null 2>&1 \| grep URL
	vcode = getpass.getpass()
	!echo {vcode} \| google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret}

	!mkdir -p drive
	!google-drive-ocamlfuse drive

	!pip install -q keras

	!pip install -q unidecode





	import pandas as pd
	import numpy as np
	import re
	import unidecode

	from keras.models import Sequential
	from keras.layers import LSTM, Dropout, Activation, Dense , CuDNNLSTM

	import string
	from keras.callbacks import ModelCheckpoint, EarlyStopping

	from tensorflow.python.client import device_lib
	from keras import backend as K
	# List the GPUs visible to the TensorFlow backend; per the Keras docs the
	# CuDNNLSTM layer used by build_model only runs on a GPU.
	K.tensorflow_backend._get_available_gpus()

	# Character vocabulary: all of string.printable except the vertical tab
	# ('\x0b') and the form feed ('\x0c').
	characters = list(string.printable)
	characters.remove('\x0b')
	characters.remove('\x0c')

	VOCABULARY_SIZE = len(characters)
	# Maps every vocabulary character to its one-hot index.
	characters_to_ix = {c: i for i, c in enumerate(characters)}

	# "http://" or "https://" plus the rest of the whitespace-delimited token.
	# The previous patterns were anchored with "^" and matched only a single
	# character followed by a line break after the scheme, so they left
	# practically every URL in the text.
	_URL_PATTERN = re.compile(r'https?://\S+')


	def _clean_text(text):
		"""Return ``text`` lower-cased, without URLs, restricted to the vocabulary."""
		text = _URL_PATTERN.sub('', text.lower())
		# Transliterate accented / non-ASCII characters to plain ASCII.
		text = unidecode.unidecode(text)
		# Drop anything still outside the vocabulary (e.g. '\x0b', '\x0c' or other
		# control characters) so lookups in characters_to_ix cannot raise KeyError.
		return "".join(c for c in text if c in characters_to_ix)


	df = pd.read_csv("./drive/petrobot/datasets/petrogustavo.csv", encoding="utf-8")
	df["text"] = df["text"].apply(_clean_text)

	print("vocabulary len = %d" % VOCABULARY_SIZE)
	print(characters)


	# Training hyperparameters shared by the model builder, the batch generator
	# and the training call below.
	# NOTE(review): N_GPU is not referenced anywhere else in the visible source.
	N_GPU = 1 # you can experiment with more GPUs, it gets interesting with a high SEQUENCE_LEN
	# Number of characters in each input window fed to the network.
	SEQUENCE_LEN = 100
	# Number of (window, next character) samples per batch.
	BATCH_SIZE = 512
	# Upper bound on training epochs passed to fit_generator.
	EPOCHS = 20
	# Units in every CuDNNLSTM layer.
	HIDDEN_LAYERS_DIM = 256
	# Number of stacked CuDNNLSTM layers.
	LAYER_COUNT = 4
	# Rate of the Dropout layer added after each CuDNNLSTM layer.
	DROPOUT = 0.2


	def build_model(layer_count, sequence_len, vocab_size, dropout):
		"""Build and compile a stacked CuDNNLSTM next-character classifier.

		The network is ``layer_count`` CuDNNLSTM layers of ``HIDDEN_LAYERS_DIM``
		units, each followed by a Dropout layer, and a softmax Dense layer over
		the vocabulary on top.

		Args:
			layer_count: Number of CuDNNLSTM layers to stack.
			sequence_len: Number of time steps (characters) per input sample.
			vocab_size: Size of the one-hot character encoding; used for both
				the input width and the output layer width.
			dropout: Rate passed to the Dropout layer after each recurrent layer.

		Returns:
			The Sequential model, compiled with categorical cross-entropy loss
			and the Adam optimizer.
		"""
		hidden_layers_dim = HIDDEN_LAYERS_DIM
		model = Sequential()
		last_layer = layer_count - 1
		for i in range(layer_count):
			# Only the first layer of a Sequential model needs the input shape;
			# later layers infer theirs from the layer below.
			layer_kwargs = {}
			if i == 0:
				layer_kwargs["input_shape"] = (sequence_len, vocab_size)

			model.add(
				CuDNNLSTM(
					hidden_layers_dim,
					# Every layer but the last hands its full output sequence to
					# the next recurrent layer; the last one emits a single vector.
					return_sequences=(i != last_layer),
					**layer_kwargs
				)
			)
			model.add(Dropout(dropout))

		# Size the output from the argument (previously the module-level
		# VOCABULARY_SIZE), so the model stays consistent with its own input.
		model.add(Dense(vocab_size))
		model.add(Activation('softmax'))

		model.compile(loss='categorical_crossentropy', optimizer="adam")
		return model


	def describe_batch(X, y, samples=3):
		"""Print a readable preview of the first ``samples`` entries of a batch.

		Each input window in ``X`` is decoded back to text by taking the arg-max
		over the last axis at each of the ``SEQUENCE_LEN`` steps; the target row
		of ``y`` is decoded the same way. Only the last 20 decoded characters of
		the window are printed, followed by the target character.
		"""
		for sample_no in range(samples):
			decoded = "".join(
				characters[X[sample_no, step, :].argmax()]
				for step in range(SEQUENCE_LEN)
			)
			target = characters[y[sample_no, :].argmax()]
			tail = decoded[-20:]

			print("sample #%d: ...%s -> '%s'" % (sample_no, tail, target))

	def batch_generator(text, count, *, batch_size=None, sequence_len=None,
			char_to_ix=None, vocab_size=None):
		"""Yield one-hot ``(X, y)`` next-character batches, cycling forever.

		Sample ``k`` (numbered across batches) uses ``text[k:k + sequence_len]``
		as its input window and ``text[k + sequence_len]`` as its target. After
		``count`` batches the generator starts again from the beginning of
		``text``.

		Args:
			text: The corpus as a single string.
			count: Number of distinct batches per pass; must be at least 1.
			batch_size: Samples per batch. Defaults to ``BATCH_SIZE``.
			sequence_len: Characters per input window. Defaults to
				``SEQUENCE_LEN``.
			char_to_ix: Mapping from character to one-hot index. Defaults to
				``characters_to_ix``.
			vocab_size: Width of the one-hot encoding. Defaults to
				``VOCABULARY_SIZE``.

		Yields:
			``(X, y)`` where ``X`` is a float array of shape
			``(batch_size, sequence_len, vocab_size)`` and ``y`` a float array
			of shape ``(batch_size, vocab_size)``.

		Raises:
			ValueError: If ``count`` is smaller than 1. A plain loop over
				``range(count)`` inside ``while True`` would spin forever
				without yielding in that case.
			KeyError: If the consumed part of ``text`` holds a character
				missing from ``char_to_ix``.
			IndexError: If ``text`` is shorter than
				``count * batch_size + sequence_len`` characters.

		All errors surface on the first ``next()`` call, as for any generator.
		"""
		if batch_size is None:
			batch_size = BATCH_SIZE
		if sequence_len is None:
			sequence_len = SEQUENCE_LEN
		if char_to_ix is None:
			char_to_ix = characters_to_ix
		if vocab_size is None:
			vocab_size = VOCABULARY_SIZE

		if count < 1:
			raise ValueError(
				"count must be >= 1 (got %r); the text is too short for a single "
				"batch" % (count,)
			)

		# Encode once, and only the prefix the batches actually read, instead of
		# doing a dictionary lookup per character per sample on every pass.
		consumed = batch_size * count + sequence_len
		encoded = np.array(
			[char_to_ix[ch] for ch in text[:consumed]], dtype=np.int64
		)

		rows = np.arange(batch_size)
		steps = np.arange(sequence_len)

		while True:  # Keras expects the generator to loop over its data indefinitely
			for batch_ix in range(count):
				starts = batch_size * batch_ix + rows
				# windows[r, s] is the vocabulary index of character s of sample r.
				windows = encoded[starts[:, None] + steps[None, :]]

				X = np.zeros((batch_size, sequence_len, vocab_size))
				X[rows[:, None], steps[None, :], windows] = 1

				# The target is the character right after each window.
				y = np.zeros((batch_size, vocab_size))
				y[rows, encoded[starts + sequence_len]] = 1

				yield X, y

	# Number of (shuffled) tweets used for training; the remainder is held out
	# for validation.
	TRAIN_TWEET_COUNT = 12000

	# Shuffle the tweets, then split them into training and validation sets.
	df_full = df["text"].sample(frac=1).reset_index(drop=True)

	df_train = df_full[:TRAIN_TWEET_COUNT]
	df_test = df_full[TRAIN_TWEET_COUNT:]

	# A single backslash is used as the end-of-tweet marker in the joined text.
	full_train_text = "\\".join(df_train)
	full_test_text = "\\".join(df_test)

	train_len = len(full_train_text)
	test_len = len(full_test_text)

	# Number of whole batches each text can provide: every sample needs
	# SEQUENCE_LEN input characters plus one target character.
	train_batch_count = (train_len - SEQUENCE_LEN) // BATCH_SIZE
	test_batch_count = (test_len - SEQUENCE_LEN) // BATCH_SIZE

	print(train_batch_count)
	print(test_batch_count)

	# With a non-positive batch count the generators have nothing to yield and
	# training would stall, so fail early with a clear message instead.
	if train_batch_count < 1 or test_batch_count < 1:
		raise ValueError(
			"not enough text for one batch: train_batch_count=%d, "
			"test_batch_count=%d" % (train_batch_count, test_batch_count)
		)

	model = build_model(LAYER_COUNT, SEQUENCE_LEN, VOCABULARY_SIZE, DROPOUT)

	filepath = "./drive/petrobot/models/petro_model"
	# Callbacks: save the weights after every epoch, and stop once val_loss has
	# not improved for 5 consecutive epochs.
	checkpoint = ModelCheckpoint(
		filepath,
		save_weights_only=True
	)
	early_stopping = EarlyStopping(monitor='val_loss', patience=5)

	callbacks_list = [checkpoint, early_stopping]

	# max_queue_size is left at the Keras default; pass max_queue_size=1 to keep
	# no more than one prepared batch queued in RAM.
	history = model.fit_generator(
		batch_generator(full_train_text, count=train_batch_count),
		steps_per_epoch=train_batch_count,  # one epoch == one pass over the text
		epochs=EPOCHS,
		callbacks=callbacks_list,
		validation_data=batch_generator(full_test_text, count=test_batch_count),
		validation_steps=test_batch_count,
		initial_epoch=0,
		verbose=1
	)