Keras training with auto stop
""" | |
Here's a high-level overview of the recommended architecture for text correction using a Seq2Seq model: | |
1. Encoder: | |
- Input: Incorrect text sequence | |
- Apply word embeddings to convert each word into a fixed-length vector representation | |
- Pass the embedded input sequence through one or more recurrent layers, such as LSTM or GRU, to capture the contextual information of the input sequence | |
- The final hidden state(s) of the encoder will serve as the initial state of the decoder | |
2. Decoder: | |
- Input: Corrected text sequence (teacher forcing) | |
- Apply word embeddings to convert each word into a fixed-length vector representation | |
- Pass the embedded input sequence through one or more recurrent layers, such as LSTM or GRU, to generate the corrected text | |
- Optionally, you can use attention mechanisms to help the model focus on relevant parts of the input during the decoding process | |
3. Training: | |
- During training, the model is fed with pairs of incorrect and correct text sequences | |
- The incorrect sequence serves as the input to the encoder, and the correct sequence (shifted by one time step) serves as the input to the decoder | |
- The decoder is trained to predict the correct sequence based on the incorrect sequence | |
4. Inference: | |
- During inference, given an incorrect text sequence, the encoder is used to encode the input sequence and obtain the initial hidden state(s) of the decoder | |
- The decoder is then iteratively fed with the predicted word from the previous time step until an end-of-sequence token or a maximum sequence length is reached | |
""" | |
import numpy as np
import pandas as pd
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
import pickle
batch_size = 128
epochs = 50
latent_dim = 256

# Define the EarlyStopping callback (it monitors training accuracy, since
# fit() below is not given any validation data)
early_stopping = EarlyStopping(
    monitor='accuracy', min_delta=0.001, patience=2, mode='auto', verbose=2, baseline=None)
# early_stopping = EarlyStopping(
#     monitor='loss', min_delta=0.001, patience=1, mode='auto', verbose=2, baseline=None)
# Load the data
# df = pd.read_csv('./data/training_state_county.csv')
# df = pd.read_csv('./data/training_state_county_city.csv')
df = pd.read_csv('./data/training_simple_sample.csv')

# Convert all text to lowercase
df['typo_address'] = df['typo_address'].str.lower()
df['address'] = df['address'].str.lower()

# Retrieve the corrected addresses and typo addresses
corrections = list(df['address'])
addresses = list(df['typo_address'])

# Split the data into training and test sets
train_addresses, test_addresses, train_corrections, test_corrections = train_test_split(
    addresses, corrections, test_size=0.1, random_state=47
)
# Tokenize the addresses and corrections; oov_token must be a string token
# used to stand in for out-of-vocabulary words
correct_tokenizer = Tokenizer(oov_token='<OOV>')
correct_tokenizer.fit_on_texts(train_corrections)

# Save the tokenizer
with open('correct_tokenizer.pickle', 'wb') as handle:
    pickle.dump(correct_tokenizer, handle, protocol=pickle.DEFAULT_PROTOCOL)

typo_tokenizer = Tokenizer(oov_token='<OOV>')
typo_tokenizer.fit_on_texts(train_addresses)

# Save the tokenizer
with open('typo_tokenizer.pickle', 'wb') as handle:
    pickle.dump(typo_tokenizer, handle, protocol=pickle.DEFAULT_PROTOCOL)
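# Illustrative sketch (not part of the original script): at inference time
# the pickled tokenizers can be reloaded like this, mirroring the dumps above:
#   with open('typo_tokenizer.pickle', 'rb') as handle:
#       typo_tokenizer = pickle.load(handle)
#   with open('correct_tokenizer.pickle', 'rb') as handle:
#       correct_tokenizer = pickle.load(handle)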
# Convert addresses and corrections to sequences
train_address_sequences = typo_tokenizer.texts_to_sequences(train_addresses)
train_correction_sequences = correct_tokenizer.texts_to_sequences(
    train_corrections)
test_address_sequences = typo_tokenizer.texts_to_sequences(test_addresses)
test_correction_sequences = correct_tokenizer.texts_to_sequences(
    test_corrections)

# Pad sequences to have the same length
max_length = max(
    max(map(len, train_address_sequences)),
    max(map(len, train_correction_sequences)),
    max(map(len, test_address_sequences)),
    max(map(len, test_correction_sequences))
)
padded_train_addresses = pad_sequences(
    train_address_sequences, maxlen=max_length, padding='post')
padded_train_corrections_input = pad_sequences(
    train_correction_sequences, maxlen=max_length, padding='post')
# Teacher forcing: the decoder target is the decoder input shifted left by one
# time step, so the decoder learns to predict the next word at each position
# (as described in the docstring above)
padded_train_corrections_output = np.zeros_like(padded_train_corrections_input)
padded_train_corrections_output[:, :-1] = padded_train_corrections_input[:, 1:]

padded_test_addresses = pad_sequences(
    test_address_sequences, maxlen=max_length, padding='post')
padded_test_corrections_input = pad_sequences(
    test_correction_sequences, maxlen=max_length, padding='post')
padded_test_corrections_output = np.zeros_like(padded_test_corrections_input)
padded_test_corrections_output[:, :-1] = padded_test_corrections_input[:, 1:]
# Note: separate tokenizers are already used for the typo and correct addresses
# Define the encoder model
encoder_inputs = Input(shape=(max_length,))
encoder_embedding = Embedding(input_dim=len(
    typo_tokenizer.word_index) + 1, output_dim=100)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]
# Define the decoder model. A single decoder is used for training; its layers
# are reused by the inference models sketched at the end of this file.
decoder_inputs = Input(shape=(max_length,))
decoder_embedding_layer = Embedding(
    input_dim=len(correct_tokenizer.word_index) + 1, output_dim=100)
decoder_embedding = decoder_embedding_layer(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(
    decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(
    len(correct_tokenizer.word_index) + 1, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)
# TODO: check these params
# Define the seq2seq model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# TODO: find the best optimizer & loss function for text correction problems
# Compile the model
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.fit(
    [padded_train_addresses, padded_train_corrections_input],
    # np.expand_dims(padded_train_corrections_output, axis=-1),
    padded_train_corrections_output,
    epochs=epochs,
    batch_size=batch_size,
    callbacks=[early_stopping],
)
# Evaluate the model on test data
test_loss, test_accuracy = model.evaluate(
    [padded_test_addresses, padded_test_corrections_input],
    padded_test_corrections_output
)

# Save the model
model.save('address_correction_model.h5')

print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")
model.summary()  # summary() prints the layer table itself and returns None
# # Access the layers in the model
# layers = model.layers
# # Iterate over the layers and print their properties
# for layer in layers:
#     print(layer.name)
#     print(layer.get_config())
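
# ---------------------------------------------------------------------------
# Inference sketch (step 4 of the docstring above). This is a minimal,
# illustrative greedy-decoding loop, not part of the original training script.
# It reuses the trained layers defined above (encoder_inputs/encoder_states,
# decoder_embedding_layer, decoder_lstm, decoder_dense). Because the training
# data has no explicit start-of-sequence token, decoding is seeded with the
# input's first word, which is a simplification; a production model would add
# dedicated start/end tokens to the vocabulary.

# Encoder inference model: typo sequence -> LSTM states
encoder_model = Model(encoder_inputs, encoder_states)

# Decoder inference model: one token + previous states -> next-token
# probabilities and updated states
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_step_inputs = Input(shape=(1,))
step_embedding = decoder_embedding_layer(decoder_step_inputs)
step_outputs, step_h, step_c = decoder_lstm(
    step_embedding, initial_state=decoder_states_inputs)
step_probs = decoder_dense(step_outputs)
decoder_model = Model(
    [decoder_step_inputs] + decoder_states_inputs,
    [step_probs, step_h, step_c])


def correct_address(typo_text):
    """Greedily decode a corrected address for a single typo address."""
    sequence = typo_tokenizer.texts_to_sequences([typo_text.lower()])
    padded = pad_sequences(sequence, maxlen=max_length, padding='post')
    states = encoder_model.predict(padded, verbose=0)
    # Seed the decoder with the input's first word as seen by the target
    # tokenizer (index 1 is the '<OOV>' token when no word matches)
    seed = correct_tokenizer.texts_to_sequences([typo_text.lower()])[0]
    target_token = np.array([[seed[0] if seed else 1]])
    decoded_words = []
    for _ in range(max_length):
        probs, h, c = decoder_model.predict(
            [target_token] + states, verbose=0)
        token_index = int(np.argmax(probs[0, -1, :]))
        if token_index == 0:  # padding index: treat as end of sequence
            break
        decoded_words.append(correct_tokenizer.index_word.get(token_index, ''))
        target_token = np.array([[token_index]])
        states = [h, c]
    return ' '.join(decoded_words)


# Hypothetical usage:
# print(correct_address('123 mian stret'))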