Keras training with auto stop
""" | |
Here's a high-level overview of the recommended architecture for text correction using a Seq2Seq model: | |
1. Encoder: | |
- Input: Incorrect text sequence | |
- Apply word embeddings to convert each word into a fixed-length vector representation | |
- Pass the embedded input sequence through one or more recurrent layers, such as LSTM or GRU, to capture the contextual information of the input sequence | |
- The final hidden state(s) of the encoder will serve as the initial state of the decoder | |
2. Decoder: | |
- Input: Corrected text sequence (teacher forcing) | |
- Apply word embeddings to convert each word into a fixed-length vector representation | |
- Pass the embedded input sequence through one or more recurrent layers, such as LSTM or GRU, to generate the corrected text | |
- Optionally, you can use attention mechanisms to help the model focus on relevant parts of the input during the decoding process | |
3. Training: | |
- During training, the model is fed with pairs of incorrect and correct text sequences | |
- The incorrect sequence serves as the input to the encoder, and the correct sequence (shifted by one time step) serves as the input to the decoder | |
- The decoder is trained to predict the correct sequence based on the incorrect sequence | |
4. Inference: | |
- During inference, given an incorrect text sequence, the encoder is used to encode the input sequence and obtain the initial hidden state(s) of the decoder | |
- The decoder is then iteratively fed with the predicted word from the previous time step until an end-of-sequence token or a maximum sequence length is reached | |
""" | |
import numpy as np
import pandas as pd
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
import pickle
batch_size = 128
epochs = 50
latent_dim = 256

# Define the EarlyStopping callback (it monitors training accuracy, since
# fit() below is not given any validation data)
early_stopping = EarlyStopping(
    monitor='accuracy', min_delta=0.001, patience=2, mode='auto', verbose=2, baseline=None)
# early_stopping = EarlyStopping(
#     monitor='loss', min_delta=0.001, patience=1, mode='auto', verbose=2, baseline=None)
# Load the data
# df = pd.read_csv('./data/training_state_county.csv')
# df = pd.read_csv('./data/training_state_county_city.csv')
df = pd.read_csv('./data/training_simple_sample.csv')

# Convert all text to lowercase
df['typo_address'] = df['typo_address'].str.lower()
df['address'] = df['address'].str.lower()

# Retrieve the corrected addresses and typo addresses
corrections = list(df['address'])
addresses = list(df['typo_address'])

# Split the data into training and test sets
train_addresses, test_addresses, train_corrections, test_corrections = train_test_split(
    addresses, corrections, test_size=0.1, random_state=47
)
# Tokenize the addresses and corrections; oov_token must be a string token
# used to stand in for out-of-vocabulary words
correct_tokenizer = Tokenizer(oov_token='<OOV>')
correct_tokenizer.fit_on_texts(train_corrections)

# Save the tokenizer
with open('correct_tokenizer.pickle', 'wb') as handle:
    pickle.dump(correct_tokenizer, handle, protocol=pickle.DEFAULT_PROTOCOL)

typo_tokenizer = Tokenizer(oov_token='<OOV>')
typo_tokenizer.fit_on_texts(train_addresses)

# Save the tokenizer
with open('typo_tokenizer.pickle', 'wb') as handle:
    pickle.dump(typo_tokenizer, handle, protocol=pickle.DEFAULT_PROTOCOL)
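# Illustrative sketch (not part of the original script): at inference time
# the pickled tokenizers can be reloaded like this, mirroring the dumps above:
#   with open('typo_tokenizer.pickle', 'rb') as handle:
#       typo_tokenizer = pickle.load(handle)
#   with open('correct_tokenizer.pickle', 'rb') as handle:
#       correct_tokenizer = pickle.load(handle)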
# Convert addresses and corrections to sequences
train_address_sequences = typo_tokenizer.texts_to_sequences(train_addresses)
train_correction_sequences = correct_tokenizer.texts_to_sequences(
    train_corrections)
test_address_sequences = typo_tokenizer.texts_to_sequences(test_addresses)
test_correction_sequences = correct_tokenizer.texts_to_sequences(
    test_corrections)

# Pad sequences to have the same length
max_length = max(
    max(map(len, train_address_sequences)),
    max(map(len, train_correction_sequences)),
    max(map(len, test_address_sequences)),
    max(map(len, test_correction_sequences))
)
padded_train_addresses = pad_sequences(
    train_address_sequences, maxlen=max_length, padding='post')
padded_train_corrections_input = pad_sequences(
    train_correction_sequences, maxlen=max_length, padding='post')
# Teacher forcing: the decoder target is the decoder input shifted left by one
# time step, so the decoder learns to predict the next word at each position
# (as described in the docstring above)
padded_train_corrections_output = np.zeros_like(padded_train_corrections_input)
padded_train_corrections_output[:, :-1] = padded_train_corrections_input[:, 1:]

padded_test_addresses = pad_sequences(
    test_address_sequences, maxlen=max_length, padding='post')
padded_test_corrections_input = pad_sequences(
    test_correction_sequences, maxlen=max_length, padding='post')
padded_test_corrections_output = np.zeros_like(padded_test_corrections_input)
padded_test_corrections_output[:, :-1] = padded_test_corrections_input[:, 1:]
# Note: separate tokenizers are already used for the typo and correct addresses
# Define the encoder model
encoder_inputs = Input(shape=(max_length,))
encoder_embedding = Embedding(input_dim=len(
    typo_tokenizer.word_index) + 1, output_dim=100)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]
# Define the decoder model. A single decoder is used for training; its layers
# are reused by the inference models sketched at the end of this file.
decoder_inputs = Input(shape=(max_length,))
decoder_embedding_layer = Embedding(
    input_dim=len(correct_tokenizer.word_index) + 1, output_dim=100)
decoder_embedding = decoder_embedding_layer(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(
    decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(
    len(correct_tokenizer.word_index) + 1, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)
# TODO: check these params
# Define the seq2seq model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# TODO: find the best optimizer & loss function for text correction problems
# Compile the model
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.fit(
    [padded_train_addresses, padded_train_corrections_input],
    # np.expand_dims(padded_train_corrections_output, axis=-1),
    padded_train_corrections_output,
    epochs=epochs,
    batch_size=batch_size,
    callbacks=[early_stopping],
)
# Evaluate the model on test data
test_loss, test_accuracy = model.evaluate(
    [padded_test_addresses, padded_test_corrections_input],
    padded_test_corrections_output
)

# Save the model
model.save('address_correction_model.h5')

print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")
model.summary()  # summary() prints the layer table itself and returns None
# # Access the layers in the model
# layers = model.layers
# # Iterate over the layers and print their properties
# for layer in layers:
#     print(layer.name)
#     print(layer.get_config())
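
# ---------------------------------------------------------------------------
# Inference sketch (step 4 of the docstring above). This is a minimal,
# illustrative greedy-decoding loop, not part of the original training script.
# It reuses the trained layers defined above (encoder_inputs/encoder_states,
# decoder_embedding_layer, decoder_lstm, decoder_dense). Because the training
# data has no explicit start-of-sequence token, decoding is seeded with the
# input's first word, which is a simplification; a production model would add
# dedicated start/end tokens to the vocabulary.

# Encoder inference model: typo sequence -> LSTM states
encoder_model = Model(encoder_inputs, encoder_states)

# Decoder inference model: one token + previous states -> next-token
# probabilities and updated states
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_step_inputs = Input(shape=(1,))
step_embedding = decoder_embedding_layer(decoder_step_inputs)
step_outputs, step_h, step_c = decoder_lstm(
    step_embedding, initial_state=decoder_states_inputs)
step_probs = decoder_dense(step_outputs)
decoder_model = Model(
    [decoder_step_inputs] + decoder_states_inputs,
    [step_probs, step_h, step_c])


def correct_address(typo_text):
    """Greedily decode a corrected address for a single typo address."""
    sequence = typo_tokenizer.texts_to_sequences([typo_text.lower()])
    padded = pad_sequences(sequence, maxlen=max_length, padding='post')
    states = encoder_model.predict(padded, verbose=0)
    # Seed the decoder with the input's first word as seen by the target
    # tokenizer (index 1 is the '<OOV>' token when no word matches)
    seed = correct_tokenizer.texts_to_sequences([typo_text.lower()])[0]
    target_token = np.array([[seed[0] if seed else 1]])
    decoded_words = []
    for _ in range(max_length):
        probs, h, c = decoder_model.predict(
            [target_token] + states, verbose=0)
        token_index = int(np.argmax(probs[0, -1, :]))
        if token_index == 0:  # padding index: treat as end of sequence
            break
        decoded_words.append(correct_tokenizer.index_word.get(token_index, ''))
        target_token = np.array([[token_index]])
        states = [h, c]
    return ' '.join(decoded_words)


# Hypothetical usage:
# print(correct_address('123 mian stret'))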