@Aqua-4
Created July 22, 2023 06:49
Keras training with auto stop
"""
Here's a high-level overview of the recommended architecture for text correction using a Seq2Seq model:
1. Encoder:
- Input: Incorrect text sequence
- Apply word embeddings to convert each word into a fixed-length vector representation
- Pass the embedded input sequence through one or more recurrent layers, such as LSTM or GRU, to capture the contextual information of the input sequence
- The final hidden state(s) of the encoder will serve as the initial state of the decoder
2. Decoder:
- Input: Corrected text sequence (teacher forcing)
- Apply word embeddings to convert each word into a fixed-length vector representation
- Pass the embedded input sequence through one or more recurrent layers, such as LSTM or GRU, to generate the corrected text
- Optionally, you can use attention mechanisms to help the model focus on relevant parts of the input during the decoding process
3. Training:
- During training, the model is fed with pairs of incorrect and correct text sequences
- The incorrect sequence serves as the input to the encoder, and the correct sequence (shifted by one time step) serves as the input to the decoder
- The decoder is trained to predict the correct sequence based on the incorrect sequence
4. Inference:
- During inference, given an incorrect text sequence, the encoder is used to encode the input sequence and obtain the initial hidden state(s) of the decoder
- The decoder is then iteratively fed with the predicted word from the previous time step until an end-of-sequence token or a maximum sequence length is reached
"""
import numpy as np
import pandas as pd
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
import pickle
batch_size = 128
epochs = 50
latent_dim = 256
# Define the EarlyStopping callback
early_stopping = EarlyStopping(
    monitor='accuracy', min_delta=0.001, patience=2, mode='auto',
    verbose=2, baseline=None)
# early_stopping = EarlyStopping(
#     monitor='loss', min_delta=0.001, patience=1, mode='auto',
#     verbose=2, baseline=None)
# Load the data
# df = pd.read_csv('./data/training_state_county.csv')
# df = pd.read_csv('./data/training_state_county_city.csv')
df = pd.read_csv('./data/training_simple_sample.csv')
# Convert all text to lowercase
df['typo_address'] = df['typo_address'].str.lower()
df['address'] = df['address'].str.lower()
# Retrieve the corrected addresses and typo addresses
corrections = list(df['address'])
addresses = list(df['typo_address'])
# Split the data into training and test sets
train_addresses, test_addresses, train_corrections, test_corrections = train_test_split(
    addresses, corrections, test_size=0.1, random_state=47
)
# Tokenize the addresses and corrections
correct_tokenizer = Tokenizer(oov_token='<OOV>')
correct_tokenizer.fit_on_texts(train_corrections)
# Save the tokenizer
with open('correct_tokenizer.pickle', 'wb') as handle:
    pickle.dump(correct_tokenizer, handle, protocol=pickle.DEFAULT_PROTOCOL)
typo_tokenizer = Tokenizer(oov_token='<OOV>')
typo_tokenizer.fit_on_texts(train_addresses)
# Save the tokenizer
with open('typo_tokenizer.pickle', 'wb') as handle:
    pickle.dump(typo_tokenizer, handle, protocol=pickle.DEFAULT_PROTOCOL)
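# To reuse the saved tokenizers in a separate inference script (a minimal
# sketch; the pickle filenames are the ones written above):
# with open('typo_tokenizer.pickle', 'rb') as handle:
#     typo_tokenizer = pickle.load(handle)
# with open('correct_tokenizer.pickle', 'rb') as handle:
#     correct_tokenizer = pickle.load(handle)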
# Convert addresses and corrections to sequences
train_address_sequences = typo_tokenizer.texts_to_sequences(train_addresses)
train_correction_sequences = correct_tokenizer.texts_to_sequences(
    train_corrections)
test_address_sequences = typo_tokenizer.texts_to_sequences(test_addresses)
test_correction_sequences = correct_tokenizer.texts_to_sequences(
    test_corrections)
# Pad sequences to have the same length
max_length = max(
    max(map(len, train_address_sequences)),
    max(map(len, train_correction_sequences)),
    max(map(len, test_address_sequences)),
    max(map(len, test_correction_sequences))
)
padded_train_addresses = pad_sequences(
    train_address_sequences, maxlen=max_length, padding='post')
padded_train_corrections_input = pad_sequences(
    train_correction_sequences, maxlen=max_length, padding='post')
# Decoder targets for training (same token sequences as the decoder inputs)
padded_train_corrections_output = pad_sequences(
    train_correction_sequences, maxlen=max_length, padding='post')
padded_test_addresses = pad_sequences(
    test_address_sequences, maxlen=max_length, padding='post')
padded_test_corrections_input = pad_sequences(
    test_correction_sequences, maxlen=max_length, padding='post')
padded_test_corrections_output = pad_sequences(
    test_correction_sequences, maxlen=max_length, padding='post')
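# The docstring above calls for teacher forcing with the target shifted by one
# time step relative to the decoder input, while the code here uses the same
# sequences for both. A hedged sketch of the shifted variant, assuming
# 'startseq'/'endseq' markers are added to the correction texts and the
# correct_tokenizer is re-fit on the marked texts:
# decoder_input_texts = ['startseq ' + c for c in train_corrections]
# decoder_target_texts = [c + ' endseq' for c in train_corrections]
# decoder_input_seqs = correct_tokenizer.texts_to_sequences(decoder_input_texts)
# decoder_target_seqs = correct_tokenizer.texts_to_sequences(decoder_target_texts)
# padded_train_corrections_input = pad_sequences(
#     decoder_input_seqs, maxlen=max_length, padding='post')
# padded_train_corrections_output = pad_sequences(
#     decoder_target_seqs, maxlen=max_length, padding='post')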
# TODO: use separate tokenizers for the typo & correct addresses
# Define the encoder model
encoder_inputs = Input(shape=(max_length,))
encoder_embedding = Embedding(
    input_dim=len(typo_tokenizer.word_index) + 1,
    output_dim=100)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]
# TODO: why do I have 2 decoders
# Define the decoder model
decoder_inputs = Input(shape=(max_length,))
decoder_embedding_layer = Embedding(
    input_dim=len(correct_tokenizer.word_index) + 1, output_dim=100)
decoder_embedding = decoder_embedding_layer(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(
    decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(
    len(correct_tokenizer.word_index) + 1, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)
# TODO: check these params
# Define the seq2seq model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
# TODO: find the best optimizer & loss function for text correction problems
# Compile the model
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy', metrics=['accuracy'])
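# A hedged alternative (not in the original gist): with post-padded targets,
# the loss/accuracy above also count padded timesteps. One way to mask them is
# to build the embeddings with mask_zero=True, e.g.:
# encoder_embedding = Embedding(
#     input_dim=len(typo_tokenizer.word_index) + 1, output_dim=100,
#     mask_zero=True)(encoder_inputs)
# (and likewise for the decoder embedding), so Keras ignores index-0 padding
# downstream; treat this as an assumption to validate, not a required change.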
model.fit(
    [padded_train_addresses, padded_train_corrections_input],
    # np.expand_dims(padded_train_corrections_output, axis=-1),
    padded_train_corrections_output,
    epochs=epochs,
    batch_size=batch_size,
    callbacks=[early_stopping],
)
# Evaluate the model on test data
test_loss, test_accuracy = model.evaluate(
    [padded_test_addresses, padded_test_corrections_input],
    padded_test_corrections_output
)
# Save the model
model.save('address_correction_model.h5')
print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")
# model.summary() prints the summary itself (and returns None)
print('Summary:')
model.summary()
# # Access the layers in the model
# layers = model.layers
# # Iterate over the layers and print their properties
# for layer in layers:
#     print(layer.name)
#     print(layer.get_config())
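# --- Inference sketch (step 4 of the docstring) ---
# A minimal greedy-decoding sketch, not part of the original gist. It assumes
# 'startseq'/'endseq' markers were added to the correction texts during
# preprocessing (see the commented variant above); without them the decoder is
# seeded with the OOV index. The names encoder_model, decoder_model and
# correct_address are illustrative.
encoder_model = Model(encoder_inputs, encoder_states)

decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_step_input = Input(shape=(1,))
decoder_step_embedding = decoder_embedding_layer(decoder_step_input)
decoder_step_outputs, step_state_h, step_state_c = decoder_lstm(
    decoder_step_embedding, initial_state=decoder_states_inputs)
decoder_step_outputs = decoder_dense(decoder_step_outputs)
decoder_model = Model(
    [decoder_step_input] + decoder_states_inputs,
    [decoder_step_outputs, step_state_h, step_state_c])


def correct_address(typo_text):
    """Greedily decode a corrected address for a single typo address."""
    seq = typo_tokenizer.texts_to_sequences([typo_text.lower()])
    seq = pad_sequences(seq, maxlen=max_length, padding='post')
    states = encoder_model.predict(seq, verbose=0)
    # Seed the decoder with the (assumed) 'startseq' index; fall back to the
    # OOV index if the marker was never added to the vocabulary
    target = np.array([[correct_tokenizer.word_index.get('startseq', 1)]])
    decoded_words = []
    for _ in range(max_length):
        output_tokens, h, c = decoder_model.predict(
            [target] + states, verbose=0)
        token_index = int(np.argmax(output_tokens[0, -1, :]))
        word = correct_tokenizer.index_word.get(token_index, '')
        if word == 'endseq' or token_index == 0:
            break
        decoded_words.append(word)
        target = np.array([[token_index]])
        states = [h, c]
    return ' '.join(decoded_words)


# Example usage (illustrative input):
# print(correct_address('123 main stret, sprngfield'))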