Skip to content

Instantly share code, notes, and snippets.

@rahulbtchrya
Last active August 4, 2020 12:59
Show Gist options
  • Save rahulbtchrya/700f1b6910076466b0183f9bdfe0cdf6 to your computer and use it in GitHub Desktop.
Save rahulbtchrya/700f1b6910076466b0183f9bdfe0cdf6 to your computer and use it in GitHub Desktop.
Text classification using LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow_datasets as tfds
# Fit a word-level tokenizer on `tweets_list`. `num_words` caps the vocabulary
# used when encoding, and unknown words are encoded as the "<OOV>" token.
tokenizer = Tokenizer(oov_token="<OOV>", num_words=vocab_size)
tokenizer.fit_on_texts(tweets_list)

# Encode the training tweets as integer sequences, then pad/truncate at the
# end of each sequence so every row has exactly `max_length` entries.
sentences_train = pad_sequences(
    tokenizer.texts_to_sequences(tweets_train),
    maxlen=max_length,
    padding='post',
    truncating='post',
)
# Build a subword tokenizer from the tweets in `tweets_list`.
# NOTE(review): newer tensorflow_datasets releases expose this class as
# `tfds.deprecated.text.SubwordTextEncoder` — confirm against the installed
# version before changing the path.
tokenizer_sub = tfds.features.text.SubwordTextEncoder.build_from_corpus(
    tweets_list, vocab_size, max_subword_length=5
)

# Replace each raw tweet with its encoded subword ids. This overwrites
# `tweets_list` in place, so the raw text is no longer available afterwards.
for i, tweet in enumerate(tweets_list):
    tweets_list[i] = tokenizer_sub.encode(tweet)

# Pad/truncate at the end of each sequence so every row has exactly
# `max_length` entries.
sentences_sub = pad_sequences(
    tweets_list, maxlen=max_length, padding='post', truncating='post'
)
# Define the model: embedding -> two stacked bidirectional LSTMs -> small
# dense layer -> single sigmoid unit for binary classification.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    # return_sequences=True so the next LSTM receives the full sequence
    # rather than only the final state.
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(embedding_dim, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(embedding_dim)),
    tf.keras.layers.Dense(6, activation='relu'),
    # `output_bias` is defined elsewhere; presumably it encodes the class
    # prior for an imbalanced dataset — confirm.
    tf.keras.layers.Dense(1, activation='sigmoid', bias_initializer=output_bias),
])

# Use `learning_rate` rather than the deprecated `lr` alias, which newer
# Keras optimizers no longer accept.
# NOTE(review): 1e-6 is a very small starting rate; presumably a scheduler
# callback raises it during training — confirm.
model.compile(
    loss='binary_crossentropy',
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-6),
    metrics=METRICS,
)
# Train the model.
# NOTE(review): `sentences_train_sub` / `sentences_test_sub` are not defined
# in this snippet; presumably they are train/test splits of the padded
# subword sequences — confirm.
history = model.fit(
    sentences_train_sub,
    label_train,
    epochs=100,
    batch_size=BATCH_SIZE,
    callbacks=[lr_schedule],
    class_weight=class_weight,
)

# Get the sigmoid outputs for the test set, then threshold them at 0.5 to
# obtain hard class labels for the F1 score.
preds = model.predict(x=sentences_test_sub)
pred_labels = [1 if pred > 0.5 else 0 for pred in preds]
print("F1 score= ", f1_score(label_test, pred_labels))

# ROC AUC must be computed from the continuous scores, not the thresholded
# labels; hard 0/1 labels collapse the ROC curve to a single operating point.
print("ROC AUC score = ", roc_auc_score(label_test, preds.ravel()))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment