Last active
August 4, 2020 12:59
-
-
Save rahulbtchrya/700f1b6910076466b0183f9bdfe0cdf6 to your computer and use it in GitHub Desktop.
Text classification using LSTM
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from tensorflow.keras.preprocessing.text import Tokenizer | |
from tensorflow.keras.preprocessing.sequence import pad_sequences | |
import tensorflow_datasets as tfds | |
# Fit a word-level tokenizer on our list of tweets. `num_words` caps the
# vocabulary by word frequency; words outside it map to the "<OOV>" token.
# (`vocab_size` and `tweets_list` are defined outside this snippet.)
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(tweets_list)

# Convert the training sentences to integer sequences, then pad / truncate
# each one at its end so that every row has exactly `max_length` entries.
sentences_train = pad_sequences(
    tokenizer.texts_to_sequences(tweets_train),
    maxlen=max_length,
    padding='post',
    truncating='post',
)
# Build a subword tokenizer from the tweet corpus, targeting roughly
# `vocab_size` subwords of at most 5 characters each.
# NOTE(review): newer TensorFlow Datasets releases expose this class as
# `tfds.deprecated.text.SubwordTextEncoder` -- confirm against the installed
# version before upgrading.
tokenizer_sub = tfds.features.text.SubwordTextEncoder.build_from_corpus(
    tweets_list, vocab_size, max_subword_length=5
)

# Replace each tweet, in place, with its list of subword ids.
# After this loop `tweets_list` no longer holds the raw text.
for i, tweet in enumerate(tweets_list):
    tweets_list[i] = tokenizer_sub.encode(tweet)

# Pad / truncate every encoded tweet at its end to exactly `max_length` ids.
sentences_sub = pad_sequences(
    tweets_list, maxlen=max_length, padding='post', truncating='post'
)
# Define the model: an embedding layer feeding two stacked bidirectional
# LSTMs, followed by a small dense head with a single sigmoid output.
# `vocab_size`, `embedding_dim`, `max_length`, `output_bias` and `METRICS`
# are defined outside this snippet.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    # return_sequences=True so the second LSTM receives one vector per
    # timestep rather than only the final state.
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(embedding_dim, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(embedding_dim)),
    tf.keras.layers.Dense(6, activation='relu'),
    # Single sigmoid unit: the output is a score in [0, 1] for the positive
    # class, matching the binary cross-entropy loss below.
    tf.keras.layers.Dense(1, activation='sigmoid', bias_initializer=output_bias),
])

# `learning_rate` is the supported keyword; the older `lr` alias is
# deprecated and rejected by newer Keras optimizers.
# NOTE(review): 1e-6 is a very small starting rate -- presumably the
# `lr_schedule` callback passed to fit() adjusts it during training; confirm.
model.compile(
    loss='binary_crossentropy',
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-6),
    metrics=METRICS,
)
# Train the model on the subword-encoded training sentences.
# `sentences_train_sub`, `label_train`, `BATCH_SIZE`, `lr_schedule` and
# `class_weight` are defined outside this snippet.
history = model.fit(
    sentences_train_sub,
    label_train,
    epochs=100,
    batch_size=BATCH_SIZE,
    callbacks=[lr_schedule],
    class_weight=class_weight,
)
# Get the predicted positive-class scores for the test sentences.
# `sentences_test_sub`, `label_test`, `f1_score` and `roc_auc_score` are
# defined / imported outside this snippet.
preds = model.predict(x=sentences_test_sub)

# Threshold the sigmoid scores at 0.5 to obtain hard 0/1 labels for F1.
pred_labels = [1 if pred > 0.5 else 0 for pred in preds]
print("F1 score= ", f1_score(label_test, pred_labels))

# ROC AUC is a ranking metric and must be computed from the continuous
# scores; feeding it thresholded labels reduces the curve to a single
# operating point. ravel() flattens the (n, 1) prediction column to 1-D.
print("ROC AUC score = ", roc_auc_score(label_test, preds.ravel()))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment