# rahulbtchrya/TwitterDNN.py

## TwitterDNN.py
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow_datasets as tfds

# Fit a word-level tokenizer on the tweet texts. The vocabulary is capped via
# `num_words=vocab_size`, and out-of-vocabulary words map to "<OOV>".
tokenizer = Tokenizer(oov_token="<OOV>", num_words=vocab_size)
tokenizer.fit_on_texts(tweets_list)
# Convert the training tweets to integer sequences, then pad/truncate each one
# at the end so that every sequence has `max_length` entries.
sentences_train = pad_sequences(
    tokenizer.texts_to_sequences(tweets_train),
    maxlen=max_length,
    padding='post',
    truncating='post',
)

# Build a subword tokenizer from the tweet corpus with a target vocabulary of
# `vocab_size` and subwords capped by `max_subword_length=5`.
# NOTE(review): newer tensorflow_datasets releases expose this class under
# `tfds.deprecated.text` -- confirm against the installed version.
tokenizer_sub = tfds.features.text.SubwordTextEncoder.build_from_corpus(
    tweets_list, vocab_size, max_subword_length=5
)
# Replace each tweet, in place, with its encoded subword ids.
for i, tweet in enumerate(tweets_list):
    tweets_list[i] = tokenizer_sub.encode(tweet)
# Pad/truncate once, after every tweet has been encoded. (The original had a
# second copy of this call fused onto the loop-body line, a syntax error.)
sentences_sub = pad_sequences(
    tweets_list, maxlen=max_length, padding='post', truncating='post'
)

# Define the model layer by layer: embedding -> two stacked bidirectional
# LSTMs -> small ReLU dense layer -> single sigmoid output unit.
# NOTE(review): `tf` is not imported in the visible part of this file --
# confirm `import tensorflow as tf` exists elsewhere.
model = tf.keras.Sequential()
model.add(
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length)
)
# The first recurrent layer returns the full sequence so the second recurrent
# layer can consume it.
model.add(
    tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(embedding_dim, return_sequences=True)
    )
)
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(embedding_dim)))
model.add(tf.keras.layers.Dense(6, activation='relu'))
# The output bias is initialised from `output_bias`, defined elsewhere.
model.add(
    tf.keras.layers.Dense(1, activation='sigmoid', bias_initializer=output_bias)
)

# Compile for binary classification. `learning_rate` is the supported keyword
# for the optimizer; the older `lr` alias is deprecated and rejected by newer
# Keras releases.
model.compile(
    loss='binary_crossentropy',
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-6),
    metrics=METRICS,
)
# Train the model for 100 epochs with the `lr_schedule` callback and
# per-class weights (both defined elsewhere).
history = model.fit(
    sentences_train_sub,
    label_train,
    epochs=100,
    batch_size=BATCH_SIZE,
    callbacks=[lr_schedule],
    class_weight=class_weight,
)

# Get predictions: the model ends in a single sigmoid unit, so each row of
# `preds` holds one score.
preds = model.predict(x=sentences_test_sub)
# Flatten to one scalar score per example.
# assumes `preds` is a NumPy array of shape (n_samples, 1) -- TODO confirm
pred_scores = preds.ravel()
# Threshold at 0.5 to obtain hard class labels for the F1 score.
pred_labels = [1 if score > 0.5 else 0 for score in pred_scores]
print("F1 score= ", f1_score(label_test, pred_labels))
# ROC AUC is a ranking metric: it needs the continuous scores. Passing the
# thresholded 0/1 labels would collapse the curve to a single operating point.
print("ROC AUC score = ", roc_auc_score(label_test, pred_scores))
	from tensorflow.keras.preprocessing.text import Tokenizer
	from tensorflow.keras.preprocessing.sequence import pad_sequences
	import tensorflow_datasets as tfds

	# Fit a word-level tokenizer on the tweet texts. The vocabulary is capped via
	# `num_words=vocab_size`, and out-of-vocabulary words map to "<OOV>".
	tokenizer = Tokenizer(oov_token="<OOV>", num_words=vocab_size)
	tokenizer.fit_on_texts(tweets_list)
	# Convert the training tweets to integer sequences, then pad/truncate each one
	# at the end so that every sequence has `max_length` entries.
	sentences_train = pad_sequences(
		tokenizer.texts_to_sequences(tweets_train),
		maxlen=max_length,
		padding='post',
		truncating='post',
	)

	# Build a subword tokenizer from the tweet corpus with a target vocabulary of
	# `vocab_size` and subwords capped by `max_subword_length=5`.
	# NOTE(review): newer tensorflow_datasets releases expose this class under
	# `tfds.deprecated.text` -- confirm against the installed version.
	tokenizer_sub = tfds.features.text.SubwordTextEncoder.build_from_corpus(
		tweets_list, vocab_size, max_subword_length=5
	)
	# Replace each tweet, in place, with its encoded subword ids.
	for i, tweet in enumerate(tweets_list):
		tweets_list[i] = tokenizer_sub.encode(tweet)
	# Pad/truncate once, after every tweet has been encoded. (The original had a
	# second copy of this call fused onto the loop-body line, a syntax error.)
	sentences_sub = pad_sequences(
		tweets_list, maxlen=max_length, padding='post', truncating='post'
	)

	# Define the model layer by layer: embedding -> two stacked bidirectional
	# LSTMs -> small ReLU dense layer -> single sigmoid output unit.
	# NOTE(review): `tf` is not imported in the visible part of this file --
	# confirm `import tensorflow as tf` exists elsewhere.
	model = tf.keras.Sequential()
	model.add(
		tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length)
	)
	# The first recurrent layer returns the full sequence so the second recurrent
	# layer can consume it.
	model.add(
		tf.keras.layers.Bidirectional(
			tf.keras.layers.LSTM(embedding_dim, return_sequences=True)
		)
	)
	model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(embedding_dim)))
	model.add(tf.keras.layers.Dense(6, activation='relu'))
	# The output bias is initialised from `output_bias`, defined elsewhere.
	model.add(
		tf.keras.layers.Dense(1, activation='sigmoid', bias_initializer=output_bias)
	)

	# Compile for binary classification. `learning_rate` is the supported keyword
	# for the optimizer; the older `lr` alias is deprecated and rejected by newer
	# Keras releases.
	model.compile(
		loss='binary_crossentropy',
		optimizer=tf.keras.optimizers.Adam(learning_rate=1e-6),
		metrics=METRICS,
	)
	# Train the model for 100 epochs with the `lr_schedule` callback and
	# per-class weights (both defined elsewhere).
	history = model.fit(
		sentences_train_sub,
		label_train,
		epochs=100,
		batch_size=BATCH_SIZE,
		callbacks=[lr_schedule],
		class_weight=class_weight,
	)

	# Get predictions: the model ends in a single sigmoid unit, so each row of
	# `preds` holds one score.
	preds = model.predict(x=sentences_test_sub)
	# Flatten to one scalar score per example.
	# assumes `preds` is a NumPy array of shape (n_samples, 1) -- TODO confirm
	pred_scores = preds.ravel()
	# Threshold at 0.5 to obtain hard class labels for the F1 score.
	pred_labels = [1 if score > 0.5 else 0 for score in pred_scores]
	print("F1 score= ", f1_score(label_test, pred_labels))
	# ROC AUC is a ranking metric: it needs the continuous scores. Passing the
	# thresholded 0/1 labels would collapse the curve to a single operating point.
	print("ROC AUC score = ", roc_auc_score(label_test, pred_scores))