import os
import pickle
import warnings
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Embedding
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import GlobalAveragePooling1D, LSTM, Bidirectional
from bad_content import config
from bad_content.utils import show_plot_confusion_matrix, show_classification_report
warnings.filterwarnings("ignore") # We're outlaws!
def create_embedding_matrix(filepath, word_index, embedding_dim):
    print('Creating embedding matrix from the GloVe vectors.')
    vocab_size = len(word_index) + 1  # Add 1 because index 0 is reserved for padding.
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    with open(filepath, encoding='utf8') as f:
        for line in f:
            word, *vector = line.split()
            if word in word_index:
                idx = word_index[word]
                embedding_matrix[idx] = np.array(vector, dtype=np.float32)[:embedding_dim]

    return embedding_matrix
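

# Optional diagnostic (not part of the original gist): a small sketch of how one might
# check what fraction of the tokenizer vocabulary is actually covered by the GloVe file,
# assuming the matrix was built with create_embedding_matrix above. Rows that stay
# all-zero correspond to words missing from GloVe.
def embedding_coverage(embedding_matrix):
    nonzero_rows = np.count_nonzero(np.count_nonzero(embedding_matrix, axis=1))
    return nonzero_rows / embedding_matrix.shape[0]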
def train(classification_report: bool = False, plot_confusion_matrix_report: bool = False) -> None:
    """For better results while training, play https://www.youtube.com/watch?v=_YYmfM2TfUA as loud as possible."""
    df = pd.read_csv('data/bad_content_clean.csv', encoding='utf-8')
    print(df.head())
    data = df.copy()  # Work on a copy of the data.
    print(f'Value counts: {data.spam.value_counts()}')
    # sns.countplot(data['spam'])
    # plt.show()

    X = data['content'].values
    y = data['spam'].values

    X_train: np.ndarray
    X_test: np.ndarray
    y_train: np.ndarray
    y_test: np.ndarray
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
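    # Note (not in the original gist): spam datasets are often imbalanced, so passing
    # stratify=y to train_test_split would keep the spam/ham ratio similar in both splits.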
    # Prepare the tokenizer.
    t = Tokenizer()
    t.fit_on_texts(X_train)

    # Integer-encode the documents.
    encoded_train = t.texts_to_sequences(X_train)
    encoded_test = t.texts_to_sequences(X_test)
    print(f'encoded_train[0:2]: {encoded_train[0:2]}')

    # Pad documents to a max length of 50 words.
    max_length = 50
    padded_train = pad_sequences(encoded_train, maxlen=max_length, padding='post')
    padded_test = pad_sequences(encoded_test, maxlen=max_length, padding='post')
    print(f'padded_train: {padded_train}')

    vocab_size = len(t.word_index) + 1
    embedding_dim = max_length
    embedding_matrix = create_embedding_matrix(
        f'data/glove.6B/glove.6B.{embedding_dim}d.txt',
        t.word_index,
        embedding_dim,
    )
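    # Note (not in the original gist): embedding_dim is tied to max_length (50) above only
    # so that the 50-dimensional glove.6B.50d.txt file is loaded; sequence length and
    # embedding dimension are otherwise independent choices.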
    def my_model():
        # Define the model as Sequential.
        model = Sequential()
        # The model trains for a number of epochs and stops once it is no longer improving.
        # This is made possible by the early stopping callback (https://keras.io/api/callbacks/early_stopping/).
        # The model training might run for about 11 or 12 epochs.
        # This varies because of the stochastic nature of the model and even the data splitting
        # (https://machinelearningmastery.com/stochastic-in-machine-learning/).
        model.add(Embedding(vocab_size, embedding_dim, input_length=max_length, weights=[embedding_matrix], trainable=False))
        # model.add(Flatten())
        model.add(GlobalAveragePooling1D())
        # Dense unit counts must be integers, so the training-set-size ratios are truncated.
        model.add(Dense(int(X_train.shape[0] / 4), activation='relu'))
        model.add(Dropout(0.2))
        model.add(Dense(int(X_train.shape[0] / 6), activation='relu'))
        model.add(Dropout(0.2))
        model.add(Dense(int(X_train.shape[0] / 8), activation='relu'))
        model.add(Dropout(0.2))
        model.add(Dense(int(X_train.shape[0] / 10), activation='relu'))
        model.add(Dropout(0.2))
        model.add(Dense(1, activation='sigmoid'))

        # Compile the model.
        model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
        # Summarize the model.
        print(f'model.summary(): {model.summary()}')

        early_stop = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=10)
        # checkpoint = ModelCheckpoint(
        #     'models/weights.{epoch:02d}-{val_loss:.2f}.hdf5',
        #     monitor='val_accuracy',
        #     save_best_only=True,
        #     verbose=1,
        # )

        # Fit the model.
        model.fit(
            x=padded_train,
            y=y_train,
            epochs=100,
            # batch_size=20,
            validation_data=(padded_test, y_test),
            verbose=1,
            # callbacks=[checkpoint, early_stop],
            callbacks=[early_stop],
            use_multiprocessing=True,
        )
        return model
    def lstm_model():
        # LSTM hyperparameters.
        n_lstm = 20
        drop_lstm = 0.2

        model = Sequential()
        model.add(Embedding(vocab_size, embedding_dim, input_length=max_length))
        model.add(LSTM(n_lstm, dropout=drop_lstm, return_sequences=True))
        # The final LSTM must return a single vector per document (not a sequence),
        # so that the sigmoid Dense layer emits one prediction per sample.
        model.add(LSTM(n_lstm, dropout=drop_lstm, return_sequences=False))
        model.add(Dense(1, activation='sigmoid'))
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
        # Summarize the model.
        print(f'model.summary(): {model.summary()}')

        num_epochs = 30
        early_stop = EarlyStopping(monitor='val_loss', patience=2)
        model.fit(
            padded_train,
            y_train,
            epochs=num_epochs,
            validation_data=(padded_test, y_test),
            callbacks=[early_stop],
            verbose=1,
        )
        return model
    def bi_lstm_model():
        # LSTM hyperparameters.
        n_lstm = 20
        drop_lstm = 0.2

        model = Sequential()
        model.add(Embedding(vocab_size, embedding_dim, input_length=max_length))
        # return_sequences is left at its default (False) so the Bidirectional LSTM emits
        # one vector per document for the sigmoid Dense layer that follows.
        model.add(Bidirectional(LSTM(n_lstm, dropout=drop_lstm)))
        # model.add(Bidirectional(CuDNNLSTM(
        #     units=n_lstm,
        #     dropout=drop_lstm,
        #     return_sequences=True,
        #     recurrent_activation='sigmoid',
        # )))
        model.add(Dense(1, activation='sigmoid'))
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
        # Summarize the model.
        print(f'model.summary(): {model.summary()}')

        num_epochs = 30
        early_stop = EarlyStopping(monitor='val_loss', patience=2)
        model.fit(
            padded_train, y_train, epochs=num_epochs,
            validation_data=(padded_test, y_test),
            callbacks=[early_stop],
            verbose=1,
            use_multiprocessing=True,
        )
        return model
    model = bi_lstm_model()
    preds = (model.predict(padded_test) > 0.5).astype("int32")
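    # The sigmoid output is a probability in [0, 1]; thresholding at 0.5 above converts it
    # to a hard 0/1 spam label for the reports below.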
    if classification_report:
        show_classification_report(y_test, preds)
    if plot_confusion_matrix_report:
        show_plot_confusion_matrix(y_test, preds)

    if not os.path.exists(config.__MODEL_SAVE_PATH):
        os.makedirs(config.__MODEL_SAVE_PATH)

    print(f'Saving model to {config.__MODEL_SAVE_PATH}')
    model.save(config.__MODEL_SAVE_PATH)
    with open(f'{config.__MODEL_SAVE_PATH}/tokenizer.pkl', 'wb') as output:
        pickle.dump(t, output, pickle.HIGHEST_PROTOCOL)
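

# A minimal inference sketch (not part of the original gist): it assumes train() has been
# run so that the model and tokenizer were saved under config.__MODEL_SAVE_PATH as above,
# and that new text gets the same 50-word post-padding before prediction. The function
# name predict_spam and the example input are hypothetical.
def predict_spam(texts, max_length=50):
    from tensorflow.keras.models import load_model

    model = load_model(config.__MODEL_SAVE_PATH)
    with open(f'{config.__MODEL_SAVE_PATH}/tokenizer.pkl', 'rb') as f:
        tokenizer = pickle.load(f)

    encoded = tokenizer.texts_to_sequences(texts)
    padded = pad_sequences(encoded, maxlen=max_length, padding='post')
    probs = model.predict(padded)
    return (probs > 0.5).astype("int32").ravel()


# Example (hypothetical input):
# print(predict_spam(["Click here to claim your free prize!!!"]))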
Cyral commented Sep 28, 2021

Link for the context behind this gist so I can bookmark it all in one place: https://news.ycombinator.com/item?id=28684764

Thanks for sharing!
