import os
import pickle
import warnings
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Embedding
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import GlobalAveragePooling1D, LSTM, Bidirectional

from bad_content import config
from bad_content.utils import show_plot_confusion_matrix, show_classification_report

warnings.filterwarnings("ignore")  # We're outlaws!
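

# create_embedding_matrix reads a GloVe text file where each line is a token followed by
# its float vector components ("word 0.123 -0.456 ..."), which is why
# `word, *vector = line.split()` splits cleanly. Rows of the returned matrix follow the
# tokenizer's word_index, and row 0 stays all-zero for the padding index.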
def create_embedding_matrix(filepath, word_index, embedding_dim):
    print('Creating embedding matrix from the GloVe vectors.')
    vocab_size = len(word_index) + 1  # Add 1 because of the reserved 0 index.
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    with open(filepath, encoding='utf8') as f:
        for line in f:
            word, *vector = line.split()
            if word in word_index:
                idx = word_index[word]
                embedding_matrix[idx] = np.array(vector, dtype=np.float32)[:embedding_dim]

    return embedding_matrix


def train(classification_report: bool = False, plot_confusion_matrix_report: bool = False) -> None:
    """For better results while training, play https://www.youtube.com/watch?v=_YYmfM2TfUA as loud as possible."""
    df = pd.read_csv('data/bad_content_clean.csv', encoding='utf-8')
    print(df.head())

    data = df.copy()  # Work on a copy of the data.
    print(f'Value Count: {data.spam.value_counts()}')
    # sns.countplot(data['spam'])
    # plt.show()

    X = data['content'].values
    y = data['spam'].values

    X_train: np.ndarray
    X_test: np.ndarray
    y_train: np.ndarray
    y_test: np.ndarray
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

    # Prepare the tokenizer.
    t = Tokenizer()
    t.fit_on_texts(X_train)

    # Integer-encode the documents.
    encoded_train = t.texts_to_sequences(X_train)
    encoded_test = t.texts_to_sequences(X_test)
    print(f'encoded_train[0:2]: {encoded_train[0:2]}')

    # Pad documents to a max length of 50 words.
    max_length = 50
    padded_train = pad_sequences(encoded_train, maxlen=max_length, padding='post')
    padded_test = pad_sequences(encoded_test, maxlen=max_length, padding='post')
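    # pad_sequences forces every sequence to exactly max_length entries; with padding='post'
    # the zeros go at the end, e.g. [3, 7] becomes [3, 7, 0, 0, ..., 0] (50 entries), and
    # longer sequences are truncated.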
    print(f'padded_train: {padded_train}')

    vocab_size = len(t.word_index) + 1
    embedding_dim = max_length
    embedding_matrix = create_embedding_matrix(
        f'data/glove.6B/glove.6B.{embedding_dim}d.txt',
        t.word_index,
        embedding_dim
    )
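    # embedding_matrix has shape (vocab_size, embedding_dim). my_model uses it as frozen
    # Embedding weights (trainable=False); the LSTM variants below train their own
    # embeddings instead.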

    def my_model():
        # Define the model as Sequential.
        model = Sequential()
        # The model trains for a number of epochs and stops once it is no longer improving.
        # This is made possible by the early stopping callback
        # (https://keras.io/api/callbacks/early_stopping/).
        # Training usually runs for about 11 or 12 epochs; this varies because of the
        # stochastic nature of the model and of the data split
        # (https://machinelearningmastery.com/stochastic-in-machine-learning/).
        model.add(Embedding(vocab_size, embedding_dim, input_length=max_length, weights=[embedding_matrix], trainable=False))
        # model.add(Flatten())
        model.add(GlobalAveragePooling1D())
        # The dense layer widths are derived from the training-set size; Dense expects an
        # integer number of units, hence the floor division.
        model.add(Dense(X_train.shape[0] // 4, activation='relu'))
        model.add(Dropout(0.2))
        model.add(Dense(X_train.shape[0] // 6, activation='relu'))
        model.add(Dropout(0.2))
        model.add(Dense(X_train.shape[0] // 8, activation='relu'))
        model.add(Dropout(0.2))
        model.add(Dense(X_train.shape[0] // 10, activation='relu'))
        model.add(Dropout(0.2))
        model.add(Dense(1, activation='sigmoid'))

        # Compile the model.
        model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
        # Summarize the model.
        model.summary()

        early_stop = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=10)
        # checkpoint = ModelCheckpoint(
        #     'models/weights.{epoch:02d}-{val_loss:.2f}.hdf5',
        #     monitor='val_accuracy',
        #     save_best_only=True,
        #     verbose=1,
        # )

        # Fit the model.
        model.fit(
            x=padded_train,
            y=y_train,
            epochs=100,
            # batch_size=20,
            validation_data=(padded_test, y_test),
            verbose=1,
            # callbacks=[checkpoint, early_stop],
            callbacks=[early_stop],
            use_multiprocessing=True
        )
        return model

    def lstm_model():
        # LSTM hyperparameters.
        n_lstm = 20
        drop_lstm = 0.2

        model = Sequential()
        model.add(Embedding(vocab_size, embedding_dim, input_length=max_length))
        model.add(LSTM(n_lstm, dropout=drop_lstm, return_sequences=True))
        # The last LSTM returns only its final state so the sigmoid output is one value per sample.
        model.add(LSTM(n_lstm, dropout=drop_lstm, return_sequences=False))
        model.add(Dense(1, activation='sigmoid'))
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
        # Summarize the model.
        model.summary()

        num_epochs = 30
        early_stop = EarlyStopping(monitor='val_loss', patience=2)
        model.fit(
            padded_train,
            y_train,
            epochs=num_epochs,
            validation_data=(padded_test, y_test),
            callbacks=[early_stop],
            verbose=1,
        )
        return model

    def bi_lstm_model():
        # LSTM hyperparameters.
        n_lstm = 20
        drop_lstm = 0.2

        model = Sequential()
        model.add(Embedding(vocab_size, embedding_dim, input_length=max_length))
        # return_sequences=False so the bidirectional layer emits a single vector per sample
        # for the sigmoid output.
        model.add(Bidirectional(LSTM(n_lstm, dropout=drop_lstm, return_sequences=False)))
        # model.add(Bidirectional(CuDNNLSTM(
        #     units=n_lstm,
        #     dropout=drop_lstm,
        #     return_sequences=True,
        #     recurrent_activation='sigmoid',
        # )))
        model.add(Dense(1, activation='sigmoid'))
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
        # Summarize the model.
        model.summary()

        num_epochs = 30
        early_stop = EarlyStopping(monitor='val_loss', patience=2)
        model.fit(
            padded_train, y_train, epochs=num_epochs,
            validation_data=(padded_test, y_test),
            callbacks=[early_stop],
            verbose=1,
            use_multiprocessing=True
        )
        return model

    # my_model and lstm_model are kept as alternatives; the bidirectional LSTM is the one trained here.
    model = bi_lstm_model()
    preds = (model.predict(padded_test) > 0.5).astype("int32")

    if classification_report:
        show_classification_report(y_test, preds)
    if plot_confusion_matrix_report:
        show_plot_confusion_matrix(y_test, preds)

    if not os.path.exists(config.__MODEL_SAVE_PATH):
        os.makedirs(config.__MODEL_SAVE_PATH)

    print(f'Saving model to {config.__MODEL_SAVE_PATH}')
    model.save(config.__MODEL_SAVE_PATH)
    with open(f'{config.__MODEL_SAVE_PATH}/tokenizer.pkl', 'wb') as output:
        pickle.dump(t, output, pickle.HIGHEST_PROTOCOL)
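

# Not part of the original gist: a minimal inference sketch, assuming the model and tokenizer
# were saved by train() to config.__MODEL_SAVE_PATH as above. The hypothetical predict_texts()
# helper reuses the same 50-token post-padding so new texts match the shape the network was
# trained on.
def predict_texts(texts):
    from tensorflow.keras.models import load_model

    model = load_model(config.__MODEL_SAVE_PATH)
    with open(f'{config.__MODEL_SAVE_PATH}/tokenizer.pkl', 'rb') as f:
        tokenizer = pickle.load(f)

    encoded = tokenizer.texts_to_sequences(texts)
    padded = pad_sequences(encoded, maxlen=50, padding='post')
    # Probabilities above 0.5 count as spam, mirroring the threshold used in train().
    return (model.predict(padded) > 0.5).astype("int32")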