To fix the integration of the BERT embeddings layer with my model.
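The script below wires a frozen TF Hub BERT module into a Keras BiLSTM tagger for word sense disambiguation: sentences are converted into BERT's (input_ids, input_mask, segment_ids) triples, embedded, and labeled token by token.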
import os
import yaml
import numpy as np
from argparse import ArgumentParser

import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras import backend as K
from tensorflow.keras.layers import (LSTM, Bidirectional, Dense, Input,
                                     TimeDistributed)
from tensorflow.keras.models import Model

try:
    from bert.tokenization import FullTokenizer
except ModuleNotFoundError:
    os.system('pip install bert-tensorflow')
    from bert.tokenization import FullTokenizer  # retry now that it is installed

try:
    # `keras_bert` here is the author's local helper module (not included in
    # this gist), which provides the custom BertEmbeddingLayer used below.
    from keras_bert import BertEmbeddingLayer
except ImportError:
    BertEmbeddingLayer = None  # a sketch of the layer is defined below

from model_utils import visualize_plot_mdl
from parsing_dataset import load_dataset
from utilities import configure_tf, initialize_logger
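# The author's BertEmbeddingLayer is not included in this gist. The class below
# is a minimal, hypothetical sketch of such a layer, assuming the common TF1
# pattern of wrapping the TF Hub BERT module as a frozen Keras layer; it
# illustrates the expected interface, not the author's actual implementation.
if BertEmbeddingLayer is None:
    class BertEmbeddingLayer(tf.keras.layers.Layer):
        """Wraps TF Hub BERT and returns its per-token sequence output."""

        def __init__(self,
                     bert_path="https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1",
                     **kwargs):
            self.bert_path = bert_path
            self.output_dim = 768  # hidden size of the BERT-Base module above
            super(BertEmbeddingLayer, self).__init__(**kwargs)

        def build(self, input_shape):
            # Frozen BERT: trainable=False, so no weights are registered here.
            self.bert = hub.Module(self.bert_path, trainable=False,
                                   name="{}_module".format(self.name))
            super(BertEmbeddingLayer, self).build(input_shape)

        def call(self, inputs):
            input_ids, input_mask, segment_ids = [K.cast(x, dtype="int32")
                                                  for x in inputs]
            bert_inputs = dict(input_ids=input_ids, input_mask=input_mask,
                               segment_ids=segment_ids)
            # "sequence_output" has shape [batch, max_seq_len, 768]
            return self.bert(inputs=bert_inputs, signature="tokens",
                             as_dict=True)["sequence_output"]

        def compute_output_shape(self, input_shape):
            return (input_shape[0][0], input_shape[0][1], self.output_dim)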
def parse_args():
    parser = ArgumentParser(description="WSD")
    parser.add_argument("--model_type", default='baseline', type=str,
                        help="""Choose the model: baseline: BiLSTM Model.
                        attention: Attention Stacked BiLSTM Model.
                        seq2seq: Seq2Seq Attention.""")
    return vars(parser.parse_args())
def train_model(mdl, data, epochs=1, batch_size=32):
    [train_input_ids, train_input_masks, train_segment_ids], train_labels = data
    history = mdl.fit([train_input_ids, train_input_masks, train_segment_ids],
                      train_labels, epochs=epochs, batch_size=batch_size)
    return history
def baseline_model(output_size, max_seq_len, visualize=False, plot=False):
    hidden_size = 128

    in_id = Input(shape=(max_seq_len,), name="input_ids")
    in_mask = Input(shape=(max_seq_len,), name="input_masks")
    in_segment = Input(shape=(max_seq_len,), name="segment_ids")
    bert_inputs = [in_id, in_mask, in_segment]

    bert_embeddings = BertEmbeddingLayer()(bert_inputs)
    bilstm = Bidirectional(LSTM(hidden_size, dropout=0.2,
                                recurrent_dropout=0.2,
                                return_sequences=True))(bert_embeddings)
    output = TimeDistributed(Dense(output_size, activation='softmax'))(bilstm)

    mdl = Model(inputs=bert_inputs, outputs=output, name="Bert_BiLSTM")
    mdl.compile(loss="sparse_categorical_crossentropy",
                optimizer='adam', metrics=["acc"])
    visualize_plot_mdl(visualize, plot, mdl)
    return mdl
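# Shape flow through baseline_model (batch B, T = max_seq_len):
#   BERT embeddings       -> (B, T, 768)
#   Bidirectional LSTM    -> (B, T, 2 * hidden_size) = (B, T, 256)
#   TimeDistributed Dense -> (B, T, output_size), a softmax over senses per token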
def initialize_vars(sess):
    sess.run(tf.local_variables_initializer())
    sess.run(tf.global_variables_initializer())
    sess.run(tf.tables_initializer())
    K.set_session(sess)
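# hub.Module creates TF1 variables and lookup tables that are not initialized
# automatically; they must be initialized in the very session Keras will use
# for training, hence K.set_session(sess) after running the initializers.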
class PaddingInputExample(object):
    """Fake example so the num input examples is a multiple of the batch size.

    When running eval/predict on the TPU, we need to pad the number of examples
    to be a multiple of the batch size, because the TPU requires a fixed batch
    size. The alternative is to drop the last batch, which is bad because it
    means the entire output data won't be generated.

    We use this class instead of `None` because treating `None` as padding
    batches could cause silent errors.
    """
class InputExample(object):
    """A single training/test example for simple sequence classification."""

    def __init__(self, guid, text_a, text_b=None, label=None):
        """Constructs an InputExample.

        Args:
            guid: Unique id for the example.
            text_a: string. The un-tokenized text of the first sequence. For
                single-sequence tasks, only this sequence must be specified.
            text_b: (Optional) string. The un-tokenized text of the second
                sequence. Only must be specified for sequence-pair tasks.
            label: (Optional) string. The label of the example. This should be
                specified for train and dev examples, but not for test examples.
        """
        self.guid = guid
        self.text_a = text_a
        self.text_b = text_b
        self.label = label
def convert_single_example(tokenizer, example, max_seq_length=512):
    """Converts a single InputExample into a single InputFeatures."""
    if isinstance(example, PaddingInputExample):
        input_ids = [0] * max_seq_length
        input_mask = [0] * max_seq_length
        segment_ids = [0] * max_seq_length
        label = 0
        return input_ids, input_mask, segment_ids, label

    tokens_a = tokenizer.tokenize(example.text_a)
    if len(tokens_a) > max_seq_length - 2:
        tokens_a = tokens_a[0:(max_seq_length - 2)]

    tokens = []
    segment_ids = []
    tokens.append("[CLS]")
    segment_ids.append(0)
    for token in tokens_a:
        tokens.append(token)
        segment_ids.append(0)
    tokens.append("[SEP]")
    segment_ids.append(0)

    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    input_mask = [1] * len(input_ids)

    # Zero-pad up to the sequence length.
    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
        segment_ids.append(0)

    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length

    return input_ids, input_mask, segment_ids, example.label
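# Worked example of the encoding above (assuming every word stays a single
# WordPiece): text_a = "bank of the river" with max_seq_length = 8 gives
#   tokens      = ["[CLS]", "bank", "of", "the", "river", "[SEP]"]
#   input_mask  = [1, 1, 1, 1, 1, 1, 0, 0]   # 1 = real token, 0 = padding
#   segment_ids = [0, 0, 0, 0, 0, 0, 0, 0]   # single-sequence task
# and input_ids holds the vocab ids of `tokens`, zero-padded to length 8.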
def convert_examples_to_features(tokenizer, examples, max_seq_length=512):
    """Convert a set of InputExamples to a list of InputFeatures."""
    input_ids, input_masks, segment_ids, labels = [], [], [], []
    for example in examples:
        input_id, input_mask, segment_id, label = convert_single_example(
            tokenizer, example, max_seq_length
        )
        input_ids.append(input_id)
        input_masks.append(input_mask)
        segment_ids.append(segment_id)
        labels.append(label)
    return (
        np.array(input_ids).astype(np.int32),
        np.array(input_masks).astype(np.int32),
        np.array(segment_ids).astype(np.int32),
        np.array(labels),
    )
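# Each returned array has shape (num_examples, max_seq_length) except `labels`,
# which keeps whatever structure `example.label` carries (per-token sense ids
# here, so it may be ragged if label sequences differ in length).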
def convert_text_to_examples(texts, labels):
    """Create InputExamples."""
    InputExamples = []
    for text, label in zip(texts, labels):
        InputExamples.append(
            InputExample(
                guid=None, text_a=" ".join(text), text_b=None, label=label
            )
        )
    return InputExamples
def create_tokenizer_from_hub_module(bert_path="https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"):
    """Get the vocab file and casing info from the Hub module."""
    bert_module = hub.Module(bert_path)
    tokenization_info = bert_module(signature="tokenization_info", as_dict=True)
    vocab_file, do_lower_case = sess.run(
        [
            tokenization_info["vocab_file"],
            tokenization_info["do_lower_case"],
        ]
    )
    return FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)
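# The Hub module is run once, in the global `sess`, only to read back the vocab
# file path and the casing flag; the resulting FullTokenizer then works locally.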
# Initialize session
sess = tf.Session()

params = parse_args()
initialize_logger()
configure_tf()

# Load our config file
config_file_path = os.path.join(os.getcwd(), "config.yaml")
with open(config_file_path) as config_file:
    config_params = yaml.safe_load(config_file)

elmo = config_params["use_elmo"]
dataset = load_dataset(elmo=elmo)
vocabulary_size = dataset.get("vocabulary_size")
output_size = dataset.get("output_size")

# Parse data in BERT format
max_seq_len = 512
train_x = dataset.get("train_x")
train_text = []
for example in train_x:
    train_text.append(" ".join(str(n) for n in example))
train_text = [' '.join(t.split()[0:max_seq_len]) for t in train_text]
train_text = np.array(train_text, dtype=object)[:, np.newaxis]
# print(train_text.shape)  # (37_184, 1)
train_labels = dataset.get("train_y")

# Instantiate tokenizer
tokenizer = create_tokenizer_from_hub_module()

# Convert data to InputExample format
train_examples = convert_text_to_examples(train_text, train_labels)

# Extract features
(train_input_ids, train_input_masks,
 train_segment_ids, train_labels) = convert_examples_to_features(
    tokenizer, train_examples, max_seq_length=max_seq_len)

bert_inputs = [train_input_ids, train_input_masks, train_segment_ids]
data = bert_inputs, train_labels
del dataset

model = baseline_model(output_size, max_seq_len, visualize=True)

# Instantiate variables
initialize_vars(sess)

history = train_model(model, data)