LysandreJik/compare_albert.py

## compare_albert.py
import tensorflow_hub as hub
import tensorflow as tf
import modeling
import os
import numpy as np
import tokenization

# Model size and paths
# `model_size` is lower-cased again when it is formatted into the hub URL below.
model_size = 'large'.upper()
version = 2

# Placeholder locations; replace them with real paths before running.
# `vocab_path` feeds the tokenizer, `config_path` the AlbertConfig and
# `albert_path` the checkpoint lookup / initialisation calls further down.
vocab_path = "path_to_vocab/30k-clean.model"
config_path = "path_to_config/config.json"
albert_path = "path_to_albert/albert"

# Init tokenizer
# The same file is passed both positionally and as `spm_model_file`.
tftok = tokenization.FullTokenizer(vocab_path, spm_model_file=vocab_path)

# Create inputs
# A batch holding one sentence: the mask is all ones and the segment ids are
# all zeros, each with one entry per token id.
input_sentence = "this is nice".lower()
tf_input_ids_init = [tftok.convert_tokens_to_ids(tftok.tokenize(input_sentence))]
input_mask = [[1] * len(tf_input_ids_init[0])]
segment_ids = [[0] * len(tf_input_ids_init[0])]

tf_input_ids = tf.constant(tf_input_ids_init)
tf_input_mask = tf.constant(input_mask)
tf_segment_ids = tf.constant(segment_ids)
# Keyword arguments for modeling.AlbertModel (note `token_type_ids`, whereas
# the hub module below takes the same tensor as `segment_ids`).
tf_dict = {"input_ids": tf_input_ids, "input_mask": tf_input_mask, "token_type_ids": tf_segment_ids}
# Position passed to the hub module's "mlm" signature as `mlm_positions`.
mlm_index = 2
tf_mlm_positions = tf.constant([[mlm_index]])

# Load the config and model
albert_config = modeling.AlbertConfig.from_json_file(config_path)
albert_module_modeling = modeling.AlbertModel(albert_config, is_training=False, **tf_dict)

# Init the model from the saved HUB checkpoint
# The trainable variables are collected here, before the hub module is created.
tvars = tf.trainable_variables()
(assignment_map, initialized_variable_names) = modeling.get_assignment_map_from_checkpoint(tvars, albert_path)
tf.train.init_from_checkpoint(albert_path, assignment_map)

# Sanity check: the assignment map has exactly one entry per trainable variable.
assert len(tvars) == len(assignment_map.items())

# Instantiate a HUB module with the right size
model = hub.Module("https://tfhub.dev/google/albert_{}/{}".format(model_size.lower(), version), trainable=False)
albert_inputs = dict(input_ids=tf_input_ids, input_mask=tf_input_mask, segment_ids=tf_segment_ids)

# Get the model outputs
albert_outputs = model(albert_inputs, signature="tokens", as_dict=True)
albert_mlm_outputs = model({**albert_inputs, "mlm_positions": tf_mlm_positions}, signature="mlm", as_dict=True)["mlm_logits"]
pooled_output = albert_outputs["pooled_output"]
sequence_output = albert_outputs["sequence_output"]

init = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init)

    # Using the HUB module
    # All three tensors are fetched in one call so the graph is evaluated once
    # rather than once per requested output.
    hub_pooled_output, hub_sequence_output, hub_mlm_outputs = sess.run(
        [pooled_output, sequence_output, albert_mlm_outputs]
    )

    # Using the TF1 implementation
    # Same idea: one run yields the embedding, pooled and sequence outputs.
    tf_embedding, tf_pooled, tf_sequence = sess.run(
        [
            albert_module_modeling.get_embedding_output(),
            albert_module_modeling.get_pooled_output(),
            albert_module_modeling.get_sequence_output(),
        ]
    )

# Calculate the difference between two tensors
def difference_between_tensors(tf_tensor, tf_tensor_2):
    """Return the largest absolute element-wise difference of two array-likes.

    Args:
        tf_tensor: First value; converted with ``np.array``.
        tf_tensor_2: Second value; converted with ``np.array``.

    Returns:
        The maximum of ``|tf_tensor - tf_tensor_2|`` over all elements.
        Shapes are not checked here, so the subtraction follows NumPy's
        usual broadcasting rules.
    """
    left, right = np.array(tf_tensor), np.array(tf_tensor_2)
    absolute_gap = np.abs(left - right)
    return absolute_gap.max()

# Report the maximum absolute difference for each pair of matching outputs.
print("\nComparing the HUB and TF1 layers")
for label, hub_value, tf_value in (
    ("-- pooled           ", hub_pooled_output, tf_pooled),
    ("-- full transformer ", hub_sequence_output, tf_sequence),
):
    print(label, difference_between_tensors(hub_value, tf_value))
	import tensorflow_hub as hub
	import tensorflow as tf
	import modeling
	import os
	import numpy as np
	import tokenization

	# Model size and paths
	# `model_size` is lower-cased again when it is formatted into the hub URL below.
	model_size = 'large'.upper()
	version = 2

	# Placeholder locations; replace them with real paths before running.
	# `vocab_path` feeds the tokenizer, `config_path` the AlbertConfig and
	# `albert_path` the checkpoint lookup / initialisation calls further down.
	vocab_path = "path_to_vocab/30k-clean.model"
	config_path = "path_to_config/config.json"
	albert_path = "path_to_albert/albert"

	# Init tokenizer
	# The same file is passed both positionally and as `spm_model_file`.
	tftok = tokenization.FullTokenizer(vocab_path, spm_model_file=vocab_path)

	# Create inputs
	# A batch holding one sentence: the mask is all ones and the segment ids are
	# all zeros, each with one entry per token id.
	input_sentence = "this is nice".lower()
	tf_input_ids_init = [tftok.convert_tokens_to_ids(tftok.tokenize(input_sentence))]
	input_mask = [[1] * len(tf_input_ids_init[0])]
	segment_ids = [[0] * len(tf_input_ids_init[0])]

	tf_input_ids = tf.constant(tf_input_ids_init)
	tf_input_mask = tf.constant(input_mask)
	tf_segment_ids = tf.constant(segment_ids)
	# Keyword arguments for modeling.AlbertModel (note `token_type_ids`, whereas
	# the hub module below takes the same tensor as `segment_ids`).
	tf_dict = {"input_ids": tf_input_ids, "input_mask": tf_input_mask, "token_type_ids": tf_segment_ids}
	# Position passed to the hub module's "mlm" signature as `mlm_positions`.
	mlm_index = 2
	tf_mlm_positions = tf.constant([[mlm_index]])

	# Load the config and model
	albert_config = modeling.AlbertConfig.from_json_file(config_path)
	albert_module_modeling = modeling.AlbertModel(albert_config, is_training=False, **tf_dict)

	# Init the model from the saved HUB checkpoint
	# The trainable variables are collected here, before the hub module is created.
	tvars = tf.trainable_variables()
	(assignment_map, initialized_variable_names) = modeling.get_assignment_map_from_checkpoint(tvars, albert_path)
	tf.train.init_from_checkpoint(albert_path, assignment_map)

	# Sanity check: the assignment map has exactly one entry per trainable variable.
	assert len(tvars) == len(assignment_map.items())

	# Instantiate a HUB module with the right size
	model = hub.Module("https://tfhub.dev/google/albert_{}/{}".format(model_size.lower(), version), trainable=False)
	albert_inputs = dict(input_ids=tf_input_ids, input_mask=tf_input_mask, segment_ids=tf_segment_ids)

	# Get the model outputs
	albert_outputs = model(albert_inputs, signature="tokens", as_dict=True)
	albert_mlm_outputs = model({**albert_inputs, "mlm_positions": tf_mlm_positions}, signature="mlm", as_dict=True)["mlm_logits"]
	pooled_output = albert_outputs["pooled_output"]
	sequence_output = albert_outputs["sequence_output"]

	init = tf.global_variables_initializer()
	# Everything that needs the session is nested under the `with` statement.
	with tf.Session() as sess:
		sess.run(init)

		# Using the HUB module
		# All three tensors are fetched in one call so the graph is evaluated
		# once rather than once per requested output.
		hub_pooled_output, hub_sequence_output, hub_mlm_outputs = sess.run(
			[pooled_output, sequence_output, albert_mlm_outputs]
		)

		# Using the TF1 implementation
		# Same idea: one run yields the embedding, pooled and sequence outputs.
		tf_embedding, tf_pooled, tf_sequence = sess.run(
			[
				albert_module_modeling.get_embedding_output(),
				albert_module_modeling.get_pooled_output(),
				albert_module_modeling.get_sequence_output(),
			]
		)

	# Calculate the difference between two tensors
	def difference_between_tensors(tf_tensor, tf_tensor_2):
		"""Return the largest absolute element-wise difference of two array-likes.

		Args:
			tf_tensor: First value; converted with ``np.array``.
			tf_tensor_2: Second value; converted with ``np.array``.

		Returns:
			The maximum of ``|tf_tensor - tf_tensor_2|`` over all elements.
			Shapes are not checked here, so the subtraction follows NumPy's
			usual broadcasting rules.
		"""
		tf_np = np.array(tf_tensor)
		tf_np_2 = np.array(tf_tensor_2)
		return np.max(np.abs(tf_np - tf_np_2))

	# Report the maximum absolute difference for each pair of matching outputs.
	print("\nComparing the HUB and TF1 layers")
	for label, hub_value, tf_value in (
		("-- pooled ", hub_pooled_output, tf_pooled),
		("-- full transformer ", hub_sequence_output, tf_sequence),
	):
		print(label, difference_between_tensors(hub_value, tf_value))