Hannes Hapke (hanneshapke), GitHub gists: BERT preprocessing and training with TFX and TF Transform
# One transformed training example: the three BERT inputs plus the label.
{
    'input_mask': array(
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]),
    'input_type_ids': array(
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
    'input_word_ids': array(
        [ 101, 2023, 3319, 3397, 27594, 2545, 2005, 2216, 2040, ..., 2014, 102]),
    'label': array([0])
}
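A record like the one above can be printed straight from the Transform component's output. This is a minimal sketch, not part of the gists; the "transform_output" and "train_tfrecords/*" paths are hypothetical placeholders, and GZIP compression is assumed because Transform writes its TFRecords gzip-compressed:

import tensorflow as tf
import tensorflow_transform as tft

# Hypothetical paths; substitute the artifact locations from your pipeline.
tf_transform_output = tft.TFTransformOutput("transform_output")
feature_spec = tf_transform_output.transformed_feature_spec()

dataset = tf.data.TFRecordDataset(
    tf.io.gfile.glob("train_tfrecords/*"), compression_type="GZIP")
for serialized in dataset.take(1):
    # Parse one serialized tf.Example with the transformed feature spec.
    example = tf.io.parse_single_example(serialized, feature_spec)
    print(example)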
import tensorflow as tf
import tensorflow_transform as tft
# Import path as used in the TFX 0.2x releases this gist targets.
from tfx.components.trainer.executor import TrainerFnArgs


def run_fn(fn_args: TrainerFnArgs):
    tf_transform_output = tft.TFTransformOutput(fn_args.transform_output)

    train_dataset = _input_fn(
        fn_args.train_files, tf_transform_output, 32)
    eval_dataset = _input_fn(
        fn_args.eval_files, tf_transform_output, 32)

    # Build the model inside the strategy scope so its variables are
    # mirrored across all available GPUs.
    mirrored_strategy = tf.distribute.MirroredStrategy()
    with mirrored_strategy.scope():
        model = get_model(tf_transform_output=tf_transform_output)
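run_fn references an _input_fn helper that is not part of the excerpt. A common TFX implementation (using the imports above) reads the gzipped TFRecords produced by Transform, batches them, and splits off the label; the _LABEL_KEY value of "label" is an assumption matching the example record at the top:

_LABEL_KEY = "label"  # assumed; must match the key emitted by preprocessing_fn


def _gzip_reader_fn(filenames):
    # Transform writes its TFRecord output gzip-compressed.
    return tf.data.TFRecordDataset(filenames, compression_type="GZIP")


def _input_fn(file_pattern, tf_transform_output, batch_size=32):
    transformed_feature_spec = (
        tf_transform_output.transformed_feature_spec().copy())
    return tf.data.experimental.make_batched_features_dataset(
        file_pattern=file_pattern,
        batch_size=batch_size,
        features=transformed_feature_spec,
        reader=_gzip_reader_fn,
        label_key=_LABEL_KEY)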
input_word_ids = tf.cast(inputs["input_word_ids"], dtype=tf.int32)
input_mask = tf.cast(inputs["input_mask"], dtype=tf.int32)
input_type_ids = tf.cast(inputs["input_type_ids"], dtype=tf.int32)
def get_model(tf_transform_output):
    feature_spec = tf_transform_output.transformed_feature_spec()
    feature_spec.pop(_LABEL_KEY)

    # One Keras input per transformed feature; shape needs a tuple, hence
    # the trailing comma (the original `shape=(max_seq_length)` is an int).
    inputs = {
        key: tf.keras.layers.Input(
            shape=(max_seq_length,), name=key, dtype=tf.int32)
        for key in feature_spec.keys()
    }

    input_word_ids = tf.cast(inputs["input_word_ids"], dtype=tf.int32)
    input_mask = tf.cast(inputs["input_mask"], dtype=tf.int32)
    input_type_ids = tf.cast(inputs["input_type_ids"], dtype=tf.int32)
    # Single-sentence classification: every token is segment 0, so the
    # type ids can simply be zeroed out.
    input_type_ids = tf.zeros_like(input_mask)
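The rest of get_model is not in the excerpt. A sketch of how these three tensors typically feed a TF Hub BERT layer follows; the hub URL, the single sigmoid head, and the compile settings are assumptions for a binary classifier, matching the label: array([0]) example above:

import tensorflow_hub as hub

# Continuing inside get_model() from the snippet above (all names as
# defined there). This TF2 SavedModel version returns
# (pooled_output, sequence_output) when called on a list of the three inputs.
bert_layer = hub.KerasLayer(
    "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
    trainable=True)
pooled_output, _ = bert_layer([input_word_ids, input_mask, input_type_ids])

output = tf.keras.layers.Dense(1, activation="sigmoid")(pooled_output)
model = tf.keras.Model(inputs=inputs, outputs=output)
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
    loss="binary_crossentropy",
    metrics=["accuracy"])
return model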
def preprocessing_fn(inputs):

    def tokenize_text(text, sequence_length=MAX_SEQ_LEN):
        ...
        return tf.reshape(tokens, [-1, sequence_length])

    def preprocess_bert_input(text, segment_id=0):
        input_word_ids = tokenize_text(text)
        ...
        return (
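The gist truncates both helpers. Judging by the transformed record shown at the top, preprocess_bert_input plausibly returns the word ids plus a mask and type ids, and preprocessing_fn maps them to the four output keys. A hedged sketch only; the "text" input key and the squeeze are assumptions about the raw feature shape:

def preprocessing_fn(inputs):
    ...  # nested helpers from the snippet above
    input_word_ids, input_mask, input_type_ids = preprocess_bert_input(
        tf.squeeze(inputs["text"], axis=1))  # "text" key is an assumption
    return {
        "input_word_ids": input_word_ids,
        "input_mask": input_mask,
        "input_type_ids": input_type_ids,
        "label": inputs["label"],
    }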
hanneshapke / adding_of_CLS_and_SEP_tokens.py (created March 9, 2020)
import tensorflow as tf
import tensorflow_text as text

# The tokenizer is created once at module level from the BERT vocab file.
bert_tokenizer = text.BertTokenizer(
    vocab_lookup_table=vocab_file_path,
    token_out_type=tf.int64,
    lower_case=do_lower_case
)

# Ids of the special [CLS] and [SEP] markers in the standard BERT vocab.
CLS_ID = tf.constant(101, dtype=tf.int64)
SEP_ID = tf.constant(102, dtype=tf.int64)

# Inside tokenize_text (whose `text` parameter shadows the module alias):
# tokenize, truncate to leave room for the two markers, then wrap every
# sequence as [CLS] tokens... [SEP].
tokens = bert_tokenizer.tokenize(text)
tokens = tokens[:, :sequence_length - 2]
start_tokens = tf.fill([tf.shape(text)[0], 1], CLS_ID)
end_tokens = tf.fill([tf.shape(text)[0], 1], SEP_ID)
tokens = tf.concat([start_tokens, tokens, end_tokens], axis=1)
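As a quick standalone check, not from the gists: the vocab path and input string below are hypothetical. BertTokenizer.tokenize returns a ragged [batch, words, wordpieces] tensor, so the word and wordpiece axes are typically merged before padding or truncation; the expected ids match the input_word_ids example at the top for a standard uncased vocab:

import tensorflow as tf
import tensorflow_text as tf_text

tokenizer = tf_text.BertTokenizer(
    vocab_lookup_table="vocab.txt",  # hypothetical local BERT vocab file
    token_out_type=tf.int64,
    lower_case=True,
)
tokens = tokenizer.tokenize(["this review contains spoilers"])
tokens = tokens.merge_dims(1, 2)  # [batch, words, pieces] -> [batch, pieces]
print(tokens.to_list())  # e.g. [[2023, 3319, 3397, 27594, 2545]]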