Hannes Hapke (hanneshapke), GitHub gists: BERT preprocessing and training with TFX and TF Transform
# One transformed training example: the three BERT inputs plus the label.
{
    'input_mask': array(
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]),
    'input_type_ids': array(
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
    'input_word_ids': array(
        [ 101, 2023, 3319, 3397, 27594, 2545, 2005, 2216, 2040, ..., 2014, 102]),
    'label': array([0])
}
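A record like the one above can be printed straight from the Transform component's output. This is a minimal sketch, not part of the gists; the "transform_output" and "train_tfrecords/*" paths are hypothetical placeholders, and GZIP compression is assumed because Transform writes its TFRecords gzip-compressed:

import tensorflow as tf
import tensorflow_transform as tft

# Hypothetical paths; substitute the artifact locations from your pipeline.
tf_transform_output = tft.TFTransformOutput("transform_output")
feature_spec = tf_transform_output.transformed_feature_spec()

dataset = tf.data.TFRecordDataset(
    tf.io.gfile.glob("train_tfrecords/*"), compression_type="GZIP")
for serialized in dataset.take(1):
    # Parse one serialized tf.Example with the transformed feature spec.
    example = tf.io.parse_single_example(serialized, feature_spec)
    print(example)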
import tensorflow as tf
import tensorflow_transform as tft
# Import path as used in the TFX 0.2x releases this gist targets.
from tfx.components.trainer.executor import TrainerFnArgs


def run_fn(fn_args: TrainerFnArgs):
    tf_transform_output = tft.TFTransformOutput(fn_args.transform_output)

    train_dataset = _input_fn(
        fn_args.train_files, tf_transform_output, 32)
    eval_dataset = _input_fn(
        fn_args.eval_files, tf_transform_output, 32)

    # Build the model inside the strategy scope so its variables are
    # mirrored across all available GPUs.
    mirrored_strategy = tf.distribute.MirroredStrategy()
    with mirrored_strategy.scope():
        model = get_model(tf_transform_output=tf_transform_output)
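run_fn references an _input_fn helper that is not part of the excerpt. A common TFX implementation (using the imports above) reads the gzipped TFRecords produced by Transform, batches them, and splits off the label; the _LABEL_KEY value of "label" is an assumption matching the example record at the top:

_LABEL_KEY = "label"  # assumed; must match the key emitted by preprocessing_fn


def _gzip_reader_fn(filenames):
    # Transform writes its TFRecord output gzip-compressed.
    return tf.data.TFRecordDataset(filenames, compression_type="GZIP")


def _input_fn(file_pattern, tf_transform_output, batch_size=32):
    transformed_feature_spec = (
        tf_transform_output.transformed_feature_spec().copy())
    return tf.data.experimental.make_batched_features_dataset(
        file_pattern=file_pattern,
        batch_size=batch_size,
        features=transformed_feature_spec,
        reader=_gzip_reader_fn,
        label_key=_LABEL_KEY)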
input_word_ids = tf.cast(inputs["input_word_ids"], dtype=tf.int32)
input_mask = tf.cast(inputs["input_mask"], dtype=tf.int32)
input_type_ids = tf.cast(inputs["input_type_ids"], dtype=tf.int32)
def get_model(tf_transform_output):
    feature_spec = tf_transform_output.transformed_feature_spec()
    feature_spec.pop(_LABEL_KEY)

    # One Keras input per transformed feature; shape needs a tuple, hence
    # the trailing comma (the original `shape=(max_seq_length)` is an int).
    inputs = {
        key: tf.keras.layers.Input(
            shape=(max_seq_length,), name=key, dtype=tf.int32)
        for key in feature_spec.keys()
    }

    input_word_ids = tf.cast(inputs["input_word_ids"], dtype=tf.int32)
    input_mask = tf.cast(inputs["input_mask"], dtype=tf.int32)
    input_type_ids = tf.cast(inputs["input_type_ids"], dtype=tf.int32)
    # Single-sentence classification: every token is segment 0, so the
    # type ids can simply be zeroed out.
    input_type_ids = tf.zeros_like(input_mask)
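The rest of get_model is not in the excerpt. A sketch of how these three tensors typically feed a TF Hub BERT layer follows; the hub URL, the single sigmoid head, and the compile settings are assumptions for a binary classifier, matching the label: array([0]) example above:

import tensorflow_hub as hub

# Continuing inside get_model() from the snippet above (all names as
# defined there). This TF2 SavedModel version returns
# (pooled_output, sequence_output) when called on a list of the three inputs.
bert_layer = hub.KerasLayer(
    "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
    trainable=True)
pooled_output, _ = bert_layer([input_word_ids, input_mask, input_type_ids])

output = tf.keras.layers.Dense(1, activation="sigmoid")(pooled_output)
model = tf.keras.Model(inputs=inputs, outputs=output)
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
    loss="binary_crossentropy",
    metrics=["accuracy"])
return model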
def preprocessing_fn(inputs):

    def tokenize_text(text, sequence_length=MAX_SEQ_LEN):
        ...
        return tf.reshape(tokens, [-1, sequence_length])

    def preprocess_bert_input(text, segment_id=0):
        input_word_ids = tokenize_text(text)
        ...
        return (
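The gist truncates both helpers. Judging by the transformed record shown at the top, preprocess_bert_input plausibly returns the word ids plus a mask and type ids, and preprocessing_fn maps them to the four output keys. A hedged sketch only; the "text" input key and the squeeze are assumptions about the raw feature shape:

def preprocessing_fn(inputs):
    ...  # nested helpers from the snippet above
    input_word_ids, input_mask, input_type_ids = preprocess_bert_input(
        tf.squeeze(inputs["text"], axis=1))  # "text" key is an assumption
    return {
        "input_word_ids": input_word_ids,
        "input_mask": input_mask,
        "input_type_ids": input_type_ids,
        "label": inputs["label"],
    }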
hanneshapke / adding_of_CLS_and_SEP_tokens.py (created March 9, 2020)
import tensorflow as tf
import tensorflow_text as text

# The tokenizer is created once at module level from the BERT vocab file.
bert_tokenizer = text.BertTokenizer(
    vocab_lookup_table=vocab_file_path,
    token_out_type=tf.int64,
    lower_case=do_lower_case
)

# Ids of the special [CLS] and [SEP] markers in the standard BERT vocab.
CLS_ID = tf.constant(101, dtype=tf.int64)
SEP_ID = tf.constant(102, dtype=tf.int64)

# Inside tokenize_text (whose `text` parameter shadows the module alias):
# tokenize, truncate to leave room for the two markers, then wrap every
# sequence as [CLS] tokens... [SEP].
tokens = bert_tokenizer.tokenize(text)
tokens = tokens[:, :sequence_length - 2]
start_tokens = tf.fill([tf.shape(text)[0], 1], CLS_ID)
end_tokens = tf.fill([tf.shape(text)[0], 1], SEP_ID)
tokens = tf.concat([start_tokens, tokens, end_tokens], axis=1)
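As a quick standalone check, not from the gists: the vocab path and input string below are hypothetical. BertTokenizer.tokenize returns a ragged [batch, words, wordpieces] tensor, so the word and wordpiece axes are typically merged before padding or truncation; the expected ids match the input_word_ids example at the top for a standard uncased vocab:

import tensorflow as tf
import tensorflow_text as tf_text

tokenizer = tf_text.BertTokenizer(
    vocab_lookup_table="vocab.txt",  # hypothetical local BERT vocab file
    token_out_type=tf.int64,
    lower_case=True,
)
tokens = tokenizer.tokenize(["this review contains spoilers"])
tokens = tokens.merge_dims(1, 2)  # [batch, words, pieces] -> [batch, pieces]
print(tokens.to_list())  # e.g. [[2023, 3319, 3397, 27594, 2545]]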