@papapabi
Created April 2, 2020 05:08
Using TFBertForSequenceClassification in a custom training loop
import pandas as pd
import tensorflow as tf
from collections import namedtuple
from typing import List, Tuple
from transformers import (
    BertConfig,
    BertTokenizer,
    TFBertForSequenceClassification
)
EPOCHS = 3
BATCH_SIZE = 16
TO_FINETUNE = 'bert-base-cased'
# InputExample is just an intermediary construct to pair strings with their labels
InputExample = namedtuple('InputExample', ['text', 'category_index'])
# InputFeatures is just an intermediary construct to easily convert to a tf.data.Dataset
InputFeatures = namedtuple('InputFeatures', ['input_ids', 'attention_mask', 'token_type_ids', 'label'])
# pd.DataFrame with 'text' and 'category_index' columns as per your example
# i'm assuming values in text are str, category_index are int
df = pd.read_csv("foo.csv")
# Get total number of labels in the df
num_labels = df['category_index'].nunique()
num_examples = df.shape[0] # row count
examples = []
for row in df.itertuples(index=False):
    examples.append(InputExample(text=row.text, category_index=row.category_index))
config = BertConfig.from_pretrained(TO_FINETUNE, num_labels=num_labels)
tokenizer = BertTokenizer.from_pretrained(TO_FINETUNE)
def convert_examples_to_tf_dataset(
    examples: List[Tuple[str, int]],
    tokenizer,
    max_length=512,
):
    """
    Loads data into a tf.data.Dataset for finetuning a given model.

    Args:
        examples: List of InputExample tuples representing the examples to be fed
        tokenizer: Instance of a tokenizer that will tokenize the examples
        max_length: Maximum sequence length in tokens; longer inputs are truncated

    Returns:
        a ``tf.data.Dataset`` containing the features of the provided sentences
    """
    features = []  # -> will hold InputFeatures to be converted later
    for e in examples:
        # Documentation is really strong for this method, so please take a look at it
        input_dict = tokenizer.encode_plus(
            e.text,
            add_special_tokens=True,
            max_length=max_length,  # truncates if len(s) > max_length
            return_token_type_ids=True,
            return_attention_mask=True,
            pad_to_max_length=True,  # pads to the right by default
        )
        # input_ids = token indices in the tokenizer's internal dict
        # token_type_ids = binary mask identifying the different sequences in the model
        # attention_mask = binary mask indicating the positions of padded tokens so the model does not attend to them
        input_ids, token_type_ids, attention_mask = (
            input_dict["input_ids"],
            input_dict["token_type_ids"],
            input_dict["attention_mask"],
        )
        features.append(
            InputFeatures(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                label=e.category_index,
            )
        )

    def gen():
        for f in features:
            yield (
                {
                    "input_ids": f.input_ids,
                    "attention_mask": f.attention_mask,
                    "token_type_ids": f.token_type_ids,
                },
                f.label,
            )

    return tf.data.Dataset.from_generator(
        gen,
        ({"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32}, tf.int64),
        (
            {
                "input_ids": tf.TensorShape([None]),
                "attention_mask": tf.TensorShape([None]),
                "token_type_ids": tf.TensorShape([None]),
            },
            tf.TensorShape([]),
        ),
    )
# Make the CPU do all data pre-processing steps, not the GPU
with tf.device('/cpu:0'):
    train_data = convert_examples_to_tf_dataset(examples, tokenizer)
    train_data = train_data.shuffle(buffer_size=num_examples, reshuffle_each_iteration=True) \
        .batch(BATCH_SIZE) \
        .repeat(-1)
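# Note on the chain above: shuffle() only randomizes example order (buffer_size=num_examples gives
# a full shuffle each epoch), batch() adds the leading batch dimension the model expects, and
# repeat(-1) makes the dataset yield batches indefinitely, which is why model.fit() below is given
# an explicit steps_per_epoch.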
# Reuse the config created above, which already carries num_labels, when loading the model
model = TFBertForSequenceClassification.from_pretrained(TO_FINETUNE, config=config)
# train_data is then a tf.data.Dataset we can pass to model.fit()
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-05, epsilon=1e-08)
# from_logits=True because TFBertForSequenceClassification outputs raw logits, not probabilities
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model.compile(optimizer=optimizer,
              loss=loss,
              metrics=[metric])
train_steps = num_examples // BATCH_SIZE
history = model.fit(train_data,
                    epochs=EPOCHS,
                    steps_per_epoch=train_steps)
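# A minimal inference sketch, assuming the training above has finished. The return type of
# model(...) varies across transformers versions (a plain tuple of outputs in the 2.x releases
# this gist was written against), so the [0] indexing below is an assumption.
sample_text = "an example sentence to classify"  # hypothetical input
inputs = tokenizer.encode_plus(
    sample_text,
    add_special_tokens=True,
    max_length=512,
    pad_to_max_length=True,
    return_tensors="tf",
)
logits = model(inputs)[0]  # shape (1, num_labels), raw scores
predicted_category = int(tf.argmax(logits, axis=-1).numpy()[0])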
@marquesarthur

I am trying to understand your example of using BERT for sequence classification on a custom dataset.
My question is fairly simple: what is the need for:

train_data = train_data.shuffle(buffer_size=num_examples, reshuffle_each_iteration=True) \
        .batch(BATCH_SIZE) \
        .repeat(-1)

If I don't shuffle the data, I get an error:

ValueError: slice index 0 of dimension 0 out of bounds.
for '{{node strided_slice}} = StridedSlice[Index=DT_INT32, T=DT_INT32, begin_mask=0,
 ellipsis_mask=0, end_mask=0, new_axis_mask=0, shrink_axis_mask=1](Shape, strided_slice/stack, strided_slice/stack_1, strided_slice/stack_2)' 
with input shapes: [0], [1], [1], [1] and with computed input tensors: 
input[1] = <0>, input[2] = <1>, input[3] = <1>.

So it seems to me that this chain is doing more than just shuffling, and I don't exactly understand what.
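
A likely explanation (hedged, since the full traceback isn't shown): the essential call in that chain is probably .batch(BATCH_SIZE) rather than .shuffle(). Without batching, the dataset yields one unbatched example at a time (input_ids of shape [seq_len] and a scalar label), and the code that slices out a batch dimension fails with exactly this strided_slice error. shuffle() only randomizes example order, and repeat(-1) loops the data forever, which is why steps_per_epoch is passed to model.fit(). A minimal sketch that should still train without shuffling, assuming the rest of the gist above:

    train_data = convert_examples_to_tf_dataset(examples, tokenizer).batch(BATCH_SIZE)
    history = model.fit(train_data, epochs=EPOCHS)  # no steps_per_epoch needed without repeat()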
