Using TFBertForSequenceClassification in a custom training loop
import pandas as pd
import tensorflow as tf

from collections import namedtuple
from typing import List

from transformers import (
    BertConfig,
    BertTokenizer,
    TFBertForSequenceClassification
)
EPOCHS = 3
BATCH_SIZE = 16
TO_FINETUNE = 'bert-base-cased'

# InputExample is just an intermediary construct to pair strings with their labels
InputExample = namedtuple('InputExample', ['text', 'category_index'])

# InputFeatures is just an intermediary construct to easily convert to a tf.data.Dataset
InputFeatures = namedtuple('InputFeatures', ['input_ids', 'attention_mask', 'token_type_ids', 'label'])
# pd.DataFrame with 'text' and 'category_index' columns as per your example;
# assuming values in 'text' are str and values in 'category_index' are int
df = pd.read_csv("foo.csv")

# Get the total number of distinct labels in the df
num_labels = df['category_index'].nunique()
num_examples = df.shape[0]  # row count

examples = []
for row in df.itertuples(index=False):
    examples.append(InputExample(text=row.text, category_index=row.category_index))
# num_labels sizes the classification head placed on top of the pretrained encoder
config = BertConfig.from_pretrained(TO_FINETUNE, num_labels=num_labels)
tokenizer = BertTokenizer.from_pretrained(TO_FINETUNE)
def convert_examples_to_tf_dataset(
    examples: List[InputExample],
    tokenizer: BertTokenizer,
    max_length: int = 512,
):
    """
    Loads data into a tf.data.Dataset for fine-tuning a given model.

    Args:
        examples: List of InputExample namedtuples holding the texts and their labels
        tokenizer: Instance of a tokenizer that will tokenize the examples
        max_length: Maximum sequence length in tokens; longer inputs are truncated

    Returns:
        a ``tf.data.Dataset`` containing the condensed features of the provided sentences
    """
    features = []  # -> will hold InputFeatures to be converted later

    for e in examples:
        # The documentation for encode_plus is thorough, so please take a look at it
        input_dict = tokenizer.encode_plus(
            e.text,
            add_special_tokens=True,
            max_length=max_length,  # truncates if len(s) > max_length
            return_token_type_ids=True,
            return_attention_mask=True,
            pad_to_max_length=True,  # pads to the right by default
            # note: pad_to_max_length is deprecated in transformers >= 3.0;
            # use padding='max_length' (and truncation=True) there instead
        )

        # input_ids = token indices in the tokenizer's internal dict
        # token_type_ids = binary mask identifying different sequences in the model
        # attention_mask = binary mask indicating the positions of padded tokens so the model does not attend to them
        input_ids, token_type_ids, attention_mask = (input_dict["input_ids"],
            input_dict["token_type_ids"], input_dict['attention_mask'])

        features.append(
            InputFeatures(
                input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, label=e.category_index
            )
        )
    def gen():
        for f in features:
            yield (
                {
                    "input_ids": f.input_ids,
                    "attention_mask": f.attention_mask,
                    "token_type_ids": f.token_type_ids,
                },
                f.label,
            )

    return tf.data.Dataset.from_generator(
        gen,
        ({"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32}, tf.int64),
        (
            {
                "input_ids": tf.TensorShape([None]),
                "attention_mask": tf.TensorShape([None]),
                "token_type_ids": tf.TensorShape([None]),
            },
            tf.TensorShape([]),
        ),
    )
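# For intuition, encode_plus returns three aligned lists per example. Illustrative
# sketch only (the word-piece ids are placeholders, not real vocabulary entries;
# 101/102/0 are the usual [CLS]/[SEP]/[PAD] ids in stock BERT vocabularies):
#   tokenizer.encode_plus("hello world", add_special_tokens=True, max_length=8,
#                         pad_to_max_length=True, return_token_type_ids=True,
#                         return_attention_mask=True)
#   -> {'input_ids':      [101, <id>, <id>, 102, 0, 0, 0, 0],
#       'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0],
#       'attention_mask': [1, 1, 1, 1, 0, 0, 0, 0]}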
# Make the CPU do all data pre-processing steps, not the GPU
with tf.device('/cpu:0'):
    train_data = convert_examples_to_tf_dataset(examples, tokenizer)
    train_data = train_data.shuffle(buffer_size=num_examples, reshuffle_each_iteration=True) \
                           .batch(BATCH_SIZE) \
                           .repeat(-1)  # repeat indefinitely; fit() stops via steps_per_epoch
# Reuse the config built above; re-creating it here without num_labels would
# silently give the model the default two-label classification head
model = TFBertForSequenceClassification.from_pretrained(TO_FINETUNE, config=config)
# train_data is then a tf.data.Dataset we can pass to model.fit()
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-05, epsilon=1e-08)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

model.compile(optimizer=optimizer,
              loss=loss,
              metrics=[metric])
train_steps = num_examples // BATCH_SIZE
history = model.fit(train_data,
                    epochs=EPOCHS,
                    steps_per_epoch=train_steps)
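After fit() returns, the fine-tuned model can be used directly for prediction. Below is a minimal sketch, reusing the tokenizer and model objects from above; the input sentence is a hypothetical placeholder:

# Minimal inference sketch (hypothetical input text; reuses `tokenizer` and `model`)
sample = tokenizer.encode_plus(
    "some new text to classify",
    add_special_tokens=True,
    max_length=512,
    pad_to_max_length=True,
    return_token_type_ids=True,
    return_attention_mask=True,
    return_tensors='tf',  # return tf.Tensors with a leading batch dimension of 1
)
outputs = model(sample)  # first element holds the classification logits
predicted_index = int(tf.argmax(outputs[0], axis=-1).numpy()[0])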
I am trying to understand your example of using BERT for sequence classification with a custom dataset.
My question is fairly simple: why is the `.shuffle(buffer_size=num_examples, reshuffle_each_iteration=True)` call needed?
If I don't shuffle the data, I get an error.
So, to me, it seems that shuffling is not only shuffling but also doing something else, and I don't exactly understand what.
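For context, here is a standalone sketch of what that shuffle/batch/repeat chain does, run on a toy integer dataset (toy values only, not the gist's data):

import tensorflow as tf

ds = tf.data.Dataset.range(10)
# shuffle() keeps a buffer of `buffer_size` elements and draws from it uniformly
# at random; reshuffle_each_iteration=True re-randomizes the order on every pass
ds = ds.shuffle(buffer_size=10, reshuffle_each_iteration=True) \
       .batch(4) \
       .repeat(2)
for batch in ds:
    print(batch.numpy())  # batches appear in a different order on each repeat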