Fine-tuning T5 with Hugging Face

Recently, I had to fine-tune a T5 model using Hugging Face's libraries.

Unfortunately, there was a lot of outdated information and many conflicting examples online.

If you just want to get going quickly, you can:

  1. Copy this code
  2. Swap in your own dataframe (ensure it has "source_text" and "target_text" columns, or adjust the parts of the code that depend on them; see the sketch just after this list)
  3. Update the "output_directory" to your desired destination
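
For step 2, here's a minimal sketch of one way to swap in your own data. It assumes a hypothetical data.csv with "input" and "output" columns, which get renamed to the expected names:

import pandas as pd

df = pd.read_csv("data.csv")  # hypothetical file; replace with your own source
df = df.rename(columns={"input": "source_text", "output": "target_text"})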

And that should do it. Feedback welcome!

# 1. Install dependencies
# ------------------------------------------------------------------------------
# pip install datasets pandas transformers
# Depending on your environment, you may also need torch and sentencepiece
# (the T5 tokenizer relies on sentencepiece):
# pip install torch sentencepiece
# 2. Import libraries and modules
# ------------------------------------------------------------------------------
from datasets import Dataset, DatasetDict
import pandas as pd
from transformers import T5Tokenizer, T5ForConditionalGeneration, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
# 3. Set model, tokenizer, and data_collator variables
# ------------------------------------------------------------------------------
# See options besides t5-base here:
# https://huggingface.co/docs/transformers/model_doc/t5
# ------------------------------------------------------------------------------
tokenizer = T5Tokenizer.from_pretrained("t5-base")
model = T5ForConditionalGeneration.from_pretrained("t5-base")
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
# 4. Get data and divide into train, eval, and test sets
# ------------------------------------------------------------------------------
# Replace the dataframe with your own data. Ensure you use "source_text" and
# "target_text" as column names or you'll need to change the code below.
# Also note that we use 80% of the data for training, 10% for evaluation,
# and 10% for testing. You can modify this as well.
# ------------------------------------------------------------------------------
df = pd.DataFrame({"source_text": [], "target_text": []})
train_df = df.sample(frac=0.8)
eval_df = df.drop(train_df.index).sample(frac=0.5)
test_df = df.drop(train_df.index).drop(eval_df.index)
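# (Optional) For a reproducible split, you can pass a seed to sample(), e.g.
# df.sample(frac=0.8, random_state=42).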
# 5. Create a dataset dict from the dataframes
# ------------------------------------------------------------------------------
dataset = DatasetDict({
    "train": Dataset.from_pandas(train_df),
    "eval": Dataset.from_pandas(eval_df),
    "test": Dataset.from_pandas(test_df),
})
# 6. Tokenize the dataset
# ------------------------------------------------------------------------------
# You can change the max_length to whatever makes sense for your data.
# ------------------------------------------------------------------------------
def tokenize(source_texts, target_texts):
    model_inputs = tokenizer(text=source_texts, max_length=512, truncation=True)
    labels = tokenizer(text_target=target_texts, max_length=512, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs
tokenized_dataset = dataset.map(tokenize, input_columns=["source_text", "target_text"], remove_columns=["source_text", "target_text"])
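# (Optional) Tokenization is typically faster in batches; since the tokenizer
# also accepts lists of strings, passing batched=True to map() should work too:
# tokenized_dataset = dataset.map(tokenize, batched=True, input_columns=["source_text", "target_text"], remove_columns=["source_text", "target_text"])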
# 7. Set training arguments
# ------------------------------------------------------------------------------
# Change "output_directory" to your desired output directory. You can also change the
# batch_size, learning_rate, num_train_epochs and other parameters here. See the
# documentation for more details:
# https://huggingface.co/docs/transformers/v4.21.3/en/main_classes/trainer#transformers.TrainingArguments
# ------------------------------------------------------------------------------
training_arguments = Seq2SeqTrainingArguments(
    "output_directory",
    learning_rate=0.0001,
    weight_decay=0.01,
    fp16=True,  # fp16 mixed precision generally requires a CUDA GPU; set to False on CPU
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,
    num_train_epochs=20,
    evaluation_strategy="epoch",
    report_to="all"
)
# 8. Create a trainer
# ------------------------------------------------------------------------------
trainer = Seq2SeqTrainer(
    model,
    training_arguments,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["eval"],
    data_collator=data_collator,
    tokenizer=tokenizer
)
# 9. Train the model
# ------------------------------------------------------------------------------
trainer.train()
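# (Optional) You can also score the held-out test split after training. A minimal
# sketch, using the trainer's built-in loss-based evaluation:
# test_metrics = trainer.evaluate(tokenized_dataset["test"], metric_key_prefix="test")
# print(test_metrics)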
# 10. Save the tokenizer and model
# ------------------------------------------------------------------------------
# Change "output_directory" to your desired output directory. Note that you can
# also run evaluation before or after saving the tokenizer and model.
# ------------------------------------------------------------------------------
tokenizer.save_pretrained("output_directory")
model.save_pretrained("output_directory")
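# 11. (Optional) Load the saved model and generate a prediction
# ------------------------------------------------------------------------------
# A minimal sketch of inference with the fine-tuned model. The placeholder input
# below is an assumption; use a real example formatted like your "source_text"
# column.
# ------------------------------------------------------------------------------
tokenizer = T5Tokenizer.from_pretrained("output_directory")
model = T5ForConditionalGeneration.from_pretrained("output_directory")
inputs = tokenizer("your source text here", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=128)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))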