-
-
Save jiahao87/50cec29725824da7ff6dd9314b53c4b3 to your computer and use it in GitHub Desktop.
"""Script for fine-tuning Pegasus | |
Example usage:
    # use XSum dataset as example, with first 1000 docs as training data
    from datasets import load_dataset
    dataset = load_dataset("xsum")
    train_texts, train_labels = dataset['train']['document'][:1000], dataset['train']['summary'][:1000]
    # use Pegasus Large model as base for fine-tuning
    model_name = 'google/pegasus-large'
    train_dataset, _, _, tokenizer = prepare_data(model_name, train_texts, train_labels)
    trainer = prepare_fine_tuning(model_name, tokenizer, train_dataset)
    trainer.train()
Reference:
    https://huggingface.co/transformers/master/custom_datasets.html
""" | |
from transformers import PegasusForConditionalGeneration, PegasusTokenizer, Trainer, TrainingArguments | |
import torch | |
class PegasusDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenized source texts with tokenized targets.

    Args:
        encodings: Mapping from feature name to a per-example sequence
            (iterated with ``.items()`` and indexed by example position).
            In this file it is the tokenizer output for the source texts.
        labels: Mapping that must contain an ``'input_ids'`` entry holding
            one token-id sequence per example. In this file it is the
            tokenizer output for the target summaries.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, including ``'labels'``."""
        # Every encoder-side feature is converted to a tensor as-is.
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Only the target token ids are used as labels; other target-side
        # features (e.g. an attention mask) are ignored.
        item['labels'] = torch.tensor(self.labels['input_ids'][idx])
        return item

    def __len__(self):
        """Return the number of examples, taken from the label token ids."""
        return len(self.labels['input_ids'])
def prepare_data(model_name,
                 train_texts, train_labels,
                 val_texts=None, val_labels=None,
                 test_texts=None, test_labels=None):
    """Tokenize the text/summary pairs and wrap them as ``PegasusDataset`` objects.

    Args:
        model_name: Name or path passed to ``PegasusTokenizer.from_pretrained``.
        train_texts: Source documents for training.
        train_labels: Target summaries for training.
        val_texts: Optional source documents for validation.
        val_labels: Optional target summaries for validation.
        test_texts: Optional source documents for testing.
        test_labels: Optional target summaries for testing.

    Returns:
        A tuple ``(train_dataset, val_dataset, test_dataset, tokenizer)``.
        ``val_dataset`` / ``test_dataset`` are ``None`` unless BOTH the texts
        and the labels of that split were supplied.
    """
    tokenizer = PegasusTokenizer.from_pretrained(model_name)

    # A split is only built when its texts and labels are both present.
    prepare_val = val_texts is not None and val_labels is not None
    prepare_test = test_texts is not None and test_labels is not None

    def tokenize_data(texts, labels):
        # Sources and targets are tokenized separately with the same settings.
        # NOTE(review): padded label positions keep the pad token id, so they
        # presumably contribute to the training loss; masking them (e.g. with
        # -100) may be preferable - confirm against the installed
        # transformers version before changing.
        source_encodings = tokenizer(texts, truncation=True, padding=True)
        target_encodings = tokenizer(labels, truncation=True, padding=True)
        return PegasusDataset(source_encodings, target_encodings)

    train_dataset = tokenize_data(train_texts, train_labels)
    val_dataset = tokenize_data(val_texts, val_labels) if prepare_val else None
    test_dataset = tokenize_data(test_texts, test_labels) if prepare_test else None

    return train_dataset, val_dataset, test_dataset, tokenizer
def prepare_fine_tuning(model_name, tokenizer, train_dataset, val_dataset=None, freeze_encoder=False, output_dir='./results'):
    """Load the base model and build a ``Trainer`` configured for fine-tuning.

    Args:
        model_name: Name or path passed to
            ``PegasusForConditionalGeneration.from_pretrained``.
        tokenizer: Tokenizer handed to the ``Trainer``.
        train_dataset: Dataset used for training.
        val_dataset: Optional evaluation dataset. When given, evaluation is
            run every 100 steps during training.
        freeze_encoder: If true, gradients are disabled for every parameter
            of ``model.model.encoder``.
        output_dir: Directory passed to ``TrainingArguments`` as ``output_dir``.

    Returns:
        The configured ``Trainer``; call ``.train()`` on it to start training.
    """
    torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device)

    if freeze_encoder:
        for param in model.model.encoder.parameters():
            param.requires_grad = False

    # Settings shared by the with-validation and without-validation setups.
    training_kwargs = dict(
        output_dir=output_dir,           # output directory
        num_train_epochs=2000,           # total number of training epochs
        per_device_train_batch_size=1,   # batch size per device during training, can increase if memory allows
        save_steps=500,                  # number of updates steps before checkpoint saves
        save_total_limit=5,              # limit the total amount of checkpoints and deletes the older checkpoints
        warmup_steps=500,                # number of warmup steps for learning rate scheduler
        weight_decay=0.01,               # strength of weight decay
        logging_dir='./logs',            # directory for storing logs
        logging_steps=10,
    )

    # Evaluation settings are only added when there is something to evaluate on.
    if val_dataset is not None:
        training_kwargs.update(
            per_device_eval_batch_size=1,   # batch size for evaluation, can increase if memory allows
            evaluation_strategy='steps',    # evaluation strategy to adopt during training
            eval_steps=100,                 # number of update steps before evaluation
        )

    training_args = TrainingArguments(**training_kwargs)

    # eval_dataset=None is the Trainer default, so a single construction
    # covers both the with- and without-validation cases.
    trainer = Trainer(
        model=model,                     # the instantiated Transformers model to be trained
        args=training_args,              # training arguments, defined above
        train_dataset=train_dataset,     # training dataset
        eval_dataset=val_dataset,        # evaluation dataset (may be None)
        tokenizer=tokenizer,
    )

    return trainer
if __name__ == '__main__':
    # use XSum dataset as example, with first 1000 docs as training data
    from datasets import load_dataset
    dataset = load_dataset("xsum")
    # Slice the rows first so only the first 1000 examples are materialized,
    # instead of loading each full column and slicing afterwards.
    train_subset = dataset['train'][:1000]
    train_texts, train_labels = train_subset['document'], train_subset['summary']

    # use Pegasus Large model as base for fine-tuning
    model_name = 'google/pegasus-large'
    train_dataset, _, _, tokenizer = prepare_data(model_name, train_texts, train_labels)
    trainer = prepare_fine_tuning(model_name, tokenizer, train_dataset)
    trainer.train()
Hey, I have been trying to fine-tune PEGASUS-large in Google Colab (basic tier with 12 GB RAM), but it crashes every time. Can anybody help me with this issue and suggest the best arguments so that it can run on Colab?
Thanks :)
Hey @Darshan2104 ,
I have faced this problem before; unfortunately, you won't be able to fine-tune Pegasus using basic Colab. You will need to subscribe to Colab Pro, as the computational power needed for fine-tuning Pegasus is quite large. Also make sure to use a GPU with 16280 MB of memory.
@Darshan2104
freeze the encoder as shown below
def prepare_fine_tuning(model_name, tokenizer, train_dataset, val_dataset=None, freeze_encoder=True, output_dir='results')
Also reduce the batch size to 2/4/8, and try reducing the training data to the first 1000 rows.
I was able to fine-tune PEGASUS-large by doing the things mentioned above.
Hello
Do you have a separate code for the encoder-decoder model used for the pegasus model?
Hi
Thank you for this easy to understand fine-tuning script. I am fine-tuning pegasus-wikihow on Google Colab with 1000 examples of a custom dataset and device batch size =2.
I was wondering whether someone experimented with fp16 training parameter to train faster. Please let me know in case it worked for you.
Thank you for the script.
Here are a few concerns I am facing while fine-tuning:
- The fine-tuning consumes too much disk space (I ran it on Kaggle); the output directory filled up after only 8 epochs with 400 training samples.
- Additionally, I can't see where the model is saved; instead, all the space is taken up by checkpoints.
- Lastly, do you have code for the inference/validation part?
I do not understand where is the saved model. I specified
Hi @jiahao87, I have been trying to run your script in a notebook instance on AWS SageMaker which has 8 GPUs, each with 12 GB of memory. Every time I try to run your script with absolutely no changes, I get the following error:
RuntimeError: CUDA out of memory. Tried to allocate 64.00 MiB (GPU 0; 11.17 GiB total capacity; 10.49 GiB already allocated; 61.44 MiB free; 10.65 GiB reserved in total by PyTorch)
Could you please help?
Hi @rishav2416, fine-tuning the full Pegasus large model is indeed resource intensive. I was only able to run the fine-tuning on Colab (GPU with 12 GB RAM) when I froze the encoder (see line below). Which notebook instance type are you using? You may wish to experiment with other instance types.
trainer = prepare_fine_tuning(model_name, tokenizer, train_dataset, freeze_encoder=True)
If you do this freeze thing, does this decrease the performance? And would this also freeze the input embedding?
Also, you should add trainer.save_model("output_dir"). And these checkpoints do use a lot of space.
The trainer.save_model()
gave me the below cuda error.
RuntimeError: CUDA error: device-side assert triggered CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect. For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
AOA. I am an MSCS student working on a thesis on abstractive text summarization with the PubMed dataset. Could someone please point me to a platform that offers unlimited training time or high memory? I tried Kaggle notebooks and Colab, but they do not fulfill my requirements. Please suggest a platform that is free or low-budget.
Hi @jiahao87,
I tried your code with 1000 rows from cnn_dailymail, but on every try I keep getting a very high validation loss that goes down for the first few epochs and then starts growing again. Moreover, the improvement in the results is not as remarkable as expected from a decent fine-tuning.
WRT the parameters, I tried yours and all combinations that could be compatible with Kaggle Notebooks/Colab Free limitations. Encoder is frozen.
Do you have some advice to reduce loss and improve results?
@keloemma, I don't think so. Labels are needed since we are doing supervised training, be it manually created labels or auto-generated labels.