@kusal1990
Created June 25, 2022 07:41
import torch
from transformers import T5ForConditionalGeneration, AdamW

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = T5ForConditionalGeneration.from_pretrained("allenai/unifiedqa-t5-base")
model.to(device)  # move the model to the GPU if one is available, otherwise keep it on CPU
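# The training loop below expects a `train_dataloader` whose batches are dicts with the
# keys 'input_ids', 'attention_mask', 'decoder_input_ids' and 'labels'. That data code is
# not part of this gist; the following is only a hypothetical sketch of how such a loader
# could be built with the UnifiedQA tokenizer (the `questions`/`answers` lists are
# placeholders for your own data).
from transformers import T5Tokenizer
from torch.utils.data import DataLoader

tokenizer = T5Tokenizer.from_pretrained("allenai/unifiedqa-t5-base")

def shift_right(ids, pad_id):
    # T5 uses the pad token as the decoder start token: prepend it and drop the last token
    shifted = ids.new_full(ids.shape, pad_id)
    shifted[:, 1:] = ids[:, :-1]
    return shifted

def build_examples(questions, answers, max_source_len=512, max_target_len=64):
    enc = tokenizer(questions, padding='max_length', truncation=True,
                    max_length=max_source_len, return_tensors='pt')
    dec = tokenizer(answers, padding='max_length', truncation=True,
                    max_length=max_target_len, return_tensors='pt')
    decoder_input_ids = shift_right(dec['input_ids'], tokenizer.pad_token_id)
    labels = dec['input_ids'].clone()
    labels[labels == tokenizer.pad_token_id] = -100  # ignore padding positions in the loss
    return [{'input_ids': enc['input_ids'][i],
             'attention_mask': enc['attention_mask'][i],
             'decoder_input_ids': decoder_input_ids[i],
             'labels': labels[i]} for i in range(len(questions))]

# Example usage (with your own question/answer strings):
# train_dataloader = DataLoader(build_examples(questions, answers), batch_size=8, shuffle=True)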
from transformers import get_linear_schedule_with_warmup
# Parameters:
lr = 1e-4
max_grad_norm = 1.0
num_training_steps = 1000
num_warmup_steps = 100
warmup_proportion = float(num_warmup_steps) / float(num_training_steps) # 0.1
### In Transformers, the optimizer and schedules are split and instantiated like this:
optimizer = AdamW(model.parameters(), lr=lr) # To reproduce BertAdam specific behavior set correct_bias=False
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps) # PyTorch scheduler
# Store our loss and accuracy for plotting
train_loss_set = []
epochs = 4
for epoch in range(epochs):
    # Training
    # Set our model to training mode (as opposed to evaluation mode)
    model.train()

    # Tracking variables
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    running_loss = 0

    # Train the data for one epoch
    for step, batch in enumerate(train_dataloader):
        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_decoder_ids, b_labels = (
            batch['input_ids'], batch['attention_mask'],
            batch['decoder_input_ids'], batch['labels'])

        # Cast everything to LongTensor and move each batch tensor to the device
        b_input_ids = b_input_ids.type(torch.LongTensor).to(device)
        b_input_mask = b_input_mask.type(torch.LongTensor).to(device)
        b_decoder_ids = b_decoder_ids.type(torch.LongTensor).to(device)
        b_labels = b_labels.type(torch.LongTensor).to(device)

        # Clear out the gradients (by default they accumulate)
        optimizer.zero_grad()

        # Forward pass
        outputs = model(input_ids=b_input_ids,
                        attention_mask=b_input_mask,
                        decoder_input_ids=b_decoder_ids,
                        labels=b_labels)
        loss, logits = outputs[:2]

        # Backward pass, gradient clipping and parameter/LR updates
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)  # Gradient clipping is not in AdamW anymore (so you can use amp without issue)
        optimizer.step()
        scheduler.step()

        # Update tracking variables
        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1
        train_loss_set.append(loss.item())
        running_loss += loss.item()

        # Report the average loss over the last 100 batches
        if step % 100 == 99:
            print(f'Epoch:{epoch} Batch:{step} Loss:{running_loss / 100:.4f}')
            running_loss = 0
print('Training finished.')
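# After training, a quick way to sanity-check the fine-tuned model (illustrative only:
# the question string is a placeholder, and `tokenizer` is the UnifiedQA tokenizer from
# the data-preparation sketch above; UnifiedQA expects lower-cased
# "question \n options" style inputs):
model.eval()
sample = "which is heavier? \n (a) a feather (b) a brick"
sample_inputs = tokenizer(sample, return_tensors='pt').to(device)
with torch.no_grad():
    generated_ids = model.generate(**sample_inputs, max_length=32)
print(tokenizer.decode(generated_ids[0], skip_special_tokens=True))

# Optionally persist the fine-tuned weights for later use:
# model.save_pretrained('unifiedqa-t5-base-finetuned')
# tokenizer.save_pretrained('unifiedqa-t5-base-finetuned')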