@sids07
Created October 16, 2023 14:32
Training a causal language model on multiple GPUs with Hugging Face Accelerate and LoRA (PEFT)
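The script below builds an instruction-tuning dataset, wraps it in a completion-only data collator, and runs a LoRA fine-tune under an `Accelerator`. To actually use several GPUs it would typically be launched through the Accelerate CLI rather than plain `python`: run `accelerate config` once to describe the machine, then `accelerate launch <script>.py` (the filename is whatever this gist is saved as); the `Accelerator` created in `training()` then handles device placement and gradient synchronization across processes.
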
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    PreTrainedTokenizer,
    Trainer,
    TrainingArguments,
    set_seed,
    get_linear_schedule_with_warmup
)
import numpy as np
from datasets import Dataset
from typing import List, Any, Union, Dict
from accelerate import Accelerator, DistributedType
from torch.utils.data import DataLoader
import torch
import pandas as pd
from torch.optim import AdamW
from tqdm import tqdm
import math
from peft import LoraConfig, TaskType, get_peft_model

def get_open_orca_format(instruction, inp, res):
    if pd.isnull(instruction):
        return f"""<|im_start|>user
{inp.replace("###","").strip()}
<|im_end|>
<|im_start|>assistant
{res.strip()} """
    else:
        return f"""<|im_start|>system
{instruction.strip()}
<|im_end|>
<|im_start|>user
{inp.replace("###","").strip()}
<|im_end|>
<|im_start|>assistant
{res.strip()} """

def alpaca_format(instruction, inp, res):
    if pd.isnull(instruction):
        return f"""### Input:
{inp.replace("###","").strip()}
### Response:
{res.strip()} """
    else:
        return f"""### Instruction:
{instruction.strip()}
### Input:
{inp.replace("###","").strip()}
### Response:
{res.strip()} """

def prepare_datasets(train_file_path):
    df = pd.read_csv(train_file_path)
    # Note: the column is still named "openorca_text", but the prompts are built with the
    # Alpaca-style template so that they contain RESPONSE_KEY ("### Response:") set below.
    df["openorca_text"] = df.apply(lambda x: alpaca_format(x.Instruction, x.Input, x.Response), axis=1)
    dataset = Dataset.from_pandas(df)
    return dataset

class DataCollatorForCompletionOnlyLM(DataCollatorForLanguageModeling):
    def torch_call(self, examples):
        batch = super().torch_call(examples)
        response_token_ids = self.tokenizer.encode(RESPONSE_KEY)
        # drop the BOS token that the LLaMA-family tokenizer prepends in encode()
        response_token_ids = response_token_ids[1:]
        labels = batch["labels"].clone()
        for i in range(len(examples)):
            response_token_ids_start_idx = None
            for idx in np.where(batch["labels"][i] == response_token_ids[0])[0]:
                if np.array_equal(response_token_ids, batch["labels"][i, idx : idx + len(response_token_ids)]):
                    response_token_ids_start_idx = idx
                    break
            if response_token_ids_start_idx is None:
                raise RuntimeError("Could not find response key token IDs")
            response_token_ids_end_idx = response_token_ids_start_idx + len(response_token_ids)
            # Make pytorch loss function ignore all tokens up through the end of the response key
            labels[i, :response_token_ids_end_idx] = -100
        batch["labels"] = labels
        return batch
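
# Illustration (hypothetical example): if a collated sample decodes roughly to
#   "### Instruction: ... ### Input: ... ### Response: <answer> <pad> ..."
# the collator above locates the token IDs of RESPONSE_KEY ("### Response:") inside the
# labels and sets every label up to and including that key to -100, so cross-entropy is
# computed only on the answer tokens that follow it (padding positions are already -100
# courtesy of DataCollatorForLanguageModeling with mlm=False).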

def get_dataloader(accelerator: Accelerator):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token
    data_collator = DataCollatorForCompletionOnlyLM(
        tokenizer=tokenizer, mlm=False, return_tensors="pt", pad_to_multiple_of=8
    )

    def tokenize_function(example):
        return tokenizer(
            example["openorca_text"],
            max_length=max_length,
            truncation=True
        )

    dataset = prepare_datasets(train_file_path)
    with accelerator.main_process_first():
        tokenized_data = dataset.map(
            tokenize_function,
            batched=True,
            # remove_columns = ["Instruction","Input","Response","text","openorca_text"],
            remove_columns=["Instruction", "Input", "Response", "openorca_text"]
        )
    split_dataset = tokenized_data.train_test_split(
        test_size=test_size,
        seed=seed
    )
    train_dataloader = DataLoader(
        split_dataset["train"],
        shuffle=True,
        collate_fn=data_collator,
        batch_size=train_batch_size,
        drop_last=True
    )
    eval_dataloader = DataLoader(
        split_dataset["test"],
        shuffle=False,
        collate_fn=data_collator,
        batch_size=eval_batch_size,
        drop_last=(accelerator.mixed_precision == "fp8")
    )
    return train_dataloader, eval_dataloader

def training():
    accelerator = Accelerator(
        mixed_precision="fp16"
    )
    set_seed(seed)
    train_dataloader, eval_dataloader = get_dataloader(accelerator)
    peft_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        inference_mode=False,
        r=8,
        lora_alpha=16,
        lora_dropout=0.1,
        target_modules=[
            "q_proj",
            "k_proj",
            "v_proj",
            "o_proj",
            "gate_proj",
            "up_proj",
            "down_proj",
            "lm_head"
        ]
    )
    model = AutoModelForCausalLM.from_pretrained(model_name, use_cache=False)
    model = get_peft_model(model, peft_config)
    if accelerator.is_main_process:
        model.print_trainable_parameters()
    # model.to(accelerator.device)
    optimizer = AdamW(
        params=model.parameters(),
        lr=lr
    )
    lr_scheduler = get_linear_schedule_with_warmup(
        optimizer=optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=(len(train_dataloader) * num_epochs) // gradient_accumulation_steps
    )
    model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
    )
    accelerator.print("All initialization completed")
    torch.cuda.empty_cache()
    progress_bar = tqdm(total=math.ceil(len(train_dataloader) * num_epochs / gradient_accumulation_steps))
    for epoch in range(num_epochs):
        accelerator.print("Training starts from here:")
        model.train()
        total_train_loss = 0
        for step, batch in enumerate(train_dataloader):
            batch.to(accelerator.device)
            outputs = model(**batch)
            loss = outputs.loss
            # keep the unscaled loss for reporting; detach so the graph is not retained
            total_train_loss += loss.detach().float()
            loss = loss / gradient_accumulation_steps
            accelerator.backward(loss)
            # step the optimizer once every `gradient_accumulation_steps` micro-batches
            if (step + 1) % gradient_accumulation_steps == 0:
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()
                progress_bar.update(1)
        model.eval()
        total_eval_loss = 0
        for step, batch in enumerate(eval_dataloader):
            batch.to(accelerator.device)
            with torch.no_grad():
                outputs = model(**batch)
            eval_loss = outputs.loss
            total_eval_loss += eval_loss
        accelerator.print(
            f"Epoch: {epoch + 1}, Training Loss: {total_train_loss / len(train_dataloader)}, "
            f"Validation Loss: {total_eval_loss / len(eval_dataloader)}"
        )
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)

if __name__ == "__main__":
    # RESPONSE_KEY = "<|im_start|>assistant"
    RESPONSE_KEY = "### Response:"
    train_file_path = "main_4k.csv"
    # model_name = "Open-Orca/LlongOrca-13B-16k"
    model_name = "lmsys/vicuna-13b-v1.5-16k"
    test_size = 200
    seed = 42
    train_batch_size = 1
    eval_batch_size = 1
    max_length = 4800
    lr = 1e-5
    num_epochs = 3
    num_warmup_steps = 100
    gradient_accumulation_steps = 4
    output_dir = "models/"
    training()
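
After training, only the LoRA adapter weights are written to `output_dir`. A minimal sketch of loading them back for inference as a separate script, assuming the same `model_name` and the `models/` directory produced above (the prompt text is purely illustrative):

from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

base = AutoModelForCausalLM.from_pretrained(
    "lmsys/vicuna-13b-v1.5-16k", torch_dtype=torch.float16, device_map="auto"
)
# attach the adapter saved by unwrapped_model.save_pretrained(...) in the training script
model = PeftModel.from_pretrained(base, "models/")
model.eval()

tokenizer = AutoTokenizer.from_pretrained("lmsys/vicuna-13b-v1.5-16k")
# use the same Alpaca-style template the model was trained on
prompt = "### Input:\nWhat does this script train?\n### Response:\n"
inputs = tokenizer(prompt, return_tensors="pt").to(base.device)
with torch.no_grad():
    output_ids = model.generate(**inputs, max_new_tokens=128)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))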