Training on multi-GPU with Hugging Face Accelerate and PEFT/LoRA
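The script below fine-tunes a causal language model with a LoRA adapter using Hugging Face Accelerate. For multi-GPU training it is presumably started through the Accelerate CLI, e.g. run `accelerate config` once to describe the machine and then `accelerate launch train.py` (the file name is an assumption; use whatever name the gist is saved as), which spawns one process per GPU and lets `Accelerator` handle device placement and gradient synchronization.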
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    set_seed,
    get_linear_schedule_with_warmup,
)
import numpy as np
from datasets import Dataset
from accelerate import Accelerator
from torch.utils.data import DataLoader
import torch
import pandas as pd
from torch.optim import AdamW
from tqdm import tqdm
import math
from peft import LoraConfig, TaskType, get_peft_model
def get_open_orca_format(instruction, inp, res):
    # ChatML-style prompt (OpenOrca); the system block is omitted when no instruction is given.
    if pd.isnull(instruction):
        return f"""<|im_start|>user
{inp.replace("###","").strip()}
<|im_end|>
<|im_start|>assistant
{res.strip()} """
    else:
        return f"""<|im_start|>system
{instruction.strip()}
<|im_end|>
<|im_start|>user
{inp.replace("###","").strip()}
<|im_end|>
<|im_start|>assistant
{res.strip()} """
def alpaca_format(instruction, inp, res):
    if pd.isnull(instruction):
        return f"""### Input:
{inp.replace("###","").strip()}
### Response:
{res.strip()} """
    else:
        return f"""### Instruction:
{instruction.strip()}
### Input:
{inp.replace("###","").strip()}
### Response:
{res.strip()} """
def prepare_datasets(train_file_path):
    df = pd.read_csv(train_file_path)
    # NOTE: the column is named "openorca_text" but the prompts use the Alpaca format,
    # which matches RESPONSE_KEY ("### Response:") set in __main__ below.
    df["openorca_text"] = df.apply(lambda x: alpaca_format(x.Instruction, x.Input, x.Response), axis=1)
    dataset = Dataset.from_pandas(df)
    return dataset
class DataCollatorForCompletionOnlyLM(DataCollatorForLanguageModeling):
    """Masks everything up to and including the response key so the loss is
    computed only on the completion (the assistant response)."""

    def torch_call(self, examples):
        batch = super().torch_call(examples)
        # Drop the leading special token (e.g. BOS) that the tokenizer prepends when encoding the key.
        response_token_ids = self.tokenizer.encode(RESPONSE_KEY)
        response_token_ids = response_token_ids[1:]
        labels = batch["labels"].clone()
        for i in range(len(examples)):
            response_token_ids_start_idx = None
            for idx in np.where(batch["labels"][i] == response_token_ids[0])[0]:
                if np.array_equal(response_token_ids, batch["labels"][i, idx : idx + len(response_token_ids)]):
                    response_token_ids_start_idx = idx
                    break
            if response_token_ids_start_idx is None:
                raise RuntimeError("Could not find response key token IDs")
            response_token_ids_end_idx = response_token_ids_start_idx + len(response_token_ids)
            # Make the PyTorch loss function ignore all tokens up through the end of the response key.
            labels[i, :response_token_ids_end_idx] = -100
        batch["labels"] = labels
        return batch
def get_dataloader(accelerate: Accelerator):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token
    data_collator = DataCollatorForCompletionOnlyLM(
        tokenizer=tokenizer, mlm=False, return_tensors="pt", pad_to_multiple_of=8
    )

    def tokenize_function(example):
        return tokenizer(
            example["openorca_text"],
            max_length=max_length,
            truncation=True)

    dataset = prepare_datasets(train_file_path)
    with accelerate.main_process_first():
        tokenized_data = dataset.map(
            tokenize_function,
            batched=True,
            # remove_columns = ["Instruction","Input","Response","text","openorca_text"],
            remove_columns=["Instruction", "Input", "Response", "openorca_text"]
        )
    split_dataset = tokenized_data.train_test_split(
        test_size=test_size,
        seed=seed
    )
    train_dataloader = DataLoader(
        split_dataset["train"],
        shuffle=True,
        collate_fn=data_collator,
        batch_size=train_batch_size,
        drop_last=True
    )
    eval_dataloader = DataLoader(
        split_dataset["test"],
        shuffle=False,
        collate_fn=data_collator,
        batch_size=eval_batch_size,
        drop_last=(accelerate.mixed_precision == "fp8")
    )
    return train_dataloader, eval_dataloader
def training():
    accelerator = Accelerator(
        mixed_precision="fp16"
    )
    set_seed(seed)
    train_dataloader, eval_dataloader = get_dataloader(accelerator)
    peft_config = LoraConfig(task_type=TaskType.CAUSAL_LM,
                             inference_mode=False,
                             r=8,
                             lora_alpha=16,
                             lora_dropout=0.1,
                             target_modules=[
                                 "q_proj",
                                 "k_proj",
                                 "v_proj",
                                 "o_proj",
                                 "gate_proj",
                                 "up_proj",
                                 "down_proj",
                                 "lm_head"
                             ]
                             )
    model = AutoModelForCausalLM.from_pretrained(model_name, use_cache=False)
    model = get_peft_model(model, peft_config)
    # print_trainable_parameters() prints and returns None, so call it only on the main process.
    if accelerator.is_main_process:
        model.print_trainable_parameters()
    # model.to(accelerator.device)
    optimizer = AdamW(
        params=model.parameters(),
        lr=lr
    )
    lr_scheduler = get_linear_schedule_with_warmup(
        optimizer=optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=(len(train_dataloader) * num_epochs) // gradient_accumulation_steps
    )
    # prepare() wraps the model for distributed training and shards the dataloaders
    # so each process sees a different slice of the data.
    model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
    )
print("All initialization Completed") | |
torch.cuda.empty_cache() | |
progress_bar = tqdm(math.ceil((len(train_dataloader)* num_epochs) // gradient_accumulation_steps)) | |
    for epoch in range(num_epochs):
        accelerator.print("Training starts from here:")
        model.train()
        total_train_loss = 0
        for step, batch in enumerate(train_dataloader):
            batch.to(accelerator.device)
            outputs = model(**batch)
            loss = outputs.loss
            # Track the unscaled loss for logging; detach so the graph is not kept alive.
            total_train_loss += loss.detach().float()
            # Scale the loss for gradient accumulation.
            loss = loss / gradient_accumulation_steps
            accelerator.backward(loss)
            if (step + 1) % gradient_accumulation_steps == 0:
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()
                progress_bar.update(1)
        model.eval()
        total_eval_loss = 0
        for step, batch in enumerate(eval_dataloader):
            batch.to(accelerator.device)
            with torch.no_grad():
                outputs = model(**batch)
            eval_loss = outputs.loss
            total_eval_loss += eval_loss.detach().float()
        accelerator.print(f"Epochs : {epoch+1}, Training Loss: {total_train_loss / len(train_dataloader)}, Validation Loss: {total_eval_loss / len(eval_dataloader)}")
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
if __name__ == "__main__":
    # RESPONSE_KEY = "<|im_start|>assistant"
    RESPONSE_KEY = "### Response:"
    train_file_path = "main_4k.csv"
    # model_name = "Open-Orca/LlongOrca-13B-16k"
    model_name = "lmsys/vicuna-13b-v1.5-16k"
    test_size = 200
    seed = 42
    train_batch_size = 1
    eval_batch_size = 1
    max_length = 4800
    lr = 1e-5
    num_epochs = 3
    num_warmup_steps = 100
    gradient_accumulation_steps = 4
    output_dir = "models/"
    training()
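Note that the object passed to save_pretrained is a PeftModel, so only the LoRA adapter weights are written to models/; the base model and the tokenizer are not saved by this script. A minimal sketch for loading the adapter back onto the base model at inference time might look like the following (this snippet is not part of the gist; the paths simply mirror model_name and output_dir above):

import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

# Assumed paths: the base checkpoint used for training and the adapter directory saved above.
base = AutoModelForCausalLM.from_pretrained("lmsys/vicuna-13b-v1.5-16k", torch_dtype=torch.float16)
model = PeftModel.from_pretrained(base, "models/")
tokenizer = AutoTokenizer.from_pretrained("lmsys/vicuna-13b-v1.5-16k")
tokenizer.pad_token = tokenizer.eos_token
model.eval()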