import os
import time
import argparse
start = time.time()
os.system("nvidia-smi")
# import libraries
import torch
import transformers
from datasets import Dataset
from trl import SFTTrainer
import pandas as pd
def training_function(args):
    lr = args.lr
    num_epochs = args.num_epochs
    seed = args.seed
    transformers.set_seed(seed)
    # print GPU available memory
    free_in_GB = int(torch.cuda.mem_get_info()[0] / 1024**3)
    max_memory = f"{free_in_GB-2}GB"
    n_gpus = torch.cuda.device_count()
    max_memory = {i: max_memory for i in range(n_gpus)}
    print("Max VRAM (GB): ", max_memory)
    pandas_dataset_stable_diffusion = pd.read_csv(args.dataset)
    pandas_dataset_stable_diffusion = pandas_dataset_stable_diffusion.sample(frac=1)
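    # The shuffled dataframe is split by position: the first 140 rows become the
    # training set and the remainder the eval set (the 140-row cutoff is
    # hard-coded for this particular CSV).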
    dataset_stable_diffusion_train = Dataset.from_pandas(
        pandas_dataset_stable_diffusion.iloc[0:140, :]
    )
    # remove old text cols
    dataset_stable_diffusion_train = dataset_stable_diffusion_train.remove_columns(
        [
            col
            for col in dataset_stable_diffusion_train.column_names
            if col not in ["prompt", "response"]
        ]
    )
    print("Print an example in the train dataset:")
    print(dataset_stable_diffusion_train)
    print(dataset_stable_diffusion_train[0])
    print("Final train dataset:")
    train_dataset = dataset_stable_diffusion_train.shuffle(seed=43)
    print(train_dataset)
    print(train_dataset[0])
    print(train_dataset[-1])
    dataset_stable_diffusion_eval = Dataset.from_pandas(pandas_dataset_stable_diffusion.iloc[140:, :])
    # remove old text cols
    dataset_stable_diffusion_eval = dataset_stable_diffusion_eval.remove_columns(
        [
            col
            for col in dataset_stable_diffusion_eval.column_names
            if col not in ["prompt", "response"]
        ]
    )
    print("Print an example in the eval dataset:")
    print(dataset_stable_diffusion_eval)
    print(dataset_stable_diffusion_eval[0])
    print("Final eval dataset:")
    eval_dataset = dataset_stable_diffusion_eval.shuffle(seed=43)
    print(eval_dataset)
    print(eval_dataset[0])
    print(eval_dataset[-1])
    # let's now write a function to format the dataset for instruction fine-tuning
    def formatting_prompts_func(dataset):
        instructions = []
        for i in range(len(dataset["prompt"])):
            text = f"{dataset['prompt'][i]}\n{dataset['response'][i]}"
            instructions.append(text)
        return instructions
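    # For illustration only (the row below is invented, not taken from args.dataset):
    # given a record like {"prompt": "a castle on a hill", "response": "a castle on
    # a hill, dramatic lighting, highly detailed"}, formatting_prompts_func returns
    # the single training string
    # "a castle on a hill\na castle on a hill, dramatic lighting, highly detailed".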
"""
## Loading the model
In this section we will load the [MPT-7B model](https://huggingface.co/mosaicml/mpt-7b).
"""
# load assets
model_id = args.model
print(type(model_id))
# mpt tokenizer load
tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)
# set mpt tokenizer padding token to eos token
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id
print(f"{model_id} tokenizer eos_token: ", tokenizer.eos_token)
print(f"{model_id} tokenizer pad_token: ", tokenizer.pad_token)
print(f"{model_id} tokenizer model_max_length: ", tokenizer.model_max_length)
model = transformers.AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16,
trust_remote_code=True,
device_map="auto",
)
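    # device_map="auto" lets accelerate place the model across the available GPUs
    # (and CPU if needed); trust_remote_code=True is required because the MPT model
    # class ships in the model repository rather than in transformers itself.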
    free_in_GB = int(torch.cuda.mem_get_info()[0] / 1024**3)
    max_memory = f"{free_in_GB-2}GB"
    n_gpus = torch.cuda.device_count()
    max_memory = {i: max_memory for i in range(n_gpus)}
    print("Max VRAM (GB): ", max_memory)
"""
## Loading the trainer
Here we will use the [`SFTTrainer` from TRL library](https://huggingface.co/docs/trl/main/en/sft_trainer) that gives a wrapper around transformers `Trainer` to easily fine-tune models on instruction based datasets. Let's first load the training arguments below.
from transformers import TrainingArguments
# see https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments
"""
output_dir = "./results"
num_train_epochs = num_epochs
auto_find_batch_size = False
per_device_train_batch_size=1
per_device_eval_batch_size=1
gradient_accumulation_steps = 1
save_strategy = "epoch"
learning_rate = lr
lr_scheduler_type = "linear"
warmup_ratio = 0.03
logging_strategy = "steps"
logging_steps = 50
do_eval = True
evaluation_strategy = "steps"
prediction_loss_only = True
eval_steps = 0.2
    training_arguments = transformers.TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=num_train_epochs,
        auto_find_batch_size=auto_find_batch_size,
        per_device_train_batch_size=per_device_train_batch_size,
        per_device_eval_batch_size=per_device_eval_batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        save_strategy=save_strategy,
        learning_rate=learning_rate,
        lr_scheduler_type=lr_scheduler_type,
        warmup_ratio=warmup_ratio,
        logging_strategy=logging_strategy,
        logging_steps=logging_steps,
        do_eval=do_eval,
        evaluation_strategy=evaluation_strategy,
        prediction_loss_only=prediction_loss_only,
        eval_steps=eval_steps,
    )
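    # With the values above, the effective batch size per GPU is
    # per_device_train_batch_size * gradient_accumulation_steps, i.e. 1.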
"""
Then finally pass everything to the trainer
"""
max_seq_length = tokenizer.model_max_length
trainer = SFTTrainer(
model=model,
train_dataset=train_dataset,
eval_dataset=eval_dataset,
formatting_func=formatting_prompts_func,
max_seq_length=max_seq_length,
tokenizer=tokenizer,
args=training_arguments,
)
"""
## Train the model
Now let's train the model! Simply call `trainer.train()`
"""
trainer.train()
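    # Checkpoints are written to output_dir according to save_strategy="epoch".
    # If you also want the final weights and tokenizer saved explicitly (not part
    # of the original script), something like the following would work:
    #   trainer.save_model(output_dir)
    #   tokenizer.save_pretrained(output_dir)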
    # finished: print GPU available memory and total time
    free_in_GB = int(torch.cuda.mem_get_info()[0] / 1024**3)
    max_memory = f"{free_in_GB-2}GB"
    n_gpus = torch.cuda.device_count()
    max_memory = {i: max_memory for i in range(n_gpus)}
    print("Max VRAM (GB): ", max_memory)
    end = time.time()
    print("Total time (sec): ", end - start)
def main():
    parser = argparse.ArgumentParser(
        description="Simple example of a single-GPU training script."
    )
    parser.add_argument(
        "--model",
        type=str,
        help="Path to a model folder or a Hugging Face repository ID",
    )
    parser.add_argument(
        "--dataset",
        type=str,
        help="Path to the dataset CSV file",
    )
    parser.add_argument(
        "--lr",
        type=float,
        default=2e-5,
        help="Learning rate for training.",
    )
    parser.add_argument(
        "--num_epochs",
        type=int,
        default=1,
        help="Number of training epochs.",
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=43,
        help="Random seed.",
    )
    args = parser.parse_args()
    print(f"Training args: {args}")
    training_function(args)
if __name__ == '__main__':
    main()
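# Example invocation (the script file name and dataset path below are placeholders):
#   python sft_train.py --model mosaicml/mpt-7b --dataset prompts.csv --lr 2e-5 --num_epochs 1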