@eusip
Last active February 16, 2024 20:03
PEFT training
import os

import transformers
import argparse
import numpy as np
import pandas as pd
from huggingface_hub import HfFolder
import evaluate
from datasets import load_dataset, Dataset, load_metric, concatenate_datasets, DatasetDict
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import DataCollatorForLanguageModeling, TrainingArguments, Trainer
from peft import get_peft_config, get_peft_model, LoraConfig, TaskType

os.environ["TOKENIZERS_PARALLELISM"] = "false"
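
# Example invocation (hypothetical file name; assumes prompts.jsonl holds one
# {"prompt": ..., "completion": ...} record per line):
#   python peft_training.py --data_path_file ./prompts.jsonl \
#       --model_name OpenAssistant/oasst-sft-1-pythia-12b \
#       -te 3 -lr 2e-5 -es 0.1 -bs 1
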
def main():
    argParser = argparse.ArgumentParser()
    argParser.add_argument("-dpf", "--data_path_file", required=True, help="Path to the prompt/completion JSONL file, e.g. ./prompts.jsonl")
    argParser.add_argument("-m", "--model_name", required=True, help="Base model name, e.g. OpenAssistant/oasst-sft-1-pythia-12b")
    argParser.add_argument(
        "--repository_id", type=str, default=None, help="Hugging Face repository id for uploading models"
    )
    argParser.add_argument("--deepspeed_config_file", type=str, default=None, help="Path to DeepSpeed config file.")
    argParser.add_argument("--gradient_checkpointing", type=bool, default=True, help="Flag for enabling gradient checkpointing.")
    argParser.add_argument("-te", "--training_epochs", required=True, help="Number of training epochs, e.g. 3")
    argParser.add_argument("-lr", "--learning_rate", required=True, help="Learning rate, e.g. 2e-5")
    argParser.add_argument("-es", "--eval_size", required=True, help="Fraction of the data held out for evaluation, e.g. 0.1")
    argParser.add_argument("-bs", "--batch_size", required=True, help="Per-device batch size, e.g. 1")
    argParser.add_argument("-la", "--lora_alpha", default="32", help="LoRA alpha, e.g. 32")
    argParser.add_argument("-ld", "--lora_dropout", default="0.05", help="LoRA dropout, e.g. 0.05")
    argParser.add_argument("--hf_token", type=str, default=HfFolder.get_token(), help="Token to use for uploading models to the Hugging Face Hub.")
    args = argParser.parse_args()
    # ARGS
    data_path_file = args.data_path_file
    model_name = args.model_name
    training_epochs = int(args.training_epochs)
    lr = float(args.learning_rate)
    eval_size = float(args.eval_size)
    batch_size = int(args.batch_size)
    lora_alpha = int(args.lora_alpha)
    lora_dropout = float(args.lora_dropout)
    metric = evaluate.load("rouge")

    raw_dataset = pd.read_json(data_path_file, lines=True)
    split_idx = int(len(raw_dataset) * (1 - eval_size))
    dataset = DatasetDict()
    dataset['train'] = Dataset.from_pandas(raw_dataset[:split_idx].reset_index(drop=True))
    dataset['test'] = Dataset.from_pandas(pd.DataFrame([], columns=["prompt", "completion"]))
    dataset['validation'] = Dataset.from_pandas(raw_dataset[split_idx:].reset_index(drop=True))
"""### Apply LoRA
Here comes the magic with `peft`! Let's load a `PeftModel` and specify that we are going to use low-rank adapters (LoRA) using `get_peft_model` utility function from `peft`.
"""
    peft_config = LoraConfig(
        r=16,
        lora_alpha=lora_alpha,
        lora_dropout=lora_dropout,
        # "query_key_value" is the fused attention projection in GPT-NeoX/pythia models;
        # "xxx" matches nothing and is a leftover placeholder from the referenced trl example.
        target_modules=["query_key_value", "xxx"],
        bias="none",
        task_type=TaskType.CAUSAL_LM,
        fan_in_fan_out=False,
    )
    model = AutoModelForCausalLM.from_pretrained(model_name, use_cache=not args.gradient_checkpointing)

    # for name, param in model.named_parameters():
    #     if param.requires_grad == False:
    #         print('Name: ', name)

    # NOTE: https://github.com/lvwerra/trl/blob/a2749d9e0c96198486b788875eda3b325f76a5c8/examples/sentiment/scripts/gpt-neox-20b_peft/gpt-neo-20b_sentiment_peft.py#L181
    for param in model.parameters():
        # freeze the base model's layers
        param.requires_grad = False

    if hasattr(model, "enable_input_require_grads"):
        model.enable_input_require_grads()
    else:
        def make_inputs_require_grad(module, input, output):
            output.requires_grad_(True)

        model.get_input_embeddings().register_forward_hook(make_inputs_require_grad)

    model = get_peft_model(model, peft_config)
    model.print_trainable_parameters()
    tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=2048)
    tokenizer.pad_token = tokenizer.eos_token
    def compute_metrics(eval_preds):
        preds, labels = eval_preds
        if isinstance(preds, tuple):
            preds = preds[0]
        print("BEFORE DECODE - PREDS ARE: ", preds)
        print("BEFORE DECODE - TYPE OF 'PREDS': ", type(preds))
        decoded_preds = tokenizer.batch_decode(preds)
        print("AFTER DECODE - PREDS ARE: ", preds)
        print("AFTER DECODE - TYPE OF 'PREDS': ", type(preds))
        # Replace -100 in the labels as we can't decode them.
        print("BEFORE DECODE - LABELS ARE: ", labels)
        print("BEFORE DECODE - TYPE OF 'LABELS': ", type(labels))
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
        decoded_labels = tokenizer.batch_decode(labels)
        print("AFTER DECODE - LABELS ARE: ", labels)
        print("AFTER DECODE - TYPE OF 'LABELS': ", type(labels))
        result = metric.compute(predictions=decoded_preds, references=decoded_labels)
        # evaluate's "rouge" metric returns aggregated floats rather than AggregateScore objects
        result = {key: value * 100 for key, value in result.items()}
        prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
        result["gen_len"] = np.mean(prediction_lens)
        result = {k: round(v, 4) for k, v in result.items()}
        return result
    # NOTE: https://github.com/huggingface/transformers/issues/14827#issuecomment-997967830
    def preprocess_function(examples):
        inputs = list(examples['prompt'])
        targets = list(examples['completion'])
        model_inputs = tokenizer(inputs, max_length=2048, padding='max_length', truncation=True)
        labels = tokenizer(targets, max_length=2048, padding='max_length', truncation=True)
        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    tokenized_datasets = dataset.map(preprocess_function, batched=True)
    training_args = TrainingArguments(
        f"./{model_name.split('/')[-1]}-finetuned",
        learning_rate=lr,
        evaluation_strategy='epoch',
        eval_steps=1,
        weight_decay=0.01,
        save_total_limit=10,
        fp16=True,
        deepspeed=args.deepspeed_config_file,
        num_train_epochs=training_epochs,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        gradient_checkpointing=args.gradient_checkpointing,
        gradient_accumulation_steps=8,
        save_strategy='epoch',
        load_best_model_at_end=True,
        push_to_hub=bool(args.repository_id),
        hub_strategy="every_save",
        hub_model_id=args.repository_id,
        hub_token=args.hf_token,
    )
    data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

    trainer = Trainer(
        model,
        training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"],
        data_collator=data_collator,
        tokenizer=tokenizer,
        # compute_metrics=compute_metrics
    )
    model.config.use_cache = False
    trainer.train()
    # trainer.create_model_card()
    # if args.repository_id:
    #     trainer.push_to_hub()
if __name__ == "__main__":
    main()
import transformers
import argparse
import numpy as np
import pandas as pd
from huggingface_hub import HfFolder
from datasets import load_dataset, Dataset, load_metric, concatenate_datasets, DatasetDict
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import DataCollatorForLanguageModeling, TrainingArguments, Trainer
from peft import get_peft_config, get_peft_model, LoraConfig, TaskType
def main():
    argParser = argparse.ArgumentParser()
    argParser.add_argument("-dpf", "--data_path_file", required=True, help="Path to the prompt/completion JSONL file, e.g. ./prompts.jsonl")
    argParser.add_argument("-m", "--model_name", required=True, help="Base model name, e.g. OpenAssistant/oasst-sft-1-pythia-12b")
    argParser.add_argument(
        "--repository_id", type=str, default=None, help="Hugging Face repository id for uploading models"
    )
    argParser.add_argument("--deepspeed_config_file", type=str, default=None, help="Path to DeepSpeed config file.")
    argParser.add_argument("--gradient_checkpointing", type=bool, default=True, help="Flag for enabling gradient checkpointing.")
    argParser.add_argument("-te", "--training_epochs", required=True, help="Number of training epochs, e.g. 3")
    argParser.add_argument("-lr", "--learning_rate", required=True, help="Learning rate, e.g. 2e-5")
    argParser.add_argument("-es", "--eval_size", required=True, help="Fraction of the data held out for evaluation, e.g. 0.1")
    argParser.add_argument("-bs", "--batch_size", required=True, help="Per-device batch size, e.g. 1")
    argParser.add_argument("-la", "--lora_alpha", default="32", help="LoRA alpha, e.g. 32")
    argParser.add_argument("-ld", "--lora_dropout", default="0.05", help="LoRA dropout, e.g. 0.05")
    argParser.add_argument("--hf_token", type=str, default=HfFolder.get_token(), help="Token to use for uploading models to the Hugging Face Hub.")
    args = argParser.parse_args()
    # ARGS
    data_path_file = args.data_path_file
    model_name = args.model_name
    training_epochs = int(args.training_epochs)
    lr = float(args.learning_rate)
    eval_size = float(args.eval_size)
    batch_size = int(args.batch_size)
    lora_alpha = int(args.lora_alpha)
    lora_dropout = float(args.lora_dropout)
    metric = load_metric("rouge")

    raw_dataset = pd.read_json(data_path_file, lines=True)
    split_idx = int(len(raw_dataset) * (1 - eval_size))
    dataset = DatasetDict()
    dataset['train'] = Dataset.from_pandas(raw_dataset[:split_idx].reset_index(drop=True))
    dataset['test'] = Dataset.from_pandas(pd.DataFrame([], columns=["prompt", "completion"]))
    dataset['validation'] = Dataset.from_pandas(raw_dataset[split_idx:].reset_index(drop=True))
"""### Apply LoRA
Here comes the magic with `peft`! Let's load a `PeftModel` and specify that we are going to use low-rank adapters (LoRA) using `get_peft_model` utility function from `peft`.
"""
    peft_config = LoraConfig(
        r=16,
        lora_alpha=lora_alpha,
        lora_dropout=lora_dropout,
        # "query_key_value" is the fused attention projection in GPT-NeoX/pythia models;
        # "xxx" matches nothing and is a leftover placeholder from the referenced trl example.
        target_modules=["query_key_value", "xxx"],
        bias="none",
        task_type=TaskType.CAUSAL_LM,
        fan_in_fan_out=False,
    )
    model = AutoModelForCausalLM.from_pretrained(model_name, use_cache=not args.gradient_checkpointing)

    # NOTE: https://github.com/lvwerra/trl/blob/a2749d9e0c96198486b788875eda3b325f76a5c8/examples/sentiment/scripts/gpt-neox-20b_peft/gpt-neo-20b_sentiment_peft.py#L181
    for param in model.parameters():
        # freeze the base model's layers
        param.requires_grad = False

    if hasattr(model, "enable_input_require_grads"):
        model.enable_input_require_grads()
    else:
        def make_inputs_require_grad(module, input, output):
            output.requires_grad_(True)

        model.get_input_embeddings().register_forward_hook(make_inputs_require_grad)

    model = get_peft_model(model, peft_config)
    model.print_trainable_parameters()
    tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=2048)
    # set the pad token if the tokenizer does not define one
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token_id = tokenizer.eos_token_id
    def compute_metrics(eval_preds):
        preds, labels = eval_preds
        if isinstance(preds, tuple):
            preds = preds[0]
        decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
        # Replace -100 in the labels as we can't decode them.
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
        result = metric.compute(predictions=decoded_preds, references=decoded_labels)
        result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
        prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
        result["gen_len"] = np.mean(prediction_lens)
        result = {k: round(v, 4) for k, v in result.items()}
        return result
    # NOTE: https://github.com/huggingface/transformers/issues/14827#issuecomment-997967830
    def preprocess_function(examples):
        inputs = list(examples['prompt'])
        targets = list(examples['completion'])
        model_inputs = tokenizer(inputs, max_length=2048, padding='max_length', truncation=True)
        labels = tokenizer(targets, max_length=2048, padding='max_length', truncation=True)
        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    tokenized_datasets = dataset.map(preprocess_function, batched=True)
    training_args = TrainingArguments(
        f"./{model_name.split('/')[-1]}-finetuned",
        learning_rate=lr,
        evaluation_strategy='epoch',
        eval_steps=1,
        weight_decay=0.01,
        save_total_limit=10,
        fp16=True,
        deepspeed=args.deepspeed_config_file,
        num_train_epochs=training_epochs,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        gradient_checkpointing=args.gradient_checkpointing,
        gradient_accumulation_steps=8,
        save_strategy='epoch',
        load_best_model_at_end=True,
        push_to_hub=bool(args.repository_id),
        hub_strategy="every_save",
        hub_model_id=args.repository_id,
        hub_token=args.hf_token,
    )
    data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

    trainer = Trainer(
        model,
        training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"],
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
        # reduce eval logits to token ids so compute_metrics can decode them
        preprocess_logits_for_metrics=lambda logits, labels: logits.argmax(dim=-1),
    )
    model.config.use_cache = False
    trainer.train()
    trainer.create_model_card()

    if args.repository_id:
        trainer.push_to_hub()
if __name__ == "__main__":
    main()
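Not part of the gist: a minimal inference sketch showing how the trained LoRA adapter could be reloaded with `peft`, assuming the adapter weights were saved (e.g. via `trainer.save_model()`) to the training output directory; the paths below are hypothetical.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base_model_name = "OpenAssistant/oasst-sft-1-pythia-12b"
adapter_dir = "./oasst-sft-1-pythia-12b-finetuned"  # hypothetical: match the output_dir actually used

# load the frozen base model, then attach the trained LoRA adapter on top of it
base_model = AutoModelForCausalLM.from_pretrained(base_model_name)
model = PeftModel.from_pretrained(base_model, adapter_dir)
model.eval()

tokenizer = AutoTokenizer.from_pretrained(base_model_name)
inputs = tokenizer("What does PEFT stand for?", return_tensors="pt")
with torch.no_grad():
    output_ids = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))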