PEFT training
import os
import argparse
import numpy as np
import pandas as pd
from huggingface_hub import HfFolder
import evaluate
from datasets import Dataset, DatasetDict
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import DataCollatorForLanguageModeling, TrainingArguments, Trainer
from peft import get_peft_model, LoraConfig, TaskType

os.environ["TOKENIZERS_PARALLELISM"] = "false"
def main():
    argParser = argparse.ArgumentParser()
    argParser.add_argument("-dpf", "--data_path_file", required=True, help="Path to the JSONL data file, e.g. ./prompts.jsonl")
    argParser.add_argument("-m", "--model_name", required=True, help="Base model name, e.g. OpenAssistant/oasst-sft-1-pythia-12b")
    argParser.add_argument(
        "--repository_id", type=str, default=None, help="Hugging Face repository id for uploading models"
    )
    argParser.add_argument("--deepspeed_config_file", type=str, default=None, help="Path to DeepSpeed config file.")
    # argparse's type=bool treats any non-empty string (even "False") as True,
    # so parse the flag explicitly
    argParser.add_argument("--gradient_checkpointing", type=lambda s: s.lower() in ("true", "1"), default=True, help="Flag for enabling gradient checkpointing.")
    argParser.add_argument("-te", "--training_epochs", required=True, help="Number of training epochs, e.g. 3")
    argParser.add_argument("-lr", "--learning_rate", required=True, help="Learning rate, e.g. 2e-5")
    argParser.add_argument("-es", "--eval_size", required=True, help="Fraction of the data held out for evaluation, e.g. 0.1")
    argParser.add_argument("-bs", "--batch_size", required=True, help="Per-device batch size, e.g. 1")
    argParser.add_argument("-la", "--lora_alpha", default="32", help="LoRA alpha, e.g. 32")
    argParser.add_argument("-ld", "--lora_dropout", default="0.05", help="LoRA dropout, e.g. 0.05")
    argParser.add_argument("--hf_token", type=str, default=HfFolder.get_token(), help="Token to use for uploading models to the Hugging Face Hub.")
    args = argParser.parse_args()
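    # Example invocation (hypothetical script name; values mirror the help strings above):
    #   python peft_training.py --data_path_file ./prompts.jsonl \
    #       --model_name OpenAssistant/oasst-sft-1-pythia-12b \
    #       --training_epochs 3 --learning_rate 2e-5 --eval_size 0.1 --batch_size 1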
    # ARGS
    data_path_file = args.data_path_file
    model_name = args.model_name
    training_epochs = int(args.training_epochs)
    lr = float(args.learning_rate)
    eval_size = float(args.eval_size)
    batch_size = int(args.batch_size)
    lora_alpha = int(args.lora_alpha)
    lora_dropout = float(args.lora_dropout)

    metric = evaluate.load("rouge")

    raw_dataset = pd.read_json(data_path_file, lines=True)
    split_idx = int(len(raw_dataset) * (1 - eval_size))
    dataset = DatasetDict()
    dataset['train'] = Dataset.from_pandas(raw_dataset.iloc[:split_idx].reset_index(drop=True))
    dataset['test'] = Dataset.from_pandas(pd.DataFrame([], columns=["prompt", "completion"]))
    dataset['validation'] = Dataset.from_pandas(raw_dataset.iloc[split_idx:].reset_index(drop=True))
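    # Each JSONL line is expected to hold one record with the two columns used
    # above, e.g.:
    #   {"prompt": "...", "completion": "..."}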
"""### Apply LoRA | |
Here comes the magic with `peft`! Let's load a `PeftModel` and specify that we are going to use low-rank adapters (LoRA) using `get_peft_model` utility function from `peft`. | |
""" | |
peft_config = LoraConfig( | |
r=16, | |
lora_alpha=32, | |
lora_dropout=0.05, | |
target_modules = ["query_key_value", "xxx"], | |
bias="none", | |
task_type=TaskType.CAUSAL_LM, | |
fan_in_fan_out=False, | |
) | |
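    # LoRA learns a low-rank update on each targeted weight: W becomes
    # W + (lora_alpha / r) * B @ A, with only A (r x d_in) and B (d_out x r)
    # trainable, so r=16 with alpha=32 applies a scaling factor of 2.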
    model = AutoModelForCausalLM.from_pretrained(model_name, use_cache=False if args.gradient_checkpointing else True)
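    # A quick hypothetical probe to list module names LoRA could target on a
    # different base model (not part of the training run):
    # for name, _ in model.named_modules():
    #     if "query" in name or "value" in name:
    #         print(name)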
    # for name, param in model.named_parameters():
    #     if param.requires_grad == False:
    #         print('Name: ', name)
    # NOTE: https://github.com/lvwerra/trl/blob/a2749d9e0c96198486b788875eda3b325f76a5c8/examples/sentiment/scripts/gpt-neox-20b_peft/gpt-neo-20b_sentiment_peft.py#L181
    for param in model.parameters():
        # freeze the base model's layers
        param.requires_grad = False
    # gradient checkpointing requires inputs that carry gradients; newer
    # transformers versions expose enable_input_require_grads(), otherwise
    # fall back to a forward hook on the input embeddings
    if hasattr(model, "enable_input_require_grads"):
        model.enable_input_require_grads()
    else:
        def make_inputs_require_grad(module, input, output):
            output.requires_grad_(True)
        model.get_input_embeddings().register_forward_hook(make_inputs_require_grad)
    model = get_peft_model(model, peft_config)
    model.print_trainable_parameters()

    tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=2048)
    tokenizer.pad_token = tokenizer.eos_token
    def compute_metrics(eval_preds):
        preds, labels = eval_preds
        if isinstance(preds, tuple):
            preds = preds[0]
        print("BEFORE DECODE - PREDS ARE: ", preds)
        print("BEFORE DECODE - TYPE OF 'PREDS': ", type(preds))
        decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
        print("AFTER DECODE - PREDS ARE: ", decoded_preds)
        print("AFTER DECODE - TYPE OF 'PREDS': ", type(decoded_preds))
        # Replace -100 in the labels as we can't decode them.
        print("BEFORE DECODE - LABELS ARE: ", labels)
        print("BEFORE DECODE - TYPE OF 'LABELS': ", type(labels))
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
        print("AFTER DECODE - LABELS ARE: ", decoded_labels)
        print("AFTER DECODE - TYPE OF 'LABELS': ", type(decoded_labels))
        result = metric.compute(predictions=decoded_preds, references=decoded_labels)
        # evaluate's "rouge" returns aggregated floats, not the AggregateScore
        # objects of the old datasets.load_metric
        result = {key: value * 100 for key, value in result.items()}
        prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
        result["gen_len"] = np.mean(prediction_lens)
        result = {k: round(v, 4) for k, v in result.items()}
        return result
    # NOTE: https://github.com/huggingface/transformers/issues/14827#issuecomment-997967830
    def preprocess_function(examples):
        inputs = [ex for ex in examples['prompt']]
        targets = [ex for ex in examples['completion']]
        model_inputs = tokenizer(inputs, max_length=2048, padding='max_length', truncation=True)
        labels = tokenizer(targets, max_length=2048, padding='max_length', truncation=True)
        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    tokenized_datasets = dataset.map(preprocess_function, batched=True)
    training_args = TrainingArguments(  # named training_args so the argparse namespace is not shadowed
        f"./{model_name.split('/')[-1]}-finetuned",  # strip the org prefix from names like org/model
        learning_rate=lr,
        evaluation_strategy='epoch',  # eval_steps is ignored with an epoch strategy
        weight_decay=0.01,
        save_total_limit=10,
        fp16=True,
        deepspeed=args.deepspeed_config_file,
        num_train_epochs=training_epochs,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        gradient_checkpointing=args.gradient_checkpointing,
        gradient_accumulation_steps=8,
        save_strategy='epoch',
        load_best_model_at_end=True,
        push_to_hub=bool(args.repository_id),
        hub_strategy="every_save",
        hub_model_id=args.repository_id if args.repository_id else None,
        hub_token=args.hf_token,
    )
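    # Effective batch size is per_device_train_batch_size * gradient_accumulation_steps
    # per device, e.g. 1 * 8 = 8 sequences per optimizer step with batch_size 1.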
    # NOTE: with mlm=False this collator rebuilds `labels` from `input_ids`, so the
    # completion-only labels set in preprocess_function are overwritten here
    data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
    trainer = Trainer(
        model,
        training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"],
        data_collator=data_collator,
        tokenizer=tokenizer,
        # compute_metrics=compute_metrics
    )
    model.config.use_cache = False
    trainer.train()
    # trainer.create_model_card()
    # if args.repository_id:
    #     trainer.push_to_hub()

if __name__ == "__main__":
    main()
PEFT training (variant with ROUGE evaluation and Hub upload enabled)
import argparse
import numpy as np
import pandas as pd
from huggingface_hub import HfFolder
from datasets import load_metric, Dataset, DatasetDict
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import DataCollatorForLanguageModeling, TrainingArguments, Trainer
from peft import get_peft_model, LoraConfig
def main():
    argParser = argparse.ArgumentParser()
    argParser.add_argument("-dpf", "--data_path_file", required=True, help="Path to the JSONL data file, e.g. ./prompts.jsonl")
    argParser.add_argument("-m", "--model_name", required=True, help="Base model name, e.g. OpenAssistant/oasst-sft-1-pythia-12b")
    argParser.add_argument(
        "--repository_id", type=str, default=None, help="Hugging Face repository id for uploading models"
    )
    argParser.add_argument("--deepspeed_config_file", type=str, default=None, help="Path to DeepSpeed config file.")
    # argparse's type=bool treats any non-empty string (even "False") as True,
    # so parse the flag explicitly
    argParser.add_argument("--gradient_checkpointing", type=lambda s: s.lower() in ("true", "1"), default=True, help="Flag for enabling gradient checkpointing.")
    argParser.add_argument("-te", "--training_epochs", required=True, help="Number of training epochs, e.g. 3")
    argParser.add_argument("-lr", "--learning_rate", required=True, help="Learning rate, e.g. 2e-5")
    argParser.add_argument("-es", "--eval_size", required=True, help="Fraction of the data held out for evaluation, e.g. 0.1")
    argParser.add_argument("-bs", "--batch_size", required=True, help="Per-device batch size, e.g. 1")
    argParser.add_argument("-la", "--lora_alpha", default="32", help="LoRA alpha, e.g. 32")
    argParser.add_argument("-ld", "--lora_dropout", default="0.05", help="LoRA dropout, e.g. 0.05")
    argParser.add_argument("--hf_token", type=str, default=HfFolder.get_token(), help="Token to use for uploading models to the Hugging Face Hub.")
    args = argParser.parse_args()
    # ARGS
    data_path_file = args.data_path_file
    model_name = args.model_name
    training_epochs = int(args.training_epochs)
    lr = float(args.learning_rate)
    eval_size = float(args.eval_size)
    batch_size = int(args.batch_size)
    lora_alpha = int(args.lora_alpha)
    lora_dropout = float(args.lora_dropout)

    # load_metric is deprecated in recent `datasets`; it is kept here because the
    # ROUGE handling below relies on its AggregateScore return type
    metric = load_metric("rouge")

    raw_dataset = pd.read_json(data_path_file, lines=True)
    split_idx = int(len(raw_dataset) * (1 - eval_size))
    dataset = DatasetDict()
    dataset['train'] = Dataset.from_pandas(raw_dataset.iloc[:split_idx].reset_index(drop=True))
    dataset['test'] = Dataset.from_pandas(pd.DataFrame([], columns=["prompt", "completion"]))
    dataset['validation'] = Dataset.from_pandas(raw_dataset.iloc[split_idx:].reset_index(drop=True))
"""### Apply LoRA | |
Here comes the magic with `peft`! Let's load a `PeftModel` and specify that we are going to use low-rank adapters (LoRA) using `get_peft_model` utility function from `peft`. | |
""" | |
peft_config = LoraConfig( | |
r=16, | |
lora_alpha=32, | |
lora_dropout=0.05, | |
target_modules = ["query_key_value", "xxx"], | |
bias="none", | |
task_type="CAUSAL_LM", | |
fan_in_fan_out=False, | |
) | |
    model = AutoModelForCausalLM.from_pretrained(model_name, use_cache=False if args.gradient_checkpointing else True)
    # NOTE: https://github.com/lvwerra/trl/blob/a2749d9e0c96198486b788875eda3b325f76a5c8/examples/sentiment/scripts/gpt-neox-20b_peft/gpt-neo-20b_sentiment_peft.py#L181
    for param in model.parameters():
        # freeze the base model's layers
        param.requires_grad = False
    if hasattr(model, "enable_input_require_grads"):
        model.enable_input_require_grads()
    else:
        def make_inputs_require_grad(module, input, output):
            output.requires_grad_(True)
        model.get_input_embeddings().register_forward_hook(make_inputs_require_grad)
    model = get_peft_model(model, peft_config)
    model.print_trainable_parameters()
    tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=2048)
    # set a pad token if the tokenizer does not define one
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token_id = tokenizer.eos_token_id
    def compute_metrics(eval_preds):
        preds, labels = eval_preds
        if isinstance(preds, tuple):
            preds = preds[0]
        decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
        # Replace -100 in the labels as we can't decode them.
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
        result = metric.compute(predictions=decoded_preds, references=decoded_labels)
        # load_metric's rouge returns AggregateScore objects; take the mid f-measure
        result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
        prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
        result["gen_len"] = np.mean(prediction_lens)
        result = {k: round(v, 4) for k, v in result.items()}
        return result
    # NOTE: https://github.com/huggingface/transformers/issues/14827#issuecomment-997967830
    def preprocess_function(examples):
        inputs = [ex for ex in examples['prompt']]
        targets = [ex for ex in examples['completion']]
        model_inputs = tokenizer(inputs, max_length=2048, padding='max_length', truncation=True)
        labels = tokenizer(targets, max_length=2048, padding='max_length', truncation=True)
        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    tokenized_datasets = dataset.map(preprocess_function, batched=True)
    training_args = TrainingArguments(  # named training_args so the argparse namespace is not shadowed
        f"./{model_name.split('/')[-1]}-finetuned",  # strip the org prefix from names like org/model
        learning_rate=lr,
        evaluation_strategy='epoch',  # eval_steps is ignored with an epoch strategy
        weight_decay=0.01,
        save_total_limit=10,
        fp16=True,
        deepspeed=args.deepspeed_config_file,
        num_train_epochs=training_epochs,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        gradient_checkpointing=args.gradient_checkpointing,
        gradient_accumulation_steps=8,
        save_strategy='epoch',
        load_best_model_at_end=True,
        push_to_hub=bool(args.repository_id),
        hub_strategy="every_save",
        hub_model_id=args.repository_id if args.repository_id else None,
        hub_token=args.hf_token,
    )
    data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
    trainer = Trainer(
        model,
        training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"],
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
        # Trainer passes raw logits to compute_metrics; reduce them to token ids
        # here so batch_decode works (and eval memory stays manageable)
        preprocess_logits_for_metrics=lambda logits, labels: logits.argmax(dim=-1),
    )
    model.config.use_cache = False
    trainer.train()
    trainer.create_model_card()
    if args.repository_id:
        trainer.push_to_hub()
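    # A minimal inference sketch once training finishes (hypothetical, assuming
    # the adapter was saved locally or pushed to args.repository_id):
    # from peft import PeftModel
    # base_model = AutoModelForCausalLM.from_pretrained(model_name)
    # peft_model = PeftModel.from_pretrained(base_model, args.repository_id)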

if __name__ == "__main__":
    main()