@datasciencemonkey
Created September 13, 2023 21:57
Train a GPTQ model using PEFT/LoRA
# %%
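# Overview: format the DialogSum dataset into instruction/response prompts, attach
# LoRA adapters to a 4-bit GPTQ Llama-2-7B base, fine-tune with TRL's SFTTrainer,
# save the adapter, and reload it for inference on a test example.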
# this is run from /notebooks on paperspace
from huggingface_hub import login
from dotenv import load_dotenv
load_dotenv("/notebooks/.env")
import os
os.environ["TOKENIZERS_PARALLELISM"]="false"
login(token=os.getenv("HUGGINGFACE_TOKEN"))
# %%
from datasets import load_dataset
dataset_name = "knkarthick/dialogsum"
ds = load_dataset(dataset_name)
# %%
train_ds, test_ds = load_dataset(dataset_name, split=["train", "test[0:200]"])
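# "test[0:200]" uses the datasets split-slicing syntax to keep only the first 200 test examples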
# %%
import pandas as pd
# convert to pandas
train_df = pd.DataFrame(train_ds)
test_df = pd.DataFrame(test_ds)
train_df.head()
# %%
# instruction finetuning data preparation function
def prepare_dataset(df, split="train"):
    text_col = []
    # change the instruction according to the task
    instruction = (
        "Write a concise summary of the below input text. "
        "Ensure that the response covers the key points of the text. "
        "Only provide full sentence responses."
    )
    if split == "train":
        for _, row in df.iterrows():
            input_q = row["dialogue"]
            output = row["summary"]
            text = (
                "### Instruction: \n"
                + instruction
                + "\n### Input: \n"
                + input_q
                + "\n### Response :\n"
                + output
                + "\n### End"
            )  # keeping the output column in the training dataset
            text_col.append(text)
        df.loc[:, "text"] = text_col
    else:
        for _, row in df.iterrows():
            input_q = row["dialogue"]
            text = (
                "### Instruction: \n"
                + instruction
                + "\n### Input: \n"
                + input_q
                + "\n### Response :\n"
            )  # not keeping the output column in the test dataset
            text_col.append(text)
        df.loc[:, "text"] = text_col
    return df
train_df = prepare_dataset(train_df, "train")
test_df = prepare_dataset(test_df, "test")
print(train_df.iloc[0].text)
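# each formatted training example follows the template:
# "### Instruction: \n<instruction>\n### Input: \n<dialogue>\n### Response :\n<summary>\n### End"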
# %%
# converting the dataframe to a Hugging Face Dataset for easy finetuning
from datasets import Dataset
dataset = Dataset.from_pandas(train_df)
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
# sharded, unquantized model paths on Hugging Face (alternatives, not used here)
# model_name = "TinyPixel/Llama-2-7B-bf16-sharded"
# model_name = 'NousResearch/Llama-2-7b-hf'
# Quantization config (bitsandbytes 4-bit alternative, not used here)
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype="float16",
# )
# take a pre-quantized model from Hugging Face
model_id = "TheBloke/Llama-2-7B-GPTQ"
# model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")
# tokenizer = AutoTokenizer.from_pretrained(model_id)
# loading the model with the bitsandbytes quantization config (alternative path)
# model = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     quantization_config=bnb_config,
#     trust_remote_code=True,
#     device_map='auto'
# )
# can be changed to False if you need the newest model update.
# %%
from peft import prepare_model_for_kbit_training
from transformers import GPTQConfig
# model_id = "TheBloke/Llama-2-7B-GPTQ"
# model_id = "TheBloke/Llama-2-7b-Chat-GPTQ"
quantization_config_loading = GPTQConfig(bits=4, disable_exllama=True)
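# ExLlama kernels are inference-only, so disable_exllama=True is required in order
# to fine-tune a GPTQ-quantized model with PEFT.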
model = AutoModelForCausalLM.from_pretrained(
    model_id, quantization_config=quantization_config_loading, device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
model.config.use_cache = False
model.config.pretraining_tp = 1
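# use_cache must be off while training with gradient checkpointing, and
# pretraining_tp = 1 keeps the standard linear-layer path instead of the slower
# tensor-parallel-style computation some Llama-2 configs enable.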
# %%
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)
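# prepare_model_for_kbit_training freezes the quantized base weights, upcasts a few
# small layers (e.g. layer norms) to fp32 for stability, and makes the input
# embeddings require gradients so gradient checkpointing works with adapters.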
from peft import LoraConfig, get_peft_model
config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["k_proj", "o_proj", "q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, config)
model.print_trainable_parameters()
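# With r=8 adapters on the four attention projections, only the small LoRA matrices
# are trainable; the 4-bit GPTQ base weights stay frozen.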
# %%
from transformers import TrainingArguments
# needed for llama 2 tokenizer
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
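# Llama-2's tokenizer has no pad token by default; reusing the EOS token lets
# examples be batched, and right padding is the usual choice for causal-LM fine-tuning.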
model.config.use_cache = False # silence the warnings. Please re-enable for inference!
args = TrainingArguments(
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    warmup_steps=2,
    max_steps=100,
    learning_rate=2e-4,
    fp16=True,  # use mixed precision training
    logging_steps=1,
    output_dir="outputs_gptq_training",
    optim="adamw_hf",
    save_strategy="epoch",
    report_to="none",
)
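# effective batch size = per_device_train_batch_size * gradient_accumulation_steps
# = 4 * 4 = 16 sequences per optimizer step (per GPU)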
from trl import SFTTrainer
trainer = SFTTrainer(
    model=model,
    args=args,
    train_dataset=dataset,
    peft_config=config,
    dataset_text_field="text",
    tokenizer=tokenizer,
    packing=False,
    max_seq_length=512,
)
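# packing=False keeps one formatted example per sequence (truncated to
# max_seq_length=512) rather than concatenating examples into fixed-length blocks.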
# %%
train_result = trainer.train()
# %%
checkpoint_name = "final_checkpoints_gptqsummarizer_7b_peft"
# save the fine-tuned LoRA adapter (the quantized base weights are not merged here)
output_dir = os.path.join(args.output_dir, checkpoint_name)
trainer.model.save_pretrained(output_dir)
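# Optional (not in the original gist): also save the tokenizer so the checkpoint
# directory is self-contained when it is reloaded for inference elsewhere.
tokenizer.save_pretrained(output_dir)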
# To perform inference on the test dataset
# this is one way, but loading from dir is better!
# from peft import PeftModel
# from rich import print
# base_model = AutoModelForCausalLM.from_pretrained(
#     model_id,
#     return_dict=True,
#     torch_dtype=torch.float16,
#     device_map='auto',
# )
# model = PeftModel.from_pretrained(
#     base_model,
#     model_id=output_dir,
#     device_map='auto',
# )
# from peft import LoraConfig, get_peft_model
# lora_config = LoraConfig.from_pretrained(output_dir)
# model = get_peft_model(model, lora_config)
from peft import AutoPeftModelForCausalLM
# to run inference on a test-set example, load the fine-tuned model from the checkpoint
persisted_model = AutoPeftModelForCausalLM.from_pretrained(
    output_dir,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map="cuda",
)
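# AutoPeftModelForCausalLM reads adapter_config.json in output_dir, loads the GPTQ
# base model it references, and attaches the trained LoRA adapter on top of it.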
# %%
# inference on a test data example
from time import perf_counter
from rich import print
from transformers import GenerationConfig
text = test_df['text'][4]
inputs = tokenizer(text, return_tensors="pt").to('cuda')
generation_config = GenerationConfig(
    penalty_alpha=0.6,
    do_sample=True,
    top_k=5,
    temperature=0.5,
    repetition_penalty=1.2,
    max_new_tokens=100,
)
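# Note: penalty_alpha normally triggers contrastive search only when do_sample=False;
# with do_sample=True as set here, generation effectively runs top-k sampling.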
start_time = perf_counter()
outputs = persisted_model.generate(**inputs, generation_config=generation_config)
end_time = perf_counter()  # stop the clock before decoding/printing so only generation is timed
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
output_time = end_time - start_time
print(f"Time taken for inference: {round(output_time, 2)} seconds")