@datasciencemonkey
Created September 13, 2023 21:57
Train a GPTQ model using PEFT/LoRA
# %%
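# Overview: format the DialogSum dataset into instruction/response prompts, attach
# LoRA adapters to a 4-bit GPTQ Llama-2-7B base, fine-tune with TRL's SFTTrainer,
# save the adapter, and reload it for inference on a test example.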
# this is run from /notebooks on paperspace
from huggingface_hub import login
from dotenv import load_dotenv
load_dotenv("/notebooks/.env")
import os
os.environ["TOKENIZERS_PARALLELISM"]="false"
login(token=os.getenv("HUGGINGFACE_TOKEN"))
# %%
from datasets import load_dataset
dataset_name = "knkarthick/dialogsum"
ds = load_dataset(dataset_name)
# %%
train_ds, test_ds = load_dataset(dataset_name, split=["train", "test[0:200]"])
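# "test[0:200]" uses the datasets split-slicing syntax to keep only the first 200 test examples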
# %%
import pandas as pd
# convert to pandas
train_df = pd.DataFrame(train_ds)
test_df = pd.DataFrame(test_ds)
train_df.head()
# %%
# instruction finetuning data preparation function
def prepare_dataset(df, split="train"):
    text_col = []
    # change the instruction according to the task
    instruction = (
        "Write a concise summary of the below input text. "
        "Ensure that the response covers the key points of the text. "
        "Only provide full sentence responses."
    )
    if split == "train":
        for _, row in df.iterrows():
            input_q = row["dialogue"]
            output = row["summary"]
            text = (
                "### Instruction: \n"
                + instruction
                + "\n### Input: \n"
                + input_q
                + "\n### Response :\n"
                + output
                + "\n### End"
            )  # keeping the output column in the training dataset
            text_col.append(text)
        df.loc[:, "text"] = text_col
    else:
        for _, row in df.iterrows():
            input_q = row["dialogue"]
            text = (
                "### Instruction: \n"
                + instruction
                + "\n### Input: \n"
                + input_q
                + "\n### Response :\n"
            )  # not keeping the output column in the test dataset
            text_col.append(text)
        df.loc[:, "text"] = text_col
    return df
train_df = prepare_dataset(train_df, "train")
test_df = prepare_dataset(test_df, "test")
print(train_df.iloc[0].text)
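# each formatted training example follows the template:
# "### Instruction: \n<instruction>\n### Input: \n<dialogue>\n### Response :\n<summary>\n### End"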
# %%
# converting the dataframe to a Hugging Face Dataset for easy finetuning
from datasets import Dataset
dataset = Dataset.from_pandas(train_df)
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
# sharded, unquantized model paths on Hugging Face (alternatives, not used here)
# model_name = "TinyPixel/Llama-2-7B-bf16-sharded"
# model_name = 'NousResearch/Llama-2-7b-hf'
# Quantization config (bitsandbytes 4-bit alternative, not used here)
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype="float16",
# )
# take a pre-quantized model from Hugging Face
model_id = "TheBloke/Llama-2-7B-GPTQ"
# model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")
# tokenizer = AutoTokenizer.from_pretrained(model_id)
# loading the model with the bitsandbytes quantization config (alternative path)
# model = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     quantization_config=bnb_config,
#     trust_remote_code=True,
#     device_map='auto'
# )
# can be changed to False if you need the newest model update.
# %%
from peft import prepare_model_for_kbit_training
from transformers import GPTQConfig
# model_id = "TheBloke/Llama-2-7B-GPTQ"
# model_id = "TheBloke/Llama-2-7b-Chat-GPTQ"
quantization_config_loading = GPTQConfig(bits=4, disable_exllama=True)
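# ExLlama kernels are inference-only, so disable_exllama=True is required in order
# to fine-tune a GPTQ-quantized model with PEFT.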
model = AutoModelForCausalLM.from_pretrained(
    model_id, quantization_config=quantization_config_loading, device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
model.config.use_cache = False
model.config.pretraining_tp = 1
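# use_cache must be off while training with gradient checkpointing, and
# pretraining_tp = 1 keeps the standard linear-layer path instead of the slower
# tensor-parallel-style computation some Llama-2 configs enable.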
# %%
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)
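# prepare_model_for_kbit_training freezes the quantized base weights, upcasts a few
# small layers (e.g. layer norms) to fp32 for stability, and makes the input
# embeddings require gradients so gradient checkpointing works with adapters.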
from peft import LoraConfig, get_peft_model
config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["k_proj", "o_proj", "q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, config)
model.print_trainable_parameters()
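# With r=8 adapters on the four attention projections, only the small LoRA matrices
# are trainable; the 4-bit GPTQ base weights stay frozen.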
# %%
from transformers import TrainingArguments
# needed for llama 2 tokenizer
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
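# Llama-2's tokenizer has no pad token by default; reusing the EOS token lets
# examples be batched, and right padding is the usual choice for causal-LM fine-tuning.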
model.config.use_cache = False # silence the warnings. Please re-enable for inference!
args = TrainingArguments(
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    warmup_steps=2,
    max_steps=100,
    learning_rate=2e-4,
    fp16=True,  # use mixed precision training
    logging_steps=1,
    output_dir="outputs_gptq_training",
    optim="adamw_hf",
    save_strategy="epoch",
    report_to="none",
)
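# effective batch size = per_device_train_batch_size * gradient_accumulation_steps
# = 4 * 4 = 16 sequences per optimizer step (per GPU)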
from trl import SFTTrainer
trainer = SFTTrainer(
    model=model,
    args=args,
    train_dataset=dataset,
    peft_config=config,
    dataset_text_field="text",
    tokenizer=tokenizer,
    packing=False,
    max_seq_length=512,
)
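# packing=False keeps one formatted example per sequence (truncated to
# max_seq_length=512) rather than concatenating examples into fixed-length blocks.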
# %%
train_result = trainer.train()
# %%
checkpoint_name = "final_checkpoints_gptqsummarizer_7b_peft"
# save the fine-tuned LoRA adapter (the quantized base weights are not merged here)
output_dir = os.path.join(args.output_dir, checkpoint_name)
trainer.model.save_pretrained(output_dir)
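# Optional (not in the original gist): also save the tokenizer so the checkpoint
# directory is self-contained when it is reloaded for inference elsewhere.
tokenizer.save_pretrained(output_dir)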
# To perform inference on the test dataset
# this is one way, but loading from dir is better!
# from peft import PeftModel
# from rich import print
# base_model = AutoModelForCausalLM.from_pretrained(
#     model_id,
#     return_dict=True,
#     torch_dtype=torch.float16,
#     device_map='auto',
# )
# model = PeftModel.from_pretrained(
#     base_model,
#     model_id=output_dir,
#     device_map='auto',
# )
# from peft import LoraConfig, get_peft_model
# lora_config = LoraConfig.from_pretrained(output_dir)
# model = get_peft_model(model, lora_config)
from peft import AutoPeftModelForCausalLM
# to run inference on a test-set example, load the fine-tuned model from the checkpoint
persisted_model = AutoPeftModelForCausalLM.from_pretrained(
    output_dir,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map="cuda",
)
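# AutoPeftModelForCausalLM reads adapter_config.json in output_dir, loads the GPTQ
# base model it references, and attaches the trained LoRA adapter on top of it.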
# %%
# inference on a test data example
from time import perf_counter
from rich import print
from transformers import GenerationConfig
text = test_df['text'][4]
inputs = tokenizer(text, return_tensors="pt").to('cuda')
generation_config = GenerationConfig(
    penalty_alpha=0.6,
    do_sample=True,
    top_k=5,
    temperature=0.5,
    repetition_penalty=1.2,
    max_new_tokens=100,
)
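# Note: penalty_alpha normally triggers contrastive search only when do_sample=False;
# with do_sample=True as set here, generation effectively runs top-k sampling.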
start_time = perf_counter()
outputs = persisted_model.generate(**inputs, generation_config=generation_config)
end_time = perf_counter()  # stop the clock before decoding/printing so only generation is timed
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
output_time = end_time - start_time
print(f"Time taken for inference: {round(output_time, 2)} seconds")