@cmosguy · Last active September 22, 2023
debugging a codellama finetuning
# %%
from datetime import datetime
import os
import sys
import gpustat
gpus = gpustat.new_query()
[print(gpu) for gpu in gpus]
import torch
from peft import (
    LoraConfig,
    get_peft_model,
    get_peft_model_state_dict,
    prepare_model_for_int8_training,
    set_peft_model_state_dict,
)
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForSeq2Seq
from datasets import load_dataset
dataset = load_dataset("b-mc2/sql-create-context", split="train")
# split once (with a fixed seed) so the train and eval sets come from the same, non-overlapping split
split_dataset = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]
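# %% [markdown]
# As a quick optional sanity check (not required for training), peek at the split sizes
# and one example row; each record should have 'question', 'context', and 'answer' fields:
# %%
print(train_dataset)
print(eval_dataset)
print(train_dataset[0])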
# %%
base_model_name = "codellama/CodeLlama-7b-hf"
model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    load_in_8bit=True,
    torch_dtype=torch.float16,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
# %%
eval_prompt = """You are a powerful text-to-SQL model. Your job is to answer questions about a database. You are given a question and context regarding one or more tables.
You must output the SQL query that answers the question.
### Input:
Which Class has a Frequency MHz larger than 91.5, and a City of license of hyannis, nebraska?
### Context:
CREATE TABLE table_name_12 (class VARCHAR, frequency_mhz VARCHAR, city_of_license VARCHAR)
### Response:
"""
# {'question': 'Name the comptroller for office of prohibition', 'context': 'CREATE TABLE table_22607062_1 (comptroller VARCHAR, ticket___office VARCHAR)', 'answer': 'SELECT comptroller FROM table_22607062_1 WHERE ticket___office = "Prohibition"'}
model_input = tokenizer(eval_prompt, return_tensors="pt").to("cuda")
model.eval()
with torch.no_grad():
    print(tokenizer.decode(model.generate(**model_input, max_new_tokens=100)[0], skip_special_tokens=True))
# %% [markdown]
# I get the output:
# ```
# SELECT * FROM table_name_12 WHERE class > 91.5 AND city_of_license = 'hyannis, nebraska'
# ```
# which is clearly wrong, since the question asks for just the Class!
# %% [markdown]
# ### 4. Tokenization
# Set up some tokenization settings, like left padding, because it makes [training use less memory](https://ai.stackexchange.com/questions/41485/while-fine-tuning-a-decoder-only-llm-like-llama-on-chat-dataset-what-kind-of-pa):
tokenizer.add_eos_token = True
tokenizer.pad_token_id = 0
tokenizer.padding_side = "left"
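# %% [markdown]
# A small optional illustration of what left padding does, assuming the settings above:
# when a batch is padded, pad_token_id (0) is prepended, so the real prompt tokens sit at
# the end of each sequence, which is where next-token prediction happens.
# %%
demo_batch = tokenizer(["short prompt", "a somewhat longer prompt"], padding=True)
print(demo_batch["input_ids"][0])  # expect leading 0s before the shorter prompt's tokens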
# %% [markdown]
# Set up the tokenize function so that labels and input_ids are the same. This is basically what [self-supervised fine-tuning](https://neptune.ai/blog/self-supervised-learning) is:
# %%
def tokenize(prompt):
    result = tokenizer(
        prompt,
        truncation=True,
        max_length=512,
        padding=False,
        return_tensors=None,
    )
    # "self-supervised learning" means the labels are also the inputs:
    result["labels"] = result["input_ids"].copy()
    return result
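# %% [markdown]
# An optional spot check of tokenize() on a toy prompt; the labels should simply mirror
# the input_ids:
# %%
example = tokenize("SELECT 1;")
print(example["input_ids"][:10])
print(example["labels"][:10])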
# %% [markdown]
# And convert each data_point into a prompt format that I found online and that works quite well:
def generate_and_tokenize_prompt(data_point):
    full_prompt = f"""You are a powerful text-to-SQL model. Your job is to answer questions about a database. You are given a question and context regarding one or more tables.
You must output the SQL query that answers the question.
### Input:
{data_point["question"]}
### Context:
{data_point["context"]}
### Response:
{data_point["answer"]}
"""
    return tokenize(full_prompt)
# %% [markdown]
# Reformat each sample into a prompt and tokenize it:
tokenized_train_dataset = train_dataset.map(generate_and_tokenize_prompt)
tokenized_val_dataset = eval_dataset.map(generate_and_tokenize_prompt)
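# %% [markdown]
# Optionally decode one tokenized example back to text to confirm the prompt template and
# the answer survived the round trip:
# %%
print(tokenizer.decode(tokenized_train_dataset[0]["input_ids"]))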
# %% [markdown]
# ### 5. Setup Lora
model.train() # put model back into training mode
model = prepare_model_for_int8_training(model)
config = LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
    ],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, config)
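# %% [markdown]
# An optional check that only the LoRA adapter weights are trainable (the 7B base weights
# stay frozen):
# %%
model.print_trainable_parameters()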
# %% [markdown]
# To resume from a checkpoint, set resume_from_checkpoint to the path of the adapter_model.bin you want to resume from. This code will replace the LoRA adapter attached to the model:
resume_from_checkpoint = "" #"./sql-code-llama/checkpoint-380/adapter_model.bin" # set this to the adapter_model.bin file you want to resume from
if resume_from_checkpoint:
    if os.path.exists(resume_from_checkpoint):
        print(f"Restarting from {resume_from_checkpoint}")
        adapters_weights = torch.load(resume_from_checkpoint)
        set_peft_model_state_dict(model, adapters_weights)
    else:
        print(f"Checkpoint {resume_from_checkpoint} not found")
# %%
if torch.cuda.device_count() > 1:
    # keeps Trainer from trying its own DataParallelism when more than 1 gpu is available
    print("total devices: ", torch.cuda.device_count())
    model.is_parallelizable = True
    model.model_parallel = True
else:
    print("only 1 gpu available")
# %% [markdown]
# ### 6. Training arguments
# If you run out of GPU memory, change per_device_train_batch_size.
# The gradient_accumulation_steps variable should ensure this
# doesn't affect batch dynamics during the training run.
# All the other variables are standard stuff that I wouldn't
# recommend messing with:
batch_size = 128
per_device_train_batch_size = 32
gradient_accumulation_steps = batch_size // per_device_train_batch_size
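# for example: per_device_train_batch_size = 16 would give gradient_accumulation_steps = 128 // 16 = 8,
# keeping the effective batch size (per_device_train_batch_size * gradient_accumulation_steps) at 128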
output_dir = "../checkpoints"
training_args = TrainingArguments(
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    warmup_steps=100,
    max_steps=400,
    learning_rate=3e-4,
    fp16=True,
    bf16=False,  # can only be True if fp16 is False; you cannot enable both
    logging_steps=10,
    optim="adamw_torch",
    evaluation_strategy="steps",  # if val_set_size > 0 else "no",
    save_strategy="steps",
    eval_steps=20,
    save_steps=20,
    output_dir=output_dir,
    # save_total_limit=3,
    load_best_model_at_end=False,
    # ddp_find_unused_parameters=False if ddp else None,
    group_by_length=True,  # group sequences of roughly the same length together to speed up training
    report_to="tensorboard",  # if use_wandb else "none",
    run_name=f"codellama-{datetime.now().strftime('%Y-%m-%d-%H-%M')}",  # if use_wandb else None,
    logging_dir="../logs/runs",
)
trainer = Trainer(
    model=model,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    args=training_args,
    data_collator=DataCollatorForSeq2Seq(
        tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
    ),
)
# %% [markdown]
# Then we do some pytorch-related optimisations (which just make training faster but don't affect accuracy):
model.config.use_cache = False
old_state_dict = model.state_dict
# patch state_dict so that saved checkpoints contain only the (small) LoRA adapter weights
model.state_dict = (lambda self, *_, **__: get_peft_model_state_dict(self, old_state_dict())).__get__(
    model, type(model)
)
if torch.__version__ >= "2" and sys.platform != "win32":
    print("compiling the model")
    model = torch.compile(model)
# %%
gpus = gpustat.new_query()
[print(gpu) for gpu in gpus]
trainer.train()
# Save trained model
new_model = "../models/sql-code-llama"
trainer.model.save_pretrained(new_model)
gpus = gpustat.new_query()
[print(gpu) for gpu in gpus]
# %% [markdown]
# ### Load the final checkpoint
# Now for the moment of truth! Has our work paid off...?
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, AutoTokenizer
from peft import PeftModel
base_model_name = "codellama/CodeLlama-7b-hf"
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    load_in_8bit=True,
    torch_dtype=torch.float16,
    device_map="auto",
)
model = PeftModel.from_pretrained(base_model, new_model)
#model = model.merge_and_unload()
tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
# %% [markdown]
# To load a fine-tuned LoRA/QLoRA adapter, use PeftModel.from_pretrained.
# `output_dir` should be a directory containing an adapter_config.json and adapter_model.bin.
# The run took forever, so all I have is this checkpoint:
# output_dir = "./sql-code-llama/checkpoint-360"
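# %% [markdown]
# For example (a sketch, assuming that checkpoint directory exists with both files, and
# starting from a freshly loaded base_model rather than the one already wrapped above):
# %%
# output_dir = "./sql-code-llama/checkpoint-360"
# model = PeftModel.from_pretrained(base_model, output_dir)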
# %% [markdown]
# Try the same prompt as before:
eval_prompt = """You are a powerful text-to-SQL model. Your job is to answer questions about a database. You are given a question and context regarding one or more tables.
You must output the SQL query that answers the question.
### Input:
Which Class has a Frequency MHz larger than 91.5, and a City of license of hyannis, nebraska?
### Context:
CREATE TABLE table_name_12 (class VARCHAR, frequency_mhz VARCHAR, city_of_license VARCHAR)
### Response:
"""
model_input = tokenizer(eval_prompt, return_tensors="pt").to("cuda")
model.eval()
with torch.no_grad():
    print(tokenizer.decode(model.generate(**model_input, max_new_tokens=100)[0], skip_special_tokens=True))
# %% [markdown]
# And the model outputs:
# ```
# SELECT class FROM table_name_12 WHERE frequency_mhz > 91.5 AND city_of_license = "hyannis, nebraska"
# ```
# So it works! If you want to convert this adapter to a Llama.cpp model to run locally, follow my other [guide](https://ragntune.com/blog/A-guide-to-running-Llama-2-qlora-loras-on-Llama.cpp). If you have any questions, shoot me a message on [Elon Musk's website](https://twitter.com/samlhuillier_).
#
file-requirements-debug.txt:
git+https://github.com/huggingface/transformers.git@main  # we need latest transformers for this
bitsandbytes
git+https://github.com/huggingface/peft.git@4c611f4
datasets==2.10.1
wandb
scipy
gpustat
pytz
tensorboardX
@cmosguy (Author) commented Sep 21, 2023:

@samlhuillier

Sorry to keep bothering you about this, but I am still having issues.

I just tried running this code and using the versions:

transformers @ git+https://github.com/huggingface/transformers.git@17fdd35481e6b462989c1c600e6cc0987dc88621
peft @ git+https://github.com/huggingface/peft.git@4c611f40b4c605c0cf8cd3e38b5c462341025fd4

and I still cannot get it to save the updated weights with the LoRA adapters. I did the check using this code:

import os
def check_binary_files_identical(file_path1, file_path2):
    # Check if the size of both files is the same
    if os.path.getsize(file_path1) != os.path.getsize(file_path2):
        return False
    
    # Open both files and compare their contents
    with open(file_path1, 'rb') as file1, open(file_path2, 'rb') as file2:
        while True:
            chunk1 = file1.read(8192) # Reading in chunks of 8192 bytes
            chunk2 = file2.read(8192)

            # If the chunks are different, return False
            if chunk1 != chunk2:
                return False

            # If both chunks are empty, it means we have reached the end of both files and they are identical
            if not chunk1:
                return True

file_path1 = "../checkpoints/checkpoint-300/adapter_model.bin"
file_path2 = "../checkpoints/checkpoint-400/adapter_model.bin"

result = check_binary_files_identical(file_path1, file_path2)
print("Are the files identical?", result)

Are you getting the same results?

@samlhuillier commented:

Hey try these:
!pip install git+https://github.com/huggingface/transformers.git@main bitsandbytes accelerate==0.20.3 # we need latest transformers for this
!pip install git+https://github.com/huggingface/peft.git@e536616888d51b453ed354a6f1e243fecb02ea08
!pip install datasets==2.10.1
import locale # colab workaround
locale.getpreferredencoding = lambda: "UTF-8" # colab workaround
!pip install wandb
!pip install scipy

I suspect they may work...

@cmosguy (Author) commented Sep 22, 2023:

Thanks @samlhuillier !

The issue is that if you install everything using the requirements file (file-requirements-debug.txt, attached to this gist), the patching of peft does not work. It has to be installed outside the requirements file for it to really work as it should. Very weird...
