@cmosguy · Last active September 22, 2023
debugging a codellama finetuning
# %%
from datetime import datetime
import os
import sys
import gpustat
gpus = gpustat.new_query()
[print(gpu) for gpu in gpus]
import torch
from peft import (
    LoraConfig,
    get_peft_model,
    get_peft_model_state_dict,
    prepare_model_for_int8_training,
    set_peft_model_state_dict,
)
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForSeq2Seq
from datasets import load_dataset
dataset = load_dataset("b-mc2/sql-create-context", split="train")
# split once (with a fixed seed) so the train and eval sets come from the same, non-overlapping split
split_dataset = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]
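# %% [markdown]
# As a quick optional sanity check (not required for training), peek at the split sizes
# and one example row; each record should have 'question', 'context', and 'answer' fields:
# %%
print(train_dataset)
print(eval_dataset)
print(train_dataset[0])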
# %%
base_model_name = "codellama/CodeLlama-7b-hf"
model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    load_in_8bit=True,
    torch_dtype=torch.float16,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
# %%
eval_prompt = """You are a powerful text-to-SQL model. Your job is to answer questions about a database. You are given a question and context regarding one or more tables.
You must output the SQL query that answers the question.
### Input:
Which Class has a Frequency MHz larger than 91.5, and a City of license of hyannis, nebraska?
### Context:
CREATE TABLE table_name_12 (class VARCHAR, frequency_mhz VARCHAR, city_of_license VARCHAR)
### Response:
"""
# {'question': 'Name the comptroller for office of prohibition', 'context': 'CREATE TABLE table_22607062_1 (comptroller VARCHAR, ticket___office VARCHAR)', 'answer': 'SELECT comptroller FROM table_22607062_1 WHERE ticket___office = "Prohibition"'}
model_input = tokenizer(eval_prompt, return_tensors="pt").to("cuda")
model.eval()
with torch.no_grad():
    print(tokenizer.decode(model.generate(**model_input, max_new_tokens=100)[0], skip_special_tokens=True))
# %% [markdown]
# I get the output:
# ```
# SELECT * FROM table_name_12 WHERE class > 91.5 AND city_of_license = 'hyannis, nebraska'
# ```
# which is clearly wrong, since the question asks for just the Class!
# %% [markdown]
# ### 4. Tokenization
# Set up some tokenization settings, like left padding, because it makes [training use less memory](https://ai.stackexchange.com/questions/41485/while-fine-tuning-a-decoder-only-llm-like-llama-on-chat-dataset-what-kind-of-pa):
tokenizer.add_eos_token = True
tokenizer.pad_token_id = 0
tokenizer.padding_side = "left"
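# %% [markdown]
# A small optional illustration of what left padding does, assuming the settings above:
# when a batch is padded, pad_token_id (0) is prepended, so the real prompt tokens sit at
# the end of each sequence, which is where next-token prediction happens.
# %%
demo_batch = tokenizer(["short prompt", "a somewhat longer prompt"], padding=True)
print(demo_batch["input_ids"][0])  # expect leading 0s before the shorter prompt's tokens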
# %% [markdown]
# Set up the tokenize function so that labels and input_ids are the same. This is basically what [self-supervised fine-tuning](https://neptune.ai/blog/self-supervised-learning) is:
# %%
def tokenize(prompt):
    result = tokenizer(
        prompt,
        truncation=True,
        max_length=512,
        padding=False,
        return_tensors=None,
    )
    # "self-supervised learning" means the labels are also the inputs:
    result["labels"] = result["input_ids"].copy()
    return result
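# %% [markdown]
# An optional spot check of tokenize() on a toy prompt; the labels should simply mirror
# the input_ids:
# %%
example = tokenize("SELECT 1;")
print(example["input_ids"][:10])
print(example["labels"][:10])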
# %% [markdown]
# And convert each data_point into a prompt format that I found online and that works quite well:
def generate_and_tokenize_prompt(data_point):
    full_prompt = f"""You are a powerful text-to-SQL model. Your job is to answer questions about a database. You are given a question and context regarding one or more tables.
You must output the SQL query that answers the question.
### Input:
{data_point["question"]}
### Context:
{data_point["context"]}
### Response:
{data_point["answer"]}
"""
    return tokenize(full_prompt)
# %% [markdown]
# Reformat each sample into a prompt and tokenize it:
tokenized_train_dataset = train_dataset.map(generate_and_tokenize_prompt)
tokenized_val_dataset = eval_dataset.map(generate_and_tokenize_prompt)
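# %% [markdown]
# Optionally decode one tokenized example back to text to confirm the prompt template and
# the answer survived the round trip:
# %%
print(tokenizer.decode(tokenized_train_dataset[0]["input_ids"]))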
# %% [markdown]
# ### 5. Setup Lora
model.train() # put model back into training mode
model = prepare_model_for_int8_training(model)
config = LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
    ],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, config)
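# %% [markdown]
# An optional check that only the LoRA adapter weights are trainable (the 7B base weights
# stay frozen):
# %%
model.print_trainable_parameters()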
# %% [markdown]
# To resume from a checkpoint, set resume_from_checkpoint to the path of the adapter_model.bin you want to resume from. This code will replace the LoRA adapter attached to the model:
resume_from_checkpoint = "" #"./sql-code-llama/checkpoint-380/adapter_model.bin" # set this to the adapter_model.bin file you want to resume from
if resume_from_checkpoint:
    if os.path.exists(resume_from_checkpoint):
        print(f"Restarting from {resume_from_checkpoint}")
        adapters_weights = torch.load(resume_from_checkpoint)
        set_peft_model_state_dict(model, adapters_weights)
    else:
        print(f"Checkpoint {resume_from_checkpoint} not found")
# %%
if torch.cuda.device_count() > 1:
    # keeps Trainer from trying its own DataParallelism when more than 1 gpu is available
    print("total devices: ", torch.cuda.device_count())
    model.is_parallelizable = True
    model.model_parallel = True
else:
    print("only 1 gpu available")
# %% [markdown]
# ### 6. Training arguments
# If you run out of GPU memory, change per_device_train_batch_size.
# The gradient_accumulation_steps variable should ensure this
# doesn't affect batch dynamics during the training run.
# All the other variables are standard stuff that I wouldn't
# recommend messing with:
batch_size = 128
per_device_train_batch_size = 32
gradient_accumulation_steps = batch_size // per_device_train_batch_size
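# for example: per_device_train_batch_size = 16 would give gradient_accumulation_steps = 128 // 16 = 8,
# keeping the effective batch size (per_device_train_batch_size * gradient_accumulation_steps) at 128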
output_dir = "../checkpoints"
training_args = TrainingArguments(
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    warmup_steps=100,
    max_steps=400,
    learning_rate=3e-4,
    fp16=True,
    bf16=False,  # can only be True if fp16 is False; you cannot enable both
    logging_steps=10,
    optim="adamw_torch",
    evaluation_strategy="steps",  # if val_set_size > 0 else "no",
    save_strategy="steps",
    eval_steps=20,
    save_steps=20,
    output_dir=output_dir,
    # save_total_limit=3,
    load_best_model_at_end=False,
    # ddp_find_unused_parameters=False if ddp else None,
    group_by_length=True,  # group sequences of roughly the same length together to speed up training
    report_to="tensorboard",  # if use_wandb else "none",
    run_name=f"codellama-{datetime.now().strftime('%Y-%m-%d-%H-%M')}",  # if use_wandb else None,
    logging_dir="../logs/runs",
)
trainer = Trainer(
    model=model,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    args=training_args,
    data_collator=DataCollatorForSeq2Seq(
        tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
    ),
)
# %% [markdown]
# Then we do some pytorch-related optimisations (which just make training faster but don't affect accuracy):
model.config.use_cache = False
old_state_dict = model.state_dict
# patch state_dict so that saved checkpoints contain only the (small) LoRA adapter weights
model.state_dict = (lambda self, *_, **__: get_peft_model_state_dict(self, old_state_dict())).__get__(
    model, type(model)
)
if torch.__version__ >= "2" and sys.platform != "win32":
    print("compiling the model")
    model = torch.compile(model)
# %%
gpus = gpustat.new_query()
[print(gpu) for gpu in gpus]
trainer.train()
# Save trained model
new_model = "../models/sql-code-llama"
trainer.model.save_pretrained(new_model)
gpus = gpustat.new_query()
[print(gpu) for gpu in gpus]
# %% [markdown]
# ### Load the final checkpoint
# Now for the moment of truth! Has our work paid off...?
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, AutoTokenizer
from peft import PeftModel
base_model_name = "codellama/CodeLlama-7b-hf"
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    load_in_8bit=True,
    torch_dtype=torch.float16,
    device_map="auto",
)
model = PeftModel.from_pretrained(base_model, new_model)
#model = model.merge_and_unload()
tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
# %% [markdown]
# To load a fine-tuned LoRA/QLoRA adapter, use PeftModel.from_pretrained.
# `output_dir` should be a directory containing an adapter_config.json and adapter_model.bin.
# The run took forever, so all I have is this checkpoint:
# output_dir = "./sql-code-llama/checkpoint-360"
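# %% [markdown]
# For example (a sketch, assuming that checkpoint directory exists with both files, and
# starting from a freshly loaded base_model rather than the one already wrapped above):
# %%
# output_dir = "./sql-code-llama/checkpoint-360"
# model = PeftModel.from_pretrained(base_model, output_dir)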
# %% [markdown]
# Try the same prompt as before:
eval_prompt = """You are a powerful text-to-SQL model. Your job is to answer questions about a database. You are given a question and context regarding one or more tables.
You must output the SQL query that answers the question.
### Input:
Which Class has a Frequency MHz larger than 91.5, and a City of license of hyannis, nebraska?
### Context:
CREATE TABLE table_name_12 (class VARCHAR, frequency_mhz VARCHAR, city_of_license VARCHAR)
### Response:
"""
model_input = tokenizer(eval_prompt, return_tensors="pt").to("cuda")
model.eval()
with torch.no_grad():
    print(tokenizer.decode(model.generate(**model_input, max_new_tokens=100)[0], skip_special_tokens=True))
# %% [markdown]
# And the model outputs:
# ```
# SELECT class FROM table_name_12 WHERE frequency_mhz > 91.5 AND city_of_license = "hyannis, nebraska"
# ```
# So it works! If you want to convert this adapter to a Llama.cpp model to run locally, follow my other [guide](https://ragntune.com/blog/A-guide-to-running-Llama-2-qlora-loras-on-Llama.cpp). If you have any questions, shoot me a message on [Elon Musk's website](https://twitter.com/samlhuillier_).
#
file-requirements-debug.txt:
git+https://github.com/huggingface/transformers.git@main  # we need latest transformers for this
bitsandbytes
git+https://github.com/huggingface/peft.git@4c611f4
datasets==2.10.1
wandb
scipy
gpustat
pytz
tensorboardX
@cmosguy (Author) commented Sep 21, 2023:

@samlhuillier

Sorry to keep bothering you about this, but I am still having issues.

I just tried running this code and using the versions:

transformers @ git+https://github.com/huggingface/transformers.git@17fdd35481e6b462989c1c600e6cc0987dc88621
peft @ git+https://github.com/huggingface/peft.git@4c611f40b4c605c0cf8cd3e38b5c462341025fd4

and I still cannot get it to save the updated weights with the LoRA adapters. I did the check using this code:

import os
def check_binary_files_identical(file_path1, file_path2):
    # Check if the size of both files is the same
    if os.path.getsize(file_path1) != os.path.getsize(file_path2):
        return False
    
    # Open both files and compare their contents
    with open(file_path1, 'rb') as file1, open(file_path2, 'rb') as file2:
        while True:
            chunk1 = file1.read(8192) # Reading in chunks of 8192 bytes
            chunk2 = file2.read(8192)

            # If the chunks are different, return False
            if chunk1 != chunk2:
                return False

            # If both chunks are empty, it means we have reached the end of both files and they are identical
            if not chunk1:
                return True

file_path1 = "../checkpoints/checkpoint-300/adapter_model.bin"
file_path2 = "../checkpoints/checkpoint-400/adapter_model.bin"

result = check_binary_files_identical(file_path1, file_path2)
print("Are the files identical?", result)

Are you getting the same results?

@samlhuillier commented:

Hey try these:
!pip install git+https://github.com/huggingface/transformers.git@main bitsandbytes accelerate==0.20.3 # we need latest transformers for this
!pip install git+https://github.com/huggingface/peft.git@e536616888d51b453ed354a6f1e243fecb02ea08
!pip install datasets==2.10.1
import locale # colab workaround
locale.getpreferredencoding = lambda: "UTF-8" # colab workaround
!pip install wandb
!pip install scipy

I suspect they may work...

@cmosguy (Author) commented Sep 22, 2023:

Thanks @samlhuillier !

The issue is that if you install everything using the requirements file (file-requirements-debug.txt, attached to this gist), the patching of peft does not work. It has to be installed outside the requirements file for it to really work as it should. Very weird...
