Skip to content

Instantly share code, notes, and snippets.

@pacman100
Created June 1, 2023 22:26
Show Gist options
  • Star 63 You must be signed in to star a gist
  • Fork 12 You must be signed in to fork a gist
  • Save pacman100/1731b41f7a90a87b457e8c5415ff1c14 to your computer and use it in GitHub Desktop.
Save pacman100/1731b41f7a90a87b457e8c5415ff1c14 to your computer and use it in GitHub Desktop.
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from dataclasses import dataclass, field
from typing import Optional
import torch
from datasets import load_dataset
from peft import LoraConfig
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
BitsAndBytesConfig,
HfArgumentParser,
AutoTokenizer,
TrainingArguments,
)
from peft.tuners.lora import LoraLayer
from trl import SFTTrainer
########################################################################
# This is a fully working simple example of how to use trl's SFTTrainer.
#
# This example fine-tunes any causal language model (GPT-2, GPT-Neo, etc.)
# using the SFTTrainer from trl; we leverage the PEFT library to fine-tune
# adapters on the model.
#
########################################################################
# Define and parse arguments.
@dataclass
class ScriptArguments:
    """
    Command-line options for the fine-tuning script.

    These arguments vary depending on how many GPUs you have, what their capacity and features are,
    and what size model you want to train. Every field becomes a ``--<field_name>`` flag when the
    class is handed to ``HfArgumentParser``.
    """

    local_rank: Optional[int] = field(default=-1, metadata={"help": "Used for multi-gpu"})

    # Batch sizing and optimizer hyper-parameters.
    per_device_train_batch_size: Optional[int] = field(default=4)
    per_device_eval_batch_size: Optional[int] = field(default=1)
    gradient_accumulation_steps: Optional[int] = field(default=4)
    learning_rate: Optional[float] = field(default=2e-4)
    max_grad_norm: Optional[float] = field(default=0.3)
    # Annotated as float (not int): the default is fractional and the parser
    # derives the command-line conversion from this annotation.
    weight_decay: Optional[float] = field(default=0.001)

    # LoRA adapter hyper-parameters.
    lora_alpha: Optional[int] = field(default=16)
    lora_dropout: Optional[float] = field(default=0.1)
    lora_r: Optional[int] = field(default=64)

    max_seq_length: Optional[int] = field(default=512)
    model_name: Optional[str] = field(
        default="tiiuae/falcon-7b",
        metadata={
            "help": "The model that you want to train from the Hugging Face hub. E.g. gpt2, gpt2-xl, bert, etc."
        },
    )
    dataset_name: Optional[str] = field(
        default="timdettmers/openassistant-guanaco",
        metadata={"help": "The dataset to use for supervised fine-tuning."},
    )

    # 4-bit quantization settings for the base model.
    use_4bit: Optional[bool] = field(
        default=True,
        metadata={"help": "Activate 4bit precision base model loading"},
    )
    use_nested_quant: Optional[bool] = field(
        default=False,
        metadata={"help": "Activate nested quantization for 4bit base models"},
    )
    bnb_4bit_compute_dtype: Optional[str] = field(
        default="float16",
        metadata={"help": "Compute dtype for 4bit base models"},
    )
    bnb_4bit_quant_type: Optional[str] = field(
        default="nf4",
        metadata={"help": "Quantization type fp4 or nf4"},
    )

    # Training-loop settings.
    num_train_epochs: Optional[int] = field(
        default=1,
        metadata={"help": "The number of training epochs."},
    )
    fp16: Optional[bool] = field(
        default=False,
        metadata={"help": "Enables fp16 training."},
    )
    bf16: Optional[bool] = field(
        default=False,
        metadata={"help": "Enables bf16 training."},
    )
    packing: Optional[bool] = field(
        default=False,
        metadata={"help": "Use packing dataset creating."},
    )
    gradient_checkpointing: Optional[bool] = field(
        default=True,
        metadata={"help": "Enables gradient checkpointing."},
    )
    optim: Optional[str] = field(
        default="paged_adamw_32bit",
        metadata={"help": "The optimizer to use."},
    )
    lr_scheduler_type: str = field(
        default="constant",
        metadata={"help": "Learning rate schedule. Constant a bit better than cosine, and has advantage for analysis"},
    )
    max_steps: int = field(default=10000, metadata={"help": "How many optimizer update steps to take"})
    warmup_ratio: float = field(default=0.03, metadata={"help": "Fraction of steps to do a warmup for"})
    group_by_length: bool = field(
        default=True,
        metadata={
            "help": "Group sequences into batches with same length. Saves memory and speeds up training considerably."
        },
    )
    save_steps: int = field(default=10, metadata={"help": "Save checkpoint every X updates steps."})
    logging_steps: int = field(default=10, metadata={"help": "Log every X updates steps."})
# Turn the command-line flags into one ScriptArguments instance; only a single
# dataclass is registered, so the parser yields a one-element tuple.
parser = HfArgumentParser(ScriptArguments)
(script_args,) = parser.parse_args_into_dataclasses()
def create_and_prepare_model(args):
    """Build the quantized base model, the LoRA configuration and the tokenizer.

    Args:
        args: Parsed script arguments. The attributes read here are
            ``model_name``, ``use_4bit``, ``use_nested_quant``,
            ``bnb_4bit_compute_dtype``, ``bnb_4bit_quant_type``,
            ``lora_alpha``, ``lora_dropout`` and ``lora_r``.

    Returns:
        A ``(model, peft_config, tokenizer)`` tuple.
    """
    # Resolve the dtype name (e.g. "float16") to the torch dtype object.
    compute_dtype = getattr(torch, args.bnb_4bit_compute_dtype)

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=args.use_4bit,
        bnb_4bit_quant_type=args.bnb_4bit_quant_type,
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=args.use_nested_quant,
    )

    # Purely informational hint: devices with compute capability >= 8 get a
    # suggestion to switch to --bf16.
    if compute_dtype == torch.float16 and args.use_4bit:
        major, _ = torch.cuda.get_device_capability()
        if major >= 8:
            print("=" * 80)
            print("Your GPU supports bfloat16, you can accelerate training with the argument --bf16")
            print("=" * 80)

    # Map the whole model (empty module prefix) onto device 0.
    device_map = {"": 0}

    model = AutoModelForCausalLM.from_pretrained(
        args.model_name, quantization_config=bnb_config, device_map=device_map, trust_remote_code=True
    )

    # All hyper-parameters come from ``args`` (not the module-level
    # ``script_args``) so the function honours whatever it is called with.
    peft_config = LoraConfig(
        lora_alpha=args.lora_alpha,
        lora_dropout=args.lora_dropout,
        r=args.lora_r,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=[
            "query_key_value",
            "dense",
            "dense_h_to_4h",
            "dense_4h_to_h",
        ],  # , "word_embeddings", "lm_head"],
    )

    tokenizer = AutoTokenizer.from_pretrained(args.model_name, trust_remote_code=True)
    # Reuse the end-of-sequence token for padding.
    tokenizer.pad_token = tokenizer.eos_token

    return model, peft_config, tokenizer
# Trainer hyper-parameters, all taken from the parsed command-line arguments.
# weight_decay, num_train_epochs and per_device_eval_batch_size are forwarded
# too, so the corresponding flags are no longer silently ignored.
# NOTE(review): script_args.gradient_checkpointing is still not forwarded;
# confirm the intended behaviour before wiring it in.
training_arguments = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=script_args.per_device_train_batch_size,
    per_device_eval_batch_size=script_args.per_device_eval_batch_size,
    gradient_accumulation_steps=script_args.gradient_accumulation_steps,
    optim=script_args.optim,
    save_steps=script_args.save_steps,
    logging_steps=script_args.logging_steps,
    learning_rate=script_args.learning_rate,
    weight_decay=script_args.weight_decay,
    fp16=script_args.fp16,
    bf16=script_args.bf16,
    max_grad_norm=script_args.max_grad_norm,
    # A positive max_steps takes precedence over num_train_epochs.
    num_train_epochs=script_args.num_train_epochs,
    max_steps=script_args.max_steps,
    warmup_ratio=script_args.warmup_ratio,
    group_by_length=script_args.group_by_length,
    lr_scheduler_type=script_args.lr_scheduler_type,
)

model, peft_config, tokenizer = create_and_prepare_model(script_args)
# Turn off the generation cache for training.
model.config.use_cache = False

dataset = load_dataset(script_args.dataset_name, split="train")

trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=script_args.max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=script_args.packing,
)

# Adjust per-module dtypes before training. nn.Module.to() modifies the module
# in place, so the loop variable does not need to be rebound.
for name, module in trainer.model.named_modules():
    # LoRA layers run in bfloat16 when --bf16 is requested.
    if isinstance(module, LoraLayer) and script_args.bf16:
        module.to(torch.bfloat16)
    # Modules with "norm" in their name are kept in float32.
    if "norm" in name:
        module.to(torch.float32)
    # Output head / embeddings: cast float32 weights to bfloat16 under --bf16.
    if "lm_head" in name or "embed_tokens" in name:
        if hasattr(module, "weight"):
            if script_args.bf16 and module.weight.dtype == torch.float32:
                module.to(torch.bfloat16)

trainer.train()
@imrankh46
Copy link

@imrankh46 definitely depends on the gpu and float type etc. for me it's much much less on H100 or A100 (prob around 10 seconds or less if i had to guess but haven't measured)

I am using the following code.

import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Adapter checkpoint to load; its config records which base model it was trained on.
peft_model_id = "mymodel"
config = PeftConfig.from_pretrained(peft_model_id)

# 4-bit NF4 quantization with double quantization and bfloat16 compute.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Load the quantized base model named in the adapter config onto device 0.
model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    quantization_config=bnb_config,
    device_map={"":0},
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
# Reuse the end-of-sequence token for padding.
tokenizer.pad_token = tokenizer.eos_token

# Load the LoRA adapter weights on top of the base model.
model = PeftModel.from_pretrained(model, peft_model_id)

@nshalon
Copy link

nshalon commented Jul 10, 2023

@imrankh46 what GPU?

@imrankh46
Copy link

@imrankh46 what GPU?

Colab free GPU

@okoliechykwuka
Copy link

ValueError: Some specified arguments are not used by the HfArgumentParser: ['-f', '/root/.local/share/jupyter/runtime/kernel-a3d273a8-3e31-4ff4-b37c-9a51367dc60d.json']

I am having the above error.

I installed the required packages using this.

!pip install -q -U git+https://github.com/lvwerra/trl.git git+https://github.com/huggingface/transformers.git git+https://github.com/huggingface/accelerate.git git+https://github.com/huggingface/peft.git
!pip install -q datasets bitsandbytes einops wandb

@nshalon
Copy link

nshalon commented Jul 10, 2023

out of curiosity, has anyone benchmarked or has a gut feel on the performance of mosaic-instruct 7b / 30b vs falcon-instruct 7b / 40b for any tasks? which is better?

@amnasher
Copy link

@amnasher are you running on cpu or gpu? do you call model.to("cuda") or pass in a device anywhere? my guess is you're running on CPU :)

Yes I do assign DEVICE = "cuda:0" ? How is it running on CPU? I am using colab free.

@anshoomehra
Copy link

@anshoomehra btw - I'm getting OOM while trying to run inference on an 80GB gpu with falcon-40B-instruct (which is args.model_name) based on your code snippet. below is an example, which still is OOM without the PeftModel() call and/or without passing a config into the original model loading. any idea or did i miss something in the code snippet or something? many thanks. i assume the PeftModel() call is totally unnecessary btw - i tried without it as well and still OOM.


# Report which model is about to be loaded (`args` is defined outside this snippet).
print(f"loading the bits and btyes config + PEFT model for {args.model_name} and is_base={args.is_base}")
# Quantization settings: 4-bit NF4 with double quantization and bfloat16 compute.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True
)

# Load the quantized base model; `model_cls` and `config` are defined outside this snippet.
model = model_cls.from_pretrained(
    args.model_name,
    config=config,
    quantization_config=bnb_config,
    trust_remote_code=True,
    device_map="auto"
)

# Turn it into a PEFT model by loading the adapters stored under args.ckpt_dir.
model = PeftModel.from_pretrained(
    model, args.ckpt_dir
)

@nshalon hmm, that's weird you should be able to load it in approx ~40G. I am attaching the full script here & a memory profile screenshot. This should help you enable Peft as well, in the last one I rush-sent a simpler version that did not have all Peft-related code. I do not see Peft reducing memory footprint beyond 4-bits BnB however the trainable parameters would come down further by certain %.

This script may help others too.

Would you mind sharing your training script? Not sure, if you sent it earlier and I missed it? (a lot of messages since then)

**With or without PEFT, ~40G memory profile:**

image

Code:

import torch
from datasets import load_dataset
from peft import LoraConfig, get_peft_model
from transformers import (
    Trainer,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    AutoTokenizer,
    TrainingArguments,
)

def create_and_prepare_model(model_name="tiiuae/falcon-40b-instruct", enablePEFT=False):
    """Load a 4-bit quantized causal LM plus tokenizer, optionally wrapped with LoRA adapters.

    Args:
        model_name: Hub identifier of the model and tokenizer to load.
        enablePEFT: When true, the model is passed through ``get_peft_model``
            with the LoRA configuration built below.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    # 4-bit NF4 quantization, double quantization, bfloat16 compute.
    quantization = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True,
    )
    base_model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=quantization,
        trust_remote_code=True,
        device_map="auto",
    )

    # Module names that receive LoRA adapters.
    lora_targets = ["query_key_value", "dense", "dense_h_to_4h", "dense_4h_to_h"]
    lora_settings = LoraConfig(
        lora_alpha=32,
        lora_dropout=0.05,
        r=64,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=lora_targets,
    )

    tok = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    tok.pad_token = tok.eos_token

    base_model.gradient_checkpointing_enable()

    if not enablePEFT:
        return base_model, tok
    return get_peft_model(base_model, lora_settings), tok

# Example call: load Falcon-40B-Instruct with the LoRA adapters enabled.
_m, _t = create_and_prepare_model("tiiuae/falcon-40b-instruct", True)

@Durbangash
Copy link

hello guys, have you tried exploiting W&B checkpoints, hard to load them no proper guidance on the site and gc flushes all the data after session is closed and i think its easy in tensorboard by loading the latest or whatever checkpoint you gonna use in inference.??

@Durbangash
Copy link

checkout this video on 40b instruct by sentdex on youtube if you are interested in finding out more about the power of 40b!!
https://www.youtube.com/watch?v=-IV1NTGy6Mg

@nshalon
Copy link

nshalon commented Jul 11, 2023

@anshoomehra thanks! that worked... BTW bitsandbytes has some compatibility issues with H100 GPUs, curious if there's another way to load the 40B Falcon on an 80GB GPU...

I'm hesitant to put the training script on public forum since i'm working on some proprietary stuff, but if you email me I can send it to you directly (nitan@shalon.com).

But here's the DeepSpeed ZeRO config that I'm using in case that's helpful.

{
    "bf16": {
      "enabled": true
    },
    "optimizer": {
      "type": "AdamW",
      "params": {
        "lr": "auto",
        "betas": "auto",
        "eps": "auto",
        "weight_decay": "auto"
      }
    },
    "zero_optimization": {
        "stage": 3,
        "contiguous_gradients": true,
        "overlap_comm": true,
        "stage3_gather_16bit_weights_on_model_save": true,
        "offload_optimizer": {
            "device": "cpu",
            "pin_memory": true
        },
        "offload_param": {
            "device": "cpu",
            "pin_memory": true
        },
        "sub_group_size": 1e9,
        "reduce_bucket_size": "auto",
        "stage3_prefetch_bucket_size": "auto",
        "stage3_param_persistence_threshold": "auto",
        "stage3_max_live_parameters": 1e9,
        "stage3_max_reuse_distance": 1e9
    },
    "gradient_accumulation_steps": 344,
    "gradient_clipping": "auto",
    "steps_per_print": 5,
    "train_micro_batch_size_per_gpu": 1,
    "train_batch_size": 1032,
    "wall_clock_breakdown": false,
    "scheduler": {
        "type": "WarmupDecayLR",
        "params": {
          "total_num_steps": "auto",
          "warmup_min_lr": "auto",
          "warmup_max_lr": "auto",
          "warmup_num_steps": "auto"
        }
    },
    "activation_checkpointing": {
      "partition_activations": false,
      "cpu_checkpointing": false,
      "contiguous_memory_optimization": false,
      "number_checkpoints": null,
      "synchronize_checkpoint_boundary": false,
      "profile": false
    }
}

@brando90
Copy link

has anyone been able to have this script run at all on something not in colab?

I keep getting a matrix multiplication shape mismatch, as detailed here: huggingface/peft#685

Error

===================================BUG REPORT===================================
Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
================================================================================
bin /lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda117.so
CUDA SETUP: CUDA runtime path found: /lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/libcudart.so.11.0
CUDA SETUP: Highest compute capability among GPUs detected: 8.0
CUDA SETUP: Detected CUDA version 117
CUDA SETUP: Loading binary /lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda117.so...
mode='disabled'
run=
report_to='none'
{'report_to': 'none', 'path2config': '/lfs/ampere1/0/brando9/ultimate-utils/ultimate-utils-proj-src/uutils/wandb_uu/sweep_configs/debug_config.yaml', 'program': '~/ultimate-utils/ultimate-utils-proj-src/uutils/wandb_uu/sweeps_common.py', 'project': 'playground', 'entity': 'brando', 'name': 'debug-logging-to-wandb-plataform-test', 'description': 'debug-not-logging-to-wandb-plataform-test', 'metric': {'name': 'train_loss', 'goal': 'minimize'}, 'method': 'random', 'optimizer': 'nadam', 'scheduler': 'cosine', 'lr': 0.0001, 'batch_size': 32, 'num_its': 2, 'run_cap': 1}
Found cached dataset json (/lfs/ampere1/0/brando9/.cache/huggingface/datasets/timdettmers___json/timdettmers--openassistant-guanaco-6126c710748182cf/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96)
Found cached dataset json (/lfs/ampere1/0/brando9/.cache/huggingface/datasets/timdettmers___json/timdettmers--openassistant-guanaco-6126c710748182cf/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96)
Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:15<00:00,  2.00s/it]
Checking for dtype of HF model.
--> Weight 'transformer.word_embeddings.weight' has datatype: torch.float16
Loading cached processed dataset at /lfs/ampere1/0/brando9/.cache/huggingface/datasets/timdettmers___json/timdettmers--openassistant-guanaco-6126c710748182cf/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/cache-3631bcc54f6b2e69.arrow
  0%|                                                                                                                           | 0/500 [00:00<?, ?it/s]You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Traceback (most recent call last):
  File "/lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/pdb.py", line 1723, in main
    pdb._runscript(mainpyfile)
  File "/lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/pdb.py", line 1583, in _runscript
    self.run(statement)
  File "/lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/bdb.py", line 598, in run
    exec(cmd, globals, locals)
  File "<string>", line 1, in <module>
  File "/afs/cs.stanford.edu/u/brando9/ultimate-utils/ultimate-utils-proj-src/uutils/hf_uu/mains_hf/falcon_uu/main_falcon_uu.py", line 34, in <module>
    main_falcon()
  File "/afs/cs.stanford.edu/u/brando9/ultimate-utils/ultimate-utils-proj-src/uutils/hf_uu/mains_hf/falcon_uu/main_falcon_uu.py", line 21, in main_falcon
    train(args)
  File "/afs/cs.stanford.edu/u/brando9/ultimate-utils/ultimate-utils-proj-src/uutils/hf_uu/train/sft/qlora_ft.py", line 59, in train_falcon
    trainer.train()
  File "/lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/site-packages/transformers/trainer.py", line 1645, in train
    return inner_training_loop(
  File "/lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/site-packages/transformers/trainer.py", line 1938, in _inner_training_loop
    tr_loss_step = self.training_step(model, inputs)
  File "/lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/site-packages/transformers/trainer.py", line 2759, in training_step
    loss = self.compute_loss(model, inputs)
  File "/lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/site-packages/transformers/trainer.py", line 2784, in compute_loss
    outputs = model(**inputs)
  File "/lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/site-packages/accelerate/utils/operations.py", line 553, in forward
    return model_forward(*args, **kwargs)
  File "/lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/site-packages/accelerate/utils/operations.py", line 541, in __call__
    return convert_to_fp32(self.model_forward(*args, **kwargs))
  File "/lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/site-packages/torch/amp/autocast_mode.py", line 14, in decorate_autocast
    return func(*args, **kwargs)
  File "/lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/site-packages/peft/peft_model.py", line 678, in forward
    return self.base_model(
  File "/lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/site-packages/accelerate/hooks.py", line 165, in new_forward
    output = old_forward(*args, **kwargs)
  File "/lfs/ampere1/0/brando9/.cache/huggingface/modules/transformers_modules/tiiuae/falcon-7b/2f5c3cd4eace6be6c0f12981f377fb35e5bf6ee5/modelling_RW.py", line 753, in forward
    transformer_outputs = self.transformer(
  File "/lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/site-packages/accelerate/hooks.py", line 165, in new_forward
    output = old_forward(*args, **kwargs)
  File "/lfs/ampere1/0/brando9/.cache/huggingface/modules/transformers_modules/tiiuae/falcon-7b/2f5c3cd4eace6be6c0f12981f377fb35e5bf6ee5/modelling_RW.py", line 648, in forward
    outputs = block(
  File "/lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/site-packages/accelerate/hooks.py", line 165, in new_forward
    output = old_forward(*args, **kwargs)
  File "/lfs/ampere1/0/brando9/.cache/huggingface/modules/transformers_modules/tiiuae/falcon-7b/2f5c3cd4eace6be6c0f12981f377fb35e5bf6ee5/modelling_RW.py", line 385, in forward
    attn_outputs = self.self_attention(
  File "/lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/site-packages/accelerate/hooks.py", line 165, in new_forward
    output = old_forward(*args, **kwargs)
  File "/lfs/ampere1/0/brando9/.cache/huggingface/modules/transformers_modules/tiiuae/falcon-7b/2f5c3cd4eace6be6c0f12981f377fb35e5bf6ee5/modelling_RW.py", line 242, in forward
    fused_qkv = self.query_key_value(hidden_states)  # [batch_size, seq_length, 3 x hidden_size]
  File "/lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/site-packages/peft/tuners/lora.py", line 565, in forward
    result = F.linear(x, transpose(self.weight, self.fan_in_fan_out), bias=self.bias)
RuntimeError: mat1 and mat2 shapes cannot be multiplied (2048x4544 and 1x10614784)

system info

python collect_env.py
Collecting environment information...
PyTorch version: 2.0.1
Is debug build: False
CUDA used to build PyTorch: 11.7
ROCM used to build PyTorch: N/A

OS: Ubuntu 20.04.4 LTS (x86_64)
GCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.4.0
Clang version: Could not collect
CMake version: version 3.26.4
Libc version: glibc-2.31

Python version: 3.10.11 (main, May 16 2023, 00:28:57) [GCC 11.2.0] (64-bit runtime)
Python platform: Linux-5.4.0-122-generic-x86_64-with-glibc2.31
Is CUDA available: True
CUDA runtime version: 11.7.64
CUDA_MODULE_LOADING set to: LAZY
GPU models and configuration:
GPU 0: NVIDIA A100-SXM4-80GB
GPU 1: NVIDIA A100-SXM4-80GB
GPU 2: NVIDIA A100-SXM4-80GB
GPU 3: NVIDIA A100-SXM4-80GB
GPU 4: NVIDIA A100-SXM4-80GB
GPU 5: NVIDIA A100-SXM4-80GB
GPU 6: NVIDIA A100-SXM4-80GB
GPU 7: NVIDIA A100-SXM4-80GB

Nvidia driver version: 515.43.04
cuDNN version: Could not collect
HIP runtime version: N/A
MIOpen runtime version: N/A
Is XNNPACK available: True

CPU:
Architecture:                    x86_64
CPU op-mode(s):                  32-bit, 64-bit
Byte Order:                      Little Endian
Address sizes:                   48 bits physical, 48 bits virtual
CPU(s):                          128
On-line CPU(s) list:             0-127
Thread(s) per core:              2
Core(s) per socket:              32
Socket(s):                       2
NUMA node(s):                    2
Vendor ID:                       AuthenticAMD
CPU family:                      25
Model:                           1
Model name:                      AMD EPYC 7543 32-Core Processor
Stepping:                        1
Frequency boost:                 enabled
CPU MHz:                         3455.484
CPU max MHz:                     2800.0000
CPU min MHz:                     1500.0000
BogoMIPS:                        5599.81
Virtualization:                  AMD-V
L1d cache:                       2 MiB
L1i cache:                       2 MiB
L2 cache:                        32 MiB
L3 cache:                        512 MiB
NUMA node0 CPU(s):               0-31,64-95
NUMA node1 CPU(s):               32-63,96-127
Vulnerability Itlb multihit:     Not affected
Vulnerability L1tf:              Not affected
Vulnerability Mds:               Not affected
Vulnerability Meltdown:          Not affected
Vulnerability Mmio stale data:   Not affected
Vulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp
Vulnerability Spectre v1:        Mitigation; usercopy/swapgs barriers and __user pointer sanitization
Vulnerability Spectre v2:        Mitigation; Retpolines, IBPB conditional, IBRS_FW, STIBP always-on, RSB filling
Vulnerability Srbds:             Not affected
Vulnerability Tsx async abort:   Not affected
Flags:                           fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 invpcid cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold v_vmsave_vmload vgif umip pku ospke vaes vpclmulqdq rdpid overflow_recov succor smca

Versions of relevant libraries:
[pip3] numpy==1.25.0
[pip3] torch==2.0.1
[pip3] torchaudio==2.0.2
[pip3] torchvision==0.15.2
[pip3] triton==2.0.0
[conda] blas                      1.0                         mkl
[conda] ffmpeg                    4.3                  hf484d3e_0    pytorch
[conda] mkl                       2023.1.0         h6d00ec8_46342
[conda] mkl-service               2.4.0           py310h5eee18b_1
[conda] mkl_fft                   1.3.6           py310h1128e8f_1
[conda] mkl_random                1.2.2           py310h1128e8f_1
[conda] numpy                     1.25.1                   pypi_0    pypi
[conda] numpy-base                1.25.0          py310hb5e798b_0
[conda] pytorch                   2.0.1           py3.10_cuda11.7_cudnn8.5.0_0    pytorch
[conda] pytorch-cuda              11.7                 h778d358_5    pytorch
[conda] pytorch-mutex             1.0                        cuda    pytorch
[conda] torchaudio                2.0.2               py310_cu117    pytorch
[conda] torchtriton               2.0.0                     py310    pytorch
[conda] torchvision               0.15.2              py310_cu117    pytorch
[conda] triton                    2.0.0                    pypi_0    pypi

@nshalon
Copy link

nshalon commented Jul 11, 2023 via email

@brando90
Copy link

@nshalon sorry I don't think I follow.

@lamwilton
Copy link

has anyone been able to have this script run at all on something not in colab?

I keep getting a matrix-multiplication shape mismatch, as detailed in huggingface/peft#685

Error

===================================BUG REPORT===================================
Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
================================================================================
bin /lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda117.so
CUDA SETUP: CUDA runtime path found: /lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/libcudart.so.11.0
CUDA SETUP: Highest compute capability among GPUs detected: 8.0
CUDA SETUP: Detected CUDA version 117
CUDA SETUP: Loading binary /lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda117.so...
mode='disabled'
run=
report_to='none'
{'report_to': 'none', 'path2config': '/lfs/ampere1/0/brando9/ultimate-utils/ultimate-utils-proj-src/uutils/wandb_uu/sweep_configs/debug_config.yaml', 'program': '~/ultimate-utils/ultimate-utils-proj-src/uutils/wandb_uu/sweeps_common.py', 'project': 'playground', 'entity': 'brando', 'name': 'debug-logging-to-wandb-plataform-test', 'description': 'debug-not-logging-to-wandb-plataform-test', 'metric': {'name': 'train_loss', 'goal': 'minimize'}, 'method': 'random', 'optimizer': 'nadam', 'scheduler': 'cosine', 'lr': 0.0001, 'batch_size': 32, 'num_its': 2, 'run_cap': 1}
Found cached dataset json (/lfs/ampere1/0/brando9/.cache/huggingface/datasets/timdettmers___json/timdettmers--openassistant-guanaco-6126c710748182cf/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96)
Found cached dataset json (/lfs/ampere1/0/brando9/.cache/huggingface/datasets/timdettmers___json/timdettmers--openassistant-guanaco-6126c710748182cf/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96)
Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:15<00:00,  2.00s/it]
Checking for dtype of HF model.
--> Weight 'transformer.word_embeddings.weight' has datatype: torch.float16
Loading cached processed dataset at /lfs/ampere1/0/brando9/.cache/huggingface/datasets/timdettmers___json/timdettmers--openassistant-guanaco-6126c710748182cf/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/cache-3631bcc54f6b2e69.arrow
  0%|                                                                                                                           | 0/500 [00:00<?, ?it/s]You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Traceback (most recent call last):
  File "/lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/pdb.py", line 1723, in main
    pdb._runscript(mainpyfile)
  File "/lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/pdb.py", line 1583, in _runscript
    self.run(statement)
  File "/lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/bdb.py", line 598, in run
    exec(cmd, globals, locals)
  File "<string>", line 1, in <module>
  File "/afs/cs.stanford.edu/u/brando9/ultimate-utils/ultimate-utils-proj-src/uutils/hf_uu/mains_hf/falcon_uu/main_falcon_uu.py", line 34, in <module>
    main_falcon()
  File "/afs/cs.stanford.edu/u/brando9/ultimate-utils/ultimate-utils-proj-src/uutils/hf_uu/mains_hf/falcon_uu/main_falcon_uu.py", line 21, in main_falcon
    train(args)
  File "/afs/cs.stanford.edu/u/brando9/ultimate-utils/ultimate-utils-proj-src/uutils/hf_uu/train/sft/qlora_ft.py", line 59, in train_falcon
    trainer.train()
  File "/lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/site-packages/transformers/trainer.py", line 1645, in train
    return inner_training_loop(
  File "/lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/site-packages/transformers/trainer.py", line 1938, in _inner_training_loop
    tr_loss_step = self.training_step(model, inputs)
  File "/lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/site-packages/transformers/trainer.py", line 2759, in training_step
    loss = self.compute_loss(model, inputs)
  File "/lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/site-packages/transformers/trainer.py", line 2784, in compute_loss
    outputs = model(**inputs)
  File "/lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/site-packages/accelerate/utils/operations.py", line 553, in forward
    return model_forward(*args, **kwargs)
  File "/lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/site-packages/accelerate/utils/operations.py", line 541, in __call__
    return convert_to_fp32(self.model_forward(*args, **kwargs))
  File "/lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/site-packages/torch/amp/autocast_mode.py", line 14, in decorate_autocast
    return func(*args, **kwargs)
  File "/lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/site-packages/peft/peft_model.py", line 678, in forward
    return self.base_model(
  File "/lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/site-packages/accelerate/hooks.py", line 165, in new_forward
    output = old_forward(*args, **kwargs)
  File "/lfs/ampere1/0/brando9/.cache/huggingface/modules/transformers_modules/tiiuae/falcon-7b/2f5c3cd4eace6be6c0f12981f377fb35e5bf6ee5/modelling_RW.py", line 753, in forward
    transformer_outputs = self.transformer(
  File "/lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/site-packages/accelerate/hooks.py", line 165, in new_forward
    output = old_forward(*args, **kwargs)
  File "/lfs/ampere1/0/brando9/.cache/huggingface/modules/transformers_modules/tiiuae/falcon-7b/2f5c3cd4eace6be6c0f12981f377fb35e5bf6ee5/modelling_RW.py", line 648, in forward
    outputs = block(
  File "/lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/site-packages/accelerate/hooks.py", line 165, in new_forward
    output = old_forward(*args, **kwargs)
  File "/lfs/ampere1/0/brando9/.cache/huggingface/modules/transformers_modules/tiiuae/falcon-7b/2f5c3cd4eace6be6c0f12981f377fb35e5bf6ee5/modelling_RW.py", line 385, in forward
    attn_outputs = self.self_attention(
  File "/lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/site-packages/accelerate/hooks.py", line 165, in new_forward
    output = old_forward(*args, **kwargs)
  File "/lfs/ampere1/0/brando9/.cache/huggingface/modules/transformers_modules/tiiuae/falcon-7b/2f5c3cd4eace6be6c0f12981f377fb35e5bf6ee5/modelling_RW.py", line 242, in forward
    fused_qkv = self.query_key_value(hidden_states)  # [batch_size, seq_length, 3 x hidden_size]
  File "/lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/site-packages/peft/tuners/lora.py", line 565, in forward
    result = F.linear(x, transpose(self.weight, self.fan_in_fan_out), bias=self.bias)
RuntimeError: mat1 and mat2 shapes cannot be multiplied (2048x4544 and 1x10614784)

system info

python collect_env.py
Collecting environment information...
PyTorch version: 2.0.1
Is debug build: False
CUDA used to build PyTorch: 11.7
ROCM used to build PyTorch: N/A

OS: Ubuntu 20.04.4 LTS (x86_64)
GCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.4.0
Clang version: Could not collect
CMake version: version 3.26.4
Libc version: glibc-2.31

Python version: 3.10.11 (main, May 16 2023, 00:28:57) [GCC 11.2.0] (64-bit runtime)
Python platform: Linux-5.4.0-122-generic-x86_64-with-glibc2.31
Is CUDA available: True
CUDA runtime version: 11.7.64
CUDA_MODULE_LOADING set to: LAZY
GPU models and configuration:
GPU 0: NVIDIA A100-SXM4-80GB
GPU 1: NVIDIA A100-SXM4-80GB
GPU 2: NVIDIA A100-SXM4-80GB
GPU 3: NVIDIA A100-SXM4-80GB
GPU 4: NVIDIA A100-SXM4-80GB
GPU 5: NVIDIA A100-SXM4-80GB
GPU 6: NVIDIA A100-SXM4-80GB
GPU 7: NVIDIA A100-SXM4-80GB

Nvidia driver version: 515.43.04
cuDNN version: Could not collect
HIP runtime version: N/A
MIOpen runtime version: N/A
Is XNNPACK available: True

CPU:
Architecture:                    x86_64
CPU op-mode(s):                  32-bit, 64-bit
Byte Order:                      Little Endian
Address sizes:                   48 bits physical, 48 bits virtual
CPU(s):                          128
On-line CPU(s) list:             0-127
Thread(s) per core:              2
Core(s) per socket:              32
Socket(s):                       2
NUMA node(s):                    2
Vendor ID:                       AuthenticAMD
CPU family:                      25
Model:                           1
Model name:                      AMD EPYC 7543 32-Core Processor
Stepping:                        1
Frequency boost:                 enabled
CPU MHz:                         3455.484
CPU max MHz:                     2800.0000
CPU min MHz:                     1500.0000
BogoMIPS:                        5599.81
Virtualization:                  AMD-V
L1d cache:                       2 MiB
L1i cache:                       2 MiB
L2 cache:                        32 MiB
L3 cache:                        512 MiB
NUMA node0 CPU(s):               0-31,64-95
NUMA node1 CPU(s):               32-63,96-127
Vulnerability Itlb multihit:     Not affected
Vulnerability L1tf:              Not affected
Vulnerability Mds:               Not affected
Vulnerability Meltdown:          Not affected
Vulnerability Mmio stale data:   Not affected
Vulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp
Vulnerability Spectre v1:        Mitigation; usercopy/swapgs barriers and __user pointer sanitization
Vulnerability Spectre v2:        Mitigation; Retpolines, IBPB conditional, IBRS_FW, STIBP always-on, RSB filling
Vulnerability Srbds:             Not affected
Vulnerability Tsx async abort:   Not affected
Flags:                           fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 invpcid cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold v_vmsave_vmload vgif umip pku ospke vaes vpclmulqdq rdpid overflow_recov succor smca

Versions of relevant libraries:
[pip3] numpy==1.25.0
[pip3] torch==2.0.1
[pip3] torchaudio==2.0.2
[pip3] torchvision==0.15.2
[pip3] triton==2.0.0
[conda] blas                      1.0                         mkl
[conda] ffmpeg                    4.3                  hf484d3e_0    pytorch
[conda] mkl                       2023.1.0         h6d00ec8_46342
[conda] mkl-service               2.4.0           py310h5eee18b_1
[conda] mkl_fft                   1.3.6           py310h1128e8f_1
[conda] mkl_random                1.2.2           py310h1128e8f_1
[conda] numpy                     1.25.1                   pypi_0    pypi
[conda] numpy-base                1.25.0          py310hb5e798b_0
[conda] pytorch                   2.0.1           py3.10_cuda11.7_cudnn8.5.0_0    pytorch
[conda] pytorch-cuda              11.7                 h778d358_5    pytorch
[conda] pytorch-mutex             1.0                        cuda    pytorch
[conda] torchaudio                2.0.2               py310_cu117    pytorch
[conda] torchtriton               2.0.0                     py310    pytorch
[conda] torchvision               0.15.2              py310_cu117    pytorch
[conda] triton                    2.0.0                    pypi_0    pypi

Same here!

Traceback (most recent call last):
  File "falcon_peft.py", line 222, in <module>
    trainer.train()
  File "/workspace/entity-identification/.venv/lib/python3.8/site-packages/transformers/trainer.py", line 1645, in train
    return inner_training_loop(
  File "/workspace/entity-identification/.venv/lib/python3.8/site-packages/transformers/trainer.py", line 1938, in _inner_training_loop
    tr_loss_step = self.training_step(model, inputs)
  File "/workspace/entity-identification/.venv/lib/python3.8/site-packages/transformers/trainer.py", line 2759, in training_step
    loss = self.compute_loss(model, inputs)
  File "/workspace/entity-identification/.venv/lib/python3.8/site-packages/transformers/trainer.py", line 2784, in compute_loss
    outputs = model(**inputs)
  File "/workspace/entity-identification/.venv/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/workspace/entity-identification/.venv/lib/python3.8/site-packages/torch/nn/parallel/data_parallel.py", line 171, in forward
    outputs = self.parallel_apply(replicas, inputs, kwargs)
  File "/workspace/entity-identification/.venv/lib/python3.8/site-packages/torch/nn/parallel/data_parallel.py", line 181, in parallel_apply
    return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
  File "/workspace/entity-identification/.venv/lib/python3.8/site-packages/torch/nn/parallel/parallel_apply.py", line 89, in parallel_apply
    output.reraise()
  File "/workspace/entity-identification/.venv/lib/python3.8/site-packages/torch/_utils.py", line 644, in reraise
    raise exception
RuntimeError: Caught RuntimeError in replica 0 on device 0.
Original Traceback (most recent call last):
  File "/workspace/entity-identification/.venv/lib/python3.8/site-packages/torch/nn/parallel/parallel_apply.py", line 64, in _worker
    output = module(*input, **kwargs)
  File "/workspace/entity-identification/.venv/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/workspace/entity-identification/.venv/lib/python3.8/site-packages/peft/peft_model.py", line 678, in forward
    return self.base_model(
  File "/workspace/entity-identification/.venv/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/workspace/entity-identification/.venv/lib/python3.8/site-packages/accelerate/hooks.py", line 165, in new_forward
    output = old_forward(*args, **kwargs)
  File "/root/.cache/huggingface/modules/transformers_modules/tiiuae/falcon-7b/2f5c3cd4eace6be6c0f12981f377fb35e5bf6ee5/modelling_RW.py", line 753, in forward
    transformer_outputs = self.transformer(
  File "/workspace/entity-identification/.venv/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/workspace/entity-identification/.venv/lib/python3.8/site-packages/accelerate/hooks.py", line 165, in new_forward
    output = old_forward(*args, **kwargs)
  File "/root/.cache/huggingface/modules/transformers_modules/tiiuae/falcon-7b/2f5c3cd4eace6be6c0f12981f377fb35e5bf6ee5/modelling_RW.py", line 648, in forward
    outputs = block(
  File "/workspace/entity-identification/.venv/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/workspace/entity-identification/.venv/lib/python3.8/site-packages/accelerate/hooks.py", line 165, in new_forward
    output = old_forward(*args, **kwargs)
  File "/root/.cache/huggingface/modules/transformers_modules/tiiuae/falcon-7b/2f5c3cd4eace6be6c0f12981f377fb35e5bf6ee5/modelling_RW.py", line 385, in forward
    attn_outputs = self.self_attention(
  File "/workspace/entity-identification/.venv/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/workspace/entity-identification/.venv/lib/python3.8/site-packages/accelerate/hooks.py", line 165, in new_forward
    output = old_forward(*args, **kwargs)
  File "/root/.cache/huggingface/modules/transformers_modules/tiiuae/falcon-7b/2f5c3cd4eace6be6c0f12981f377fb35e5bf6ee5/modelling_RW.py", line 242, in forward
    fused_qkv = self.query_key_value(hidden_states)  # [batch_size, seq_length, 3 x hidden_size]
  File "/workspace/entity-identification/.venv/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/workspace/entity-identification/.venv/lib/python3.8/site-packages/peft/tuners/lora.py", line 565, in forward
    result = F.linear(x, transpose(self.weight, self.fan_in_fan_out), bias=self.bias)
RuntimeError: mat1 and mat2 shapes cannot be multiplied (2048x4544 and 1x10614784)

@ahmedbr
Copy link

ahmedbr commented Jul 13, 2023

@anshoomehra will share when i get to it this weekend. how did you get the 40B (base or instruct) to do inference with 36GB? i'm seeing OOM with 80GB and standard eval mode loading. thanks.

Set load_in_8bit=True

This didn't work for me while running eval. any other suggestions?

@anshoomehra
Copy link

anshoomehra commented Jul 13, 2023

Same weird error, even with the base script and their dataset. Does anyone have a solution to this?

RuntimeError: mat1 and mat2 shapes cannot be multiplied

@alexanderfrey
Copy link

same here:
result = F.linear(x, transpose(self.weight, self.fan_in_fan_out), bias=self.bias)
RuntimeError: mat1 and mat2 shapes cannot be multiplied (2048x4544 and 1x10614784)

@alexanderfrey
Copy link

OK, it's fixed. You have to upgrade peft to 0.4.0.dev0.

@alexanderfrey
Copy link

alexanderfrey commented Jul 14, 2023

Just curious - does anyone specify the eval_dataset for SFTTrainer ?

@anshoomehra
Copy link

@alexanderfrey

  1. Eval set: you should be able to run without one, or, just to test things out, you can carve out a certain % of the training set.

  2. Tried upgrading to peft 0.4.0.dev0: the old error is gone, but a new one surfaced :-)

RuntimeError: Expected to mark a variable ready only once. This error is caused by one of the following reasons: 1) Use of a module parameter outside the forward function. Please make sure model parameters are not shared across multiple concurrent forward-backward passes. or try to use _set_static_graph() as a workaround if this module graph does not change during training loop.2) Reused parameters in multiple reentrant backward passes. For example, if you use multiple checkpoint functions to wrap the same part of your model, it would result in the same set of parameters been used by different reentrant backward passes multiple times, and hence marking a variable ready multiple times. DDP does not support such use cases in default. You can try to use _set_static_graph() as a workaround if your module graph does not change over iterations.
Parameter at index 255 has been marked as ready twice. This means that multiple autograd engine hooks have fired for this particular parameter during this iteration. You can set the environment variable TORCH_DISTRIBUTED_DEBUG to either INFO or DETAIL to print parameter names for further debugging.

@anshoomehra
Copy link

Does the above code work with multiple GPUs?

You will need accelerate or deepspeed.

@xzrderek
Copy link

xzrderek commented Jul 14, 2023

I'm not familiar with accelerate, could you tell me more about how I would implement it? Also how can I use my own dataset (i.e. if it is a JSON file) with the code above?

@vejvarm
Copy link

vejvarm commented Jul 17, 2023

@okoliechykwuka

ValueError: Some specified arguments are not used by the HfArgumentParser: ['-f', '/root/.local/share/jupyter/runtime/kernel-a3d273a8-3e31-4ff4-b37c-9a51367dc60d.json']

I am having the above error.

I installed the required packages using this.

!pip install -q -U git+https://github.com/lvwerra/trl.git git+https://github.com/huggingface/transformers.git git+https://github.com/huggingface/accelerate.git git+https://github.com/huggingface/peft.git
!pip install -q datasets bitsandbytes einops wandb

I think the problem is with jupyter adding the -f argument to sys.argv automatically when you run a cell.
You can fix it by passing args=[] to parser.parse_args_into_dataclasses, which makes the parser ignore the command-line arguments.
i.e. try changing the argument-parsing line to:
script_args = parser.parse_args_into_dataclasses(args=[])

@Einengutenmorgen
Copy link

How do I continue from a specific checkpoint to fine-tune with different data?
I tried to use the same script as for training,

def create_and_prepare_model():
# Purpose: load Falcon-7B with 4-bit quantization, wrap it with a previously
# saved PEFT adapter, and return (model, peft_config, tokenizer).
# NOTE(review): the body's indentation was lost when this snippet was pasted;
# as written it is not valid Python.
# Resolves to torch.float16; used as the 4-bit compute dtype below.
compute_dtype = getattr(torch, "float16")

# 4-bit NF4 quantization with double quantization enabled.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=True,
)
tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-7b", trust_remote_code=True)
# Registers '[PAD]' and '[EOS]' as the pad/eos special tokens.
# NOTE(review): presumably these are new vocabulary entries for this tokenizer,
# which is why the embeddings are resized further down; confirm.
tokenizer.add_special_tokens({'pad_token': '[PAD]', 'eos_token': '[EOS]'})
model = AutoModelForCausalLM.from_pretrained( "tiiuae/falcon-7b", quantization_config=bnb_config, device_map="auto", trust_remote_code=True)


# Wrap the quantized base model with a saved adapter.
# NOTE(review): `<Path to adapter_config.json>` is a placeholder, not Python.
# Presumably the argument should be the adapter directory (the one containing
# adapter_config.json) rather than the file itself; confirm against the PEFT docs.
# NOTE(review): PeftModel is not among the imports shown at the top of the
# script; confirm it is imported from peft.
model = PeftModel.from_pretrained(model, <Path to adapter_config.json>)

# LoRA settings returned to the caller; only "query_key_value" is targeted.
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=[
        "query_key_value"
    ],
)
# Resize the token embeddings to len(tokenizer) after the special tokens were set.
model.resize_token_embeddings(len(tokenizer))
return model, peft_config, tokenizer

ERROR:
Traceback (most recent call last):
File "/opt/conda/lib/python3.10/site-packages/peft/utils/config.py", line 177, in _get_peft_type
config_file = hf_hub_download(
File "/opt/conda/lib/python3.10/site-packages/huggingface_hub/utils/_validators.py", line 110, in _inner_fn
validate_repo_id(arg_value)
File "/opt/conda/lib/python3.10/site-packages/huggingface_hub/utils/_validators.py", line 158, in validate_repo_id
raise HFValidationError(
huggingface_hub.utils._validators.HFValidationError: Repo id must be in the form 'repo_name' or 'namespace/repo_name': '/home/ec2-user/SageMaker/Models/Model_10_multinews/checkpoint-13000/'. Use repo_type argument if needed.

@lamwilton
Copy link

After upgrading all packages I am getting this error also

File "/root/.cache/huggingface/modules/transformers_modules/tiiuae/falcon-7b/378337427557d1df3e742264a2901a49f25d4eb1/modelling_RW.py", line 93, in forward
return (q * cos) + (rotate_half(q) * sin), (k * cos) + (rotate_half(k) * sin)
TypeError: unsupported operand type(s) for *: 'Tensor' and 'NoneType'

@xzrderek
Copy link

xzrderek commented Jul 20, 2023

My code runs but I am getting a spiky loss chart that looks like this:

image

Does anyone know why this is happening/what hyperparameters I should be tuning? I am running this command, mentioned by @pacman100:

python train.py \
--model_name tiiuae/falcon-40b \
--max_seq_len 2048 \
--bf16 \
--group_by_length \
--bnb_4bit_compute_dtype bfloat16 \
--max_steps 200

EDIT: seems like trl is causing this issue, forcing a pip install to trl==0.4.6 fixes it.

@mtisz
Copy link

mtisz commented Jul 25, 2023

Any of you running into issues with models generating too much text? Like, my models aren't stopping the generation, it doesn't know when to stop.. Does line 173 tokenizer.pad_token = tokenizer.eos_token actually help? Or do I need to append the eos_token at the end of each training line?

{"text": "You are an AI assistant that follows instruction extremely well. Help as much as you can.\nUSER: Read the following paragraph and extract the answer for the question: What is the last name of the person who composed seven operas in three years at Brunswick? Handel joined the Hamburg opera house when it was experiencing a period of considerable artistic success. This blossoming followed the arrival of Reinhard Keiser, who had become musical director at the G\u00e4nsemarkt in about 1697, and in 1703 succeeded Johann Kusser as the theatre's manager. Born in 1674, Keiser had studied under Johann Schelle and probably Johann Kuhnau at the Thomasschule zu Leipzig. In 1694 he was employed as a court composer at Brunswick, where in three years he composed seven operas, at least one of which (Mahumeth) was performed in Hamburg. According to Handel's biographer Donald Burrows, Keiser was a good judge of popular taste, with a flair for writing Italian-style arias. Between 1697 and 1703, prior to Handel's arrival, about a dozen more Keiser operas had been staged at the G\u00e4nsemarkt. Despite his on-stage successes, Keiser was an unreliable general manager, with expensive private tastes and little financial acumen, often at odds with his creditors.It is possible that Keiser, who had connections in the Halle area, had heard of Handel and was directly instrumental in securing the latter's post in the G\u00e4nsemarkt orchestra; certainly he was a considerable influence on the younger man in the three years that Handel spent in Hamburg. Another important G\u00e4nsemarkt colleague was the house composer and singer Johann Mattheson, who noted Handel's rapid progress in the orchestra from back-desk violinist to harpsichord soloist, a role in which, said Mattheson, \"he showed himself a man\u2014a thing which no one had before suspected, save I alone\". 
Mattheson was less complimentary on Handel's early efforts at composition: \"He composed very long, long arias, and really interminable cantatas\", before, it seems, \"the lofty schooling of opera ... trimmed him into other fashions\":\nASSISTANT: Keiser\n"}

Do I need to append this with </s> (for LLaMA)? Or do I need to append the line with "USER:" so the model adds that term at the end of each generated response, so I can strip the text before it?

@xzrderek
Copy link

does anyone know how can I load model from checkpoint to continue training? I've trained it with my dataset, now I want to pick it up from the checkpoint and continue the training but using the guanaco dataset on top. Is it possible?

@rogeriochaves did you ever figure out if this was possible?

@DanM3rcurius
Copy link

GM
I'm running into memory issues when training. Not sure if this is due to my graphics card not being powerful enough?
if someone could point a noob like me to what I'm doing wrong, that would be much appreciated.

OutOfMemoryError: CUDA out of memory. Tried to allocate 72.00 MiB. GPU 0 has a total capacty of 5.80 GiB of which 47.69 MiB is free. Including non-PyTorch memory, this process has 5.74 GiB memory in use. Of the allocated memory 5.55 GiB is allocated by PyTorch, and 54.36 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.129.03 Driver Version: 535.129.03 CUDA Version: 12.2 |
|-----------------------------------------+----------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+======================+======================|
| 0 NVIDIA GeForce RTX 3060 ... Off | 00000000:01:00.0 Off | N/A |
| N/A 41C P8 10W / 80W | 5890MiB / 6144MiB | 0% Default |
| | | N/A |
+-----------------------------------------+----------------------+----------------------+

+---------------------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=======================================================================================|
| 0 N/A N/A 1348 G /usr/lib/Xorg 4MiB |
| 0 N/A N/A 3446 C /home/mercurius/LLMs/llmenv/bin/python 5876MiB |
+---------------------------------------------------------------------------------------+

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment