QLORA Memory Experiments
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "d30779b0-0df2-445a-829d-fc3b243c462c",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/paperspace/workdir/git/bitsandbytes_fork/bitsandbytes/cuda_setup/main.py:107: UserWarning: \n",
"\n",
"================================================================================\n",
"WARNING: Manual override via BNB_CUDA_VERSION env variable detected!\n",
"BNB_CUDA_VERSION=XXX can be used to load a bitsandbytes version that is different from the PyTorch CUDA version.\n",
"If this was unintended set the BNB_CUDA_VERSION variable to an empty string: export BNB_CUDA_VERSION=\n",
"If you use the manual override make sure the right libcudart.so is in your LD_LIBRARY_PATH\n",
"For example by adding the following to your .bashrc: export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:<path_to_cuda_dir/lib64\n",
"Loading CUDA version: BNB_CUDA_VERSION=123\n",
"================================================================================\n",
"\n",
"\n",
" warn((f'\\n\\n{\"=\"*80}\\n'\n",
"/home/paperspace/miniconda3/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
}
],
"source": [
"import bitsandbytes as bnb\n",
"\n",
"import torch\n",
"import torch.nn as nn\n",
"import torch.nn.functional as F\n",
"from transformers import AutoModelForCausalLM\n",
"from transformers.utils.quantization_config import BitsAndBytesConfig\n",
"from transformers.pytorch_utils import Conv1D\n",
"\n",
"import transformers\n",
"from transformers import LlamaConfig, LlamaForCausalLM\n",
"from transformers.integrations.bitsandbytes import replace_with_bnb_linear\n",
"from transformers.utils.quantization_config import BitsAndBytesConfig\n",
"from transformers.models.llama.modeling_llama import LlamaDecoderLayer\n",
"\n",
"from peft.tuners.lora.config import LoraConfig\n",
"from peft.mapping import get_peft_model\n",
"from peft.utils.peft_types import *\n",
"\n",
"import gc\n",
"import inspect\n",
"from accelerate.utils import set_seed"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "b8a001c3-4941-44dc-97b0-dd9f67c5148a",
"metadata": {},
"outputs": [],
"source": [
"transformers.logging.set_verbosity_warning()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "8bda461b-c894-4c8b-8d43-a3023a9570bb",
"metadata": {},
"outputs": [],
"source": [
"def malloc_in_gb():\n",
" return torch.cuda.memory_allocated()/1e9"
]
},
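{
"cell_type": "markdown",
"id": "0a1b2c3d-note-memory-stats",
"metadata": {},
"source": [
"`malloc_in_gb` reports only the bytes held by live tensors. A minimal sketch (an addition, not part of the original experiments) that also reports the peak and the caching allocator's reserved pool, which can differ substantially:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1b2c3d4e-code-memory-stats",
"metadata": {},
"outputs": [],
"source": [
"# Sketch: complements malloc_in_gb() with peak and reserved figures.\n",
"def memory_stats_in_gb():\n",
"    return {\n",
"        \"allocated\": torch.cuda.memory_allocated() / 1e9,           # live tensors\n",
"        \"peak_allocated\": torch.cuda.max_memory_allocated() / 1e9,  # high-water mark\n",
"        \"reserved\": torch.cuda.memory_reserved() / 1e9,             # allocator pool\n",
"    }"
]
},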
{
"cell_type": "code",
"execution_count": 5,
"id": "18e63dde-9528-4315-88df-4c7bea0db6ac",
"metadata": {},
"outputs": [],
"source": [
"def get_model_size_config(model_size):\n",
" if model_size == \"DEBUG\":\n",
" model_size_config = dict(hidden_size=128,\n",
" num_hidden_layers=2,\n",
" num_attention_heads=2,\n",
" num_key_value_heads=2,\n",
" intermediate_size=256)\n",
" elif model_size == \"60M\":\n",
" model_size_config = dict(hidden_size=512,\n",
" num_hidden_layers=4,\n",
" num_attention_heads=4,\n",
" num_key_value_heads=4,\n",
" intermediate_size=1024)\n",
" elif model_size == \"120M\":\n",
" model_size_config = dict(hidden_size=768,\n",
" num_hidden_layers=12,\n",
" num_attention_heads=12,\n",
" num_key_value_heads=12,\n",
" intermediate_size=1536)\n",
" elif model_size == \"290M\":\n",
" model_size_config = dict(hidden_size=1024,\n",
" num_hidden_layers=12,\n",
" num_attention_heads=16,\n",
" num_key_value_heads=16,\n",
" intermediate_size=4096)\n",
" elif model_size == \"1B\":\n",
" model_size_config = dict(hidden_size=2048,\n",
" num_hidden_layers=24,\n",
" num_attention_heads=16,\n",
" num_key_value_heads=16,\n",
" intermediate_size=4096)\n",
" elif model_size == \"7B\":\n",
" model_size_config = {}\n",
" return model_size_config"
]
},
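{
"cell_type": "markdown",
"id": "8c9d0e1f-note-param-estimate",
"metadata": {},
"source": [
"A rough parameter-count check for the presets above (an illustrative sketch, not from the original; exact totals also depend on `LlamaConfig` defaults such as `vocab_size=32000`):"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9d0e1f2a-code-param-estimate",
"metadata": {},
"outputs": [],
"source": [
"# Approximate Llama parameter count from the size presets.\n",
"def approx_params(hidden_size, num_hidden_layers, intermediate_size, vocab_size=32000, **_):\n",
"    attn = 4 * hidden_size * hidden_size       # q/k/v/o projections\n",
"    mlp = 3 * hidden_size * intermediate_size  # gate/up/down projections\n",
"    embed = 2 * vocab_size * hidden_size       # embed_tokens + lm_head\n",
"    return num_hidden_layers * (attn + mlp) + embed\n",
"\n",
"for size in [\"60M\", \"120M\", \"290M\", \"1B\"]:\n",
"    print(size, f\"~{approx_params(**get_model_size_config(size))/1e6:.0f}M params\")"
]
},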
{
"cell_type": "code",
"execution_count": 6,
"id": "8bae5ba6-f4cb-44a7-9191-89bab9e930f5",
"metadata": {},
"outputs": [],
"source": [
"def create_model(model_size=\"1B\"):\n",
" model_size_config = get_model_size_config(model_size)\n",
" # download model weights and config files.\n",
" config = LlamaConfig()\n",
" config.update(model_size_config)\n",
" model = LlamaForCausalLM(config)\n",
" return model"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "a04c9743-43b7-451f-90d9-ff7a3201f4e3",
"metadata": {},
"outputs": [],
"source": [
"def free_memory():\n",
" gc.collect()\n",
" torch.cuda.empty_cache()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "0af5bfb5-5d40-4bcc-a3f5-4c5520f5398b",
"metadata": {},
"outputs": [],
"source": [
"set_seed(42)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "7a3990ca-6cd7-47de-813c-442802520487",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Memory allocated: 0.000 GB\n"
]
}
],
"source": [
"print(f\"Memory allocated: {malloc_in_gb():.3f} GB\")"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "8fcce7e1-f68e-497a-bb61-fcf7d0780717",
"metadata": {},
"outputs": [],
"source": [
"model = create_model(\"DEBUG\")"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "4995b7d8-9f80-4752-8224-7a74cfa22f76",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(torch.float32, device(type='cpu'))"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"vocab_size = model.model.embed_tokens.weight.size(0)\n",
"model.dtype, model.device"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "d65fc367-2907-422b-90a8-547f4f3c5bb7",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Memory allocated: 0.000 GB\n"
]
}
],
"source": [
"print(f\"Memory allocated: {malloc_in_gb():.3f} GB\")"
]
},
{
"cell_type": "code",
"execution_count": 74,
"id": "b90ed1a0-67f1-4c70-8b49-7b5ece6b96a2",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Memory allocated: 2.311 GB\n"
]
}
],
"source": [
"model.to(\"cuda\", torch.bfloat16);\n",
"print(f\"Memory allocated: {malloc_in_gb():.3f} GB\")"
]
},
{
"cell_type": "code",
"execution_count": 75,
"id": "96be6449-a666-4168-b3df-c9f48968973b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Memory allocated: 0.009 GB\n"
]
}
],
"source": [
"model = None\n",
"free_memory()\n",
"print(f\"Memory allocated: {malloc_in_gb():.3f} GB\")"
]
},
{
"cell_type": "code",
"execution_count": 76,
"id": "daffded3-7536-453c-86ac-877f677e955e",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Memory allocated: 2.311 GB\n"
]
}
],
"source": [
"model = create_model()\n",
"model.to(\"cuda\", torch.bfloat16);\n",
"print(f\"Memory allocated: {malloc_in_gb():.3f} GB\")"
]
},
{
"cell_type": "code",
"execution_count": 77,
"id": "72909c76-0a7d-406b-9c4f-e848e866b92a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Memory allocated: 0.009 GB\n"
]
}
],
"source": [
"model = None\n",
"free_memory()\n",
"print(f\"Memory allocated: {malloc_in_gb():.3f} GB\")"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "51c5f80a-5745-4cd3-8ee9-5df7f3572062",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Memory allocated: 0.018 GB\n"
]
}
],
"source": [
"# assume packed sequences where max_seqlen = sl\n",
"inputs = [torch.randint(0, vocab_size, (1, sl)) for sl in [512,1024,2048,4096]]\n",
"print(f\"Memory allocated: {malloc_in_gb():.3f} GB\")"
]
},
{
"cell_type": "code",
"execution_count": 115,
"id": "5b8d61f6-4d49-4689-8e8e-b4d044d03720",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Memory allocated (model): 2.311 GB\n",
"Memory allocated torch.Size([1, 512]): 3.605 GB\n",
"Memory allocated (model): 2.311 GB\n",
"Memory allocated torch.Size([1, 1024]): 4.899 GB\n",
"Memory allocated (model): 2.311 GB\n",
"Memory allocated torch.Size([1, 2048]): 7.484 GB\n",
"Memory allocated (model): 2.311 GB\n",
"Memory allocated torch.Size([1, 4096]): 12.683 GB\n"
]
}
],
"source": [
"for x in inputs:\n",
" model = create_model()\n",
" model.to(\"cuda\", torch.bfloat16);\n",
" print(f\"Memory allocated (model): {malloc_in_gb():.3f} GB\")\n",
" output = model(x.to(\"cuda\"))\n",
" print(f\"Memory allocated {x.size()}: {malloc_in_gb():.3f} GB\")\n",
" output, model = None, None\n",
" free_memory()"
]
},
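{
"cell_type": "markdown",
"id": "4e5f6a7b-note-activation-scaling",
"metadata": {},
"source": [
"The bf16 model itself is a constant 2.311 GB, so the remainder is activations: roughly 1.29 GB at 512 tokens, 2.59 GB at 1024, 5.17 GB at 2048 and 10.37 GB at 4096. Activation memory therefore grows close to linearly with sequence length in this run, and dominates the model weights well before 4096 tokens."
]
},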
{
"cell_type": "code",
"execution_count": 80,
"id": "cbc77ccd-05bf-417e-bab5-11e9370e2b7c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Memory allocated: 0.009 GB\n"
]
}
],
"source": [
"print(f\"Memory allocated: {malloc_in_gb():.3f} GB\")"
]
},
{
"cell_type": "markdown",
"id": "48fb26df-a649-43da-bd14-f095e9913ab4",
"metadata": {},
"source": [
"### LoRA"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "4d24d2f1-25bb-432c-be9f-401bd0ff561f",
"metadata": {},
"outputs": [],
"source": [
"def create_lora_model(model_size=\"1B\"):\n",
" model_size_config = get_model_size_config(model_size)\n",
" # download model weights and config files.\n",
" config = LlamaConfig()\n",
" config.update(model_size_config)\n",
" model = LlamaForCausalLM(config)\n",
" peft_config = LoraConfig(\n",
" task_type=TaskType.CAUSAL_LM, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1\n",
" )\n",
" model = get_peft_model(model, peft_config)\n",
" return model"
]
},
{
"cell_type": "code",
"execution_count": 82,
"id": "6b8fb52e-a2b2-4774-9ec4-28642d86dbd9",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Memory allocated: 0.009 GB\n"
]
}
],
"source": [
"print(f\"Memory allocated: {malloc_in_gb():.3f} GB\")"
]
},
{
"cell_type": "code",
"execution_count": 83,
"id": "45b7f215-31e6-4470-9e16-59ffdbc437f1",
"metadata": {},
"outputs": [],
"source": [
"lora_model = create_lora_model()"
]
},
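{
"cell_type": "markdown",
"id": "2c3d4e5f-note-lora-fraction",
"metadata": {},
"source": [
"PEFT can report the trainable fraction directly; with `r=8` adapters it should be well under 1% of all parameters (a quick check added here, not in the original run):"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3d4e5f6a-code-lora-fraction",
"metadata": {},
"outputs": [],
"source": [
"# Only the rank-8 LoRA adapter matrices should be trainable.\n",
"lora_model.print_trainable_parameters()"
]
},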
{
"cell_type": "code",
"execution_count": 84,
"id": "c1eab94a-e0ef-4b90-81ce-75dfeadd289d",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"for n, p in lora_model.named_parameters():\n",
" if p.device.type == \"meta\":\n",
" print(n, p.requires_grad, p.device)"
]
},
{
"cell_type": "code",
"execution_count": 85,
"id": "38806e17-b7ec-4e29-9d26-a5901cac3d32",
"metadata": {},
"outputs": [],
"source": [
"trainable_params, untrainable_params = [],[] \n",
"for n, p in lora_model.named_parameters():\n",
" if p.requires_grad: trainable_params.append(n)\n",
" if not p.requires_grad: untrainable_params.append(n)"
]
},
{
"cell_type": "code",
"execution_count": 86,
"id": "71c2bc73-69d9-4b30-a249-0f79a1781240",
"metadata": {},
"outputs": [],
"source": [
"# trainable_params"
]
},
{
"cell_type": "code",
"execution_count": 87,
"id": "3d793005-bdc6-4739-a423-247fcdfe3be8",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Memory allocated: 0.009 GB\n"
]
}
],
"source": [
"print(f\"Memory allocated: {malloc_in_gb():.3f} GB\")"
]
},
{
"cell_type": "code",
"execution_count": 88,
"id": "9347c1ce-8ec4-4adf-8cf7-ead79664c75d",
"metadata": {},
"outputs": [],
"source": [
"lora_model = None"
]
},
{
"cell_type": "code",
"execution_count": 114,
"id": "7dbcd854-77a7-4917-9de5-4c1c44797e2c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Memory allocated (model): 2.315 GB\n",
"Memory allocated torch.Size([1, 512]): 3.443 GB\n",
"Memory allocated (model): 2.315 GB\n",
"Memory allocated torch.Size([1, 1024]): 4.572 GB\n",
"Memory allocated (model): 2.315 GB\n",
"Memory allocated torch.Size([1, 2048]): 6.826 GB\n",
"Memory allocated (model): 2.315 GB\n",
"Memory allocated torch.Size([1, 4096]): 11.364 GB\n"
]
}
],
"source": [
"for x in inputs:\n",
" lora_model = create_lora_model()\n",
" lora_model.to(\"cuda\", torch.bfloat16);\n",
" print(f\"Memory allocated (model): {malloc_in_gb():.3f} GB\")\n",
" output = lora_model(x.to(\"cuda\"))\n",
" print(f\"Memory allocated {x.size()}: {malloc_in_gb():.3f} GB\")\n",
" output, lora_model = None, None\n",
" free_memory()"
]
},
{
"cell_type": "code",
"execution_count": 90,
"id": "41e23d9b-288f-4974-9e7e-71d87e14995a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Memory allocated: 0.009 GB\n"
]
}
],
"source": [
"print(f\"Memory allocated: {malloc_in_gb():.3f} GB\")"
]
},
{
"cell_type": "markdown",
"id": "31e01883-2551-4784-93a3-e9ca651feb36",
"metadata": {},
"source": [
"### QLoRA"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "fe1306d6-051a-42cb-a313-21c4849d70aa",
"metadata": {},
"outputs": [],
"source": [
"def replace_with_bnb_4bit_linear(\n",
" model,\n",
" modules_to_not_convert=None,\n",
" current_key_name=None,\n",
" quantization_config=None,\n",
" has_been_replaced=False,\n",
" quant_storage=torch.uint8\n",
"):\n",
" \"\"\"\n",
" Private method that wraps the recursion for module replacement.\n",
"\n",
" Returns the converted model and a boolean that indicates if the conversion has been successfull or not.\n",
" \"\"\"\n",
" for name, module in model.named_children():\n",
" if current_key_name is None:\n",
" current_key_name = []\n",
" current_key_name.append(name)\n",
"\n",
" if (isinstance(module, nn.Linear) or isinstance(module, Conv1D)) and name not in modules_to_not_convert:\n",
" # Check if the current key is not in the `modules_to_not_convert`\n",
" if not any(key in \".\".join(current_key_name) for key in modules_to_not_convert):\n",
" # with init_empty_weights():\n",
" if isinstance(module, Conv1D):\n",
" in_features, out_features = module.weight.shape\n",
" else:\n",
" in_features = module.in_features\n",
" out_features = module.out_features\n",
"\n",
" model._modules[name] = bnb.nn.Linear4bit(\n",
" in_features,\n",
" out_features,\n",
" module.bias is not None,\n",
" quantization_config.bnb_4bit_compute_dtype,\n",
" compress_statistics=quantization_config.bnb_4bit_use_double_quant,\n",
" quant_type=quantization_config.bnb_4bit_quant_type,\n",
" quant_storage=quant_storage\n",
" )\n",
" has_been_replaced = True\n",
" # Store the module class in case we need to transpose the weight later\n",
" model._modules[name].source_cls = type(module)\n",
" # Force requires grad to False to avoid unexpected errors\n",
" model._modules[name].requires_grad_(False)\n",
" if len(list(module.children())) > 0:\n",
" _, has_been_replaced = replace_with_bnb_4bit_linear(\n",
" module,\n",
" modules_to_not_convert,\n",
" current_key_name,\n",
" quantization_config,\n",
" has_been_replaced=has_been_replaced,\n",
" )\n",
" # Remove the last key for recursion\n",
" current_key_name.pop(-1)\n",
" return model, has_been_replaced"
]
},
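{
"cell_type": "markdown",
"id": "5f6a7b8c-note-quant-storage",
"metadata": {},
"source": [
"`bnb.nn.Linear4bit` packs two 4-bit weights into each element of the `quant_storage` dtype (`torch.uint8` here), so a 2048×2048 projection collapses to 2048*2048/2 = 2,097,152 bytes plus per-block quantization constants. This is the `(1, 2097152)` uint8 tensor that appears as `B` in the pdb session further below."
]
},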
{
"cell_type": "code",
"execution_count": 15,
"id": "b5f71873-662f-40e8-be83-cc2ef63cd561",
"metadata": {},
"outputs": [],
"source": [
"def create_qlora_model(model_size=\"1B\", with_lora=False):\n",
" \n",
" model_size_config = get_model_size_config(model_size)\n",
" \n",
" # download model weights and config files.\n",
" config = LlamaConfig()\n",
" config.update(model_size_config)\n",
" model = LlamaForCausalLM(config)\n",
" qconfig = BitsAndBytesConfig(load_in_4bit=True, \n",
" bnb_4bit_quant_type=\"nf4\",\n",
" bnb_4bit_use_double_quant=False,\n",
" bnb_4bit_compute_dtype=torch.bfloat16)\n",
" model, has_been_replaced = replace_with_bnb_4bit_linear(model, modules_to_not_convert=[\"lm_head\"], quantization_config=qconfig)\n",
" assert has_been_replaced\n",
" if with_lora:\n",
" peft_config = LoraConfig(\n",
" task_type=TaskType.CAUSAL_LM, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1\n",
" )\n",
" model = get_peft_model(model, peft_config)\n",
" return model"
]
},
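{
"cell_type": "markdown",
"id": "6a7b8c9d-note-nf4-footprint",
"metadata": {},
"source": [
"Rough expectation for the 1B preset: quantized linear layers store ~0.5 byte per weight, while `lm_head` (excluded via `modules_to_not_convert`) and the embeddings stay in bf16. The NF4 model should therefore land well under half of the 2.311 GB bf16 footprint; the runs below measure ~0.86 GB."
]
},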
{
"cell_type": "code",
"execution_count": 16,
"id": "fa4464d7-b904-49d8-bbb4-afa3e4e08d91",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Memory allocated: 0.000 GB\n"
]
}
],
"source": [
"print(f\"Memory allocated: {malloc_in_gb():.3f} GB\")"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "7751158a-b150-4b7e-859b-c2fe1f951450",
"metadata": {},
"outputs": [],
"source": [
"# qlora_model = create_qlora_model()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "062f1d49-97f8-4808-9b89-557fa0c31657",
"metadata": {},
"outputs": [],
"source": [
"# trainable_params, untrainable_params = [],[] \n",
"# for n, p in qlora_model.named_parameters():\n",
"# if p.requires_grad: trainable_params.append(n)\n",
"# if not p.requires_grad: untrainable_params.append(n)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "0fa11aa3-6db3-4328-bfb6-ba88be877e16",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# for n, p in qlora_model.named_parameters():\n",
"# print(n, p.device)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "0ded9a20-2774-48a6-82f2-8c139eca5aec",
"metadata": {},
"outputs": [],
"source": [
"# print(f\"Memory allocated: {malloc_in_gb():.3f} GB\")"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "ba3790ce-e7a5-4141-b01b-6b490d616dd8",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# for n, p in qlora_model.named_parameters():\n",
"# if p.device.type == \"meta\":\n",
"# print(n, p.requires_grad, p.device)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "5a761ec8-6010-4367-b2ce-fa02f41a43c4",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Memory allocated (model): 0.856 GB\n",
"> \u001b[0;32m/home/paperspace/workdir/git/bitsandbytes_fork/bitsandbytes/autograd/_functions.py\u001b[0m(568)\u001b[0;36mmatmul_4bit\u001b[0;34m()\u001b[0m\n",
"\u001b[0;32m 566 \u001b[0;31m \u001b[0;32massert\u001b[0m \u001b[0mquant_state\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0;32m 567 \u001b[0;31m \u001b[0;32mimport\u001b[0m \u001b[0mpdb\u001b[0m\u001b[0;34m;\u001b[0m \u001b[0mpdb\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mset_trace\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0;32m--> 568 \u001b[0;31m \u001b[0;32mif\u001b[0m \u001b[0mA\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnumel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0mA\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mA\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrequires_grad\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0;32m 569 \u001b[0;31m \u001b[0;32mif\u001b[0m \u001b[0mA\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m%\u001b[0m \u001b[0mquant_state\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mblocksize\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0;32m 570 \u001b[0;31m \u001b[0mwarn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mf'Some matrices hidden dimension is not a multiple of {quant_state.blocksize} and efficient inference kernels are not supported for these (slow). Matrix input size found: {A.shape}'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0m\n"
]
},
{
"name": "stdin",
"output_type": "stream",
"text": [
"ipdb> A.shape, A.dtype\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"(torch.Size([1, 512, 2048]), torch.bfloat16)\n"
]
},
{
"name": "stdin",
"output_type": "stream",
"text": [
"ipdb> B.shape, B.dtype\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"(torch.Size([1, 2097152]), torch.uint8)\n"
]
},
{
"name": "stdin",
"output_type": "stream",
"text": [
"ipdb> A.numel() == A.shape[-1] \n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"False\n"
]
},
{
"name": "stdin",
"output_type": "stream",
"text": [
"ipdb> n\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"> \u001b[0;32m/home/paperspace/workdir/git/bitsandbytes_fork/bitsandbytes/autograd/_functions.py\u001b[0m(578)\u001b[0;36mmatmul_4bit\u001b[0;34m()\u001b[0m\n",
"\u001b[0;32m 574 \u001b[0;31m \u001b[0;32mif\u001b[0m \u001b[0mbias\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0;32m 575 \u001b[0;31m \u001b[0mout\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0mbias\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0;32m 576 \u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0;32m 577 \u001b[0;31m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0;32m--> 578 \u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mMatMul4Bit\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mA\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mB\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbias\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mquant_state\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0m\n"
]
},
{
"name": "stdin",
"output_type": "stream",
"text": [
"ipdb> exit\n"
]
}
],
"source": [
"# NF4 quantized\n",
"for x in inputs:\n",
" qlora_model = create_qlora_model(with_lora=False)\n",
" qlora_model.to(\"cuda\", torch.bfloat16);\n",
" print(f\"Memory allocated (model): {malloc_in_gb():.3f} GB\")\n",
" output = qlora_model(x.to(\"cuda\"))\n",
" print(f\"Memory allocated {x.size()}: {malloc_in_gb():.3f} GB\")\n",
" output, qlora_model = None, None\n",
" free_memory()"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "2eafab62-d3bd-4f01-a96e-fb0454010414",
"metadata": {},
"outputs": [],
"source": [
"qlora_model = create_qlora_model(\"DEBUG\", with_lora=False)\n",
"qlora_model.to(\"cuda\", torch.bfloat16);"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "eacb822f-b461-4cf0-8d79-79558dcf0ac3",
"metadata": {},
"outputs": [],
"source": [
"m = qlora_model.model.layers[0].self_attn.q_proj"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "6d75d2cf-24d0-49c4-8968-086bc03c4412",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"m.weight.bnb_quantized"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "f7f7c497-3a73-4603-b13e-edcd4dc62d67",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"2.0"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"2048*2048/2097152"
]
},
{
"cell_type": "code",
"execution_count": 109,
"id": "d8c5b09f-6283-4680-8ce0-332dadcb93af",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Memory allocated: 0.009 GB\n"
]
}
],
"source": [
"print(f\"Memory allocated: {malloc_in_gb():.3f} GB\")"
]
},
{
"cell_type": "code",
"execution_count": 112,
"id": "eee59d02-d87b-48b9-af24-f9988e2fab54",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Memory allocated (model): 0.868 GB\n",
"Memory allocated torch.Size([1, 512]): 2.195 GB\n",
"Memory allocated (model): 0.868 GB\n",
"Memory allocated torch.Size([1, 1024]): 3.523 GB\n",
"Memory allocated (model): 0.868 GB\n",
"Memory allocated torch.Size([1, 2048]): 6.176 GB\n",
"Memory allocated (model): 0.868 GB\n",
"Memory allocated torch.Size([1, 4096]): 11.510 GB\n"
]
}
],
"source": [
"# NF4 quantized + LORA\n",
"for x in inputs:\n",
" qlora_model = create_qlora_model(with_lora=True)\n",
" qlora_model.to(\"cuda\", torch.bfloat16);\n",
" print(f\"Memory allocated (model): {malloc_in_gb():.3f} GB\")\n",
" output = qlora_model(x.to(\"cuda\"))\n",
" print(f\"Memory allocated {x.size()}: {malloc_in_gb():.3f} GB\")\n",
" output, qlora_model = None, None\n",
" free_memory()"
]
},
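{
"cell_type": "markdown",
"id": "7b8c9d0e-note-summary",
"metadata": {},
"source": [
"Summary for the 1B preset: model memory is 2.311 GB (bf16), 2.315 GB (bf16 + LoRA) and 0.868 GB (NF4 + LoRA), so NF4 cuts weight memory by roughly 2.7×. Peak totals at 4096 tokens are 12.683 GB, 11.364 GB and 11.510 GB respectively: activations dominate at long sequence lengths, so quantization alone shrinks the end-to-end footprint only modestly."
]
},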
{
"cell_type": "code",
"execution_count": 111,
"id": "28fc68af-b2e7-4c81-8481-b6dbc8797a78",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Memory allocated: 0.009 GB\n"
]
}
],
"source": [
"print(f\"Memory allocated: {malloc_in_gb():.3f} GB\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.5"
}
},
"nbformat": 4,
"nbformat_minor": 5
}