QLORA Memory Experiments
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "d30779b0-0df2-445a-829d-fc3b243c462c",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/paperspace/workdir/git/bitsandbytes_fork/bitsandbytes/cuda_setup/main.py:107: UserWarning: \n",
"\n",
"================================================================================\n",
"WARNING: Manual override via BNB_CUDA_VERSION env variable detected!\n",
"BNB_CUDA_VERSION=XXX can be used to load a bitsandbytes version that is different from the PyTorch CUDA version.\n",
"If this was unintended set the BNB_CUDA_VERSION variable to an empty string: export BNB_CUDA_VERSION=\n",
"If you use the manual override make sure the right libcudart.so is in your LD_LIBRARY_PATH\n",
"For example by adding the following to your .bashrc: export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:<path_to_cuda_dir/lib64\n",
"Loading CUDA version: BNB_CUDA_VERSION=123\n",
"================================================================================\n",
"\n",
"\n",
" warn((f'\\n\\n{\"=\"*80}\\n'\n",
"/home/paperspace/miniconda3/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
}
],
"source": [
"import bitsandbytes as bnb\n",
"\n",
"import torch\n",
"import torch.nn as nn\n",
"import torch.nn.functional as F\n",
"from transformers import AutoModelForCausalLM\n",
"from transformers.utils.quantization_config import BitsAndBytesConfig\n",
"from transformers.pytorch_utils import Conv1D\n",
"\n",
"import transformers\n",
"from transformers import LlamaConfig, LlamaForCausalLM\n",
"from transformers.integrations.bitsandbytes import replace_with_bnb_linear\n",
"from transformers.models.llama.modeling_llama import LlamaDecoderLayer\n",
"\n",
"from peft.tuners.lora.config import LoraConfig\n",
"from peft.mapping import get_peft_model\n",
"from peft.utils.peft_types import TaskType\n",
"\n",
"import gc\n",
"import inspect\n",
"from accelerate.utils import set_seed"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "b8a001c3-4941-44dc-97b0-dd9f67c5148a",
"metadata": {},
"outputs": [],
"source": [
"transformers.logging.set_verbosity_warning()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "8bda461b-c894-4c8b-8d43-a3023a9570bb",
"metadata": {},
"outputs": [],
"source": [
"def malloc_in_gb():\n",
"    return torch.cuda.memory_allocated()/1e9"
]
},
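{
"cell_type": "markdown",
"id": "note-peak-memory",
"metadata": {},
"source": [
"`malloc_in_gb` reports the memory *currently* allocated by live tensors, so it misses any peak reached inside a forward pass. A minimal sketch of peak-tracking helpers (my addition, not used in the runs below), built on PyTorch's standard counters:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "code-peak-memory",
"metadata": {},
"outputs": [],
"source": [
"# Sketch: peak-memory helpers using torch.cuda's built-in statistics.\n",
"def peak_malloc_in_gb():\n",
"    # highest allocation observed since the last reset\n",
"    return torch.cuda.max_memory_allocated()/1e9\n",
"\n",
"def reset_peak_stats():\n",
"    torch.cuda.reset_peak_memory_stats()"
]
},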
{
"cell_type": "code",
"execution_count": 5,
"id": "18e63dde-9528-4315-88df-4c7bea0db6ac",
"metadata": {},
"outputs": [],
"source": [
"def get_model_size_config(model_size):\n",
"    if model_size == \"DEBUG\":\n",
"        model_size_config = dict(hidden_size=128,\n",
"                                 num_hidden_layers=2,\n",
"                                 num_attention_heads=2,\n",
"                                 num_key_value_heads=2,\n",
"                                 intermediate_size=256)\n",
"    elif model_size == \"60M\":\n",
"        model_size_config = dict(hidden_size=512,\n",
"                                 num_hidden_layers=4,\n",
"                                 num_attention_heads=4,\n",
"                                 num_key_value_heads=4,\n",
"                                 intermediate_size=1024)\n",
"    elif model_size == \"120M\":\n",
"        model_size_config = dict(hidden_size=768,\n",
"                                 num_hidden_layers=12,\n",
"                                 num_attention_heads=12,\n",
"                                 num_key_value_heads=12,\n",
"                                 intermediate_size=1536)\n",
"    elif model_size == \"290M\":\n",
"        model_size_config = dict(hidden_size=1024,\n",
"                                 num_hidden_layers=12,\n",
"                                 num_attention_heads=16,\n",
"                                 num_key_value_heads=16,\n",
"                                 intermediate_size=4096)\n",
"    elif model_size == \"1B\":\n",
"        model_size_config = dict(hidden_size=2048,\n",
"                                 num_hidden_layers=24,\n",
"                                 num_attention_heads=16,\n",
"                                 num_key_value_heads=16,\n",
"                                 intermediate_size=4096)\n",
"    elif model_size == \"7B\":\n",
"        # an empty override keeps the LlamaConfig defaults, which correspond to ~7B\n",
"        model_size_config = {}\n",
"    return model_size_config"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "8bae5ba6-f4cb-44a7-9191-89bab9e930f5",
"metadata": {},
"outputs": [],
"source": [
"def create_model(model_size=\"1B\"):\n",
"    model_size_config = get_model_size_config(model_size)\n",
"    # build a randomly initialized Llama model from a config (no weights are downloaded)\n",
"    config = LlamaConfig()\n",
"    config.update(model_size_config)\n",
"    model = LlamaForCausalLM(config)\n",
"    return model"
]
},
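{
"cell_type": "markdown",
"id": "note-param-count",
"metadata": {},
"source": [
"bf16 stores 2 bytes per parameter, so a model's CUDA footprint should be roughly `num_parameters * 2 / 1e9` GB. A quick cross-check (my addition) against the readings below:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "code-param-count",
"metadata": {},
"outputs": [],
"source": [
"# Sketch: estimate the bf16 footprint from the parameter count.\n",
"m = create_model(\"1B\")\n",
"n_params = sum(p.numel() for p in m.parameters())\n",
"print(f\"{n_params/1e9:.2f}B params -> ~{n_params*2/1e9:.2f} GB in bf16\")\n",
"m = None"
]
},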
{
"cell_type": "code",
"execution_count": 7,
"id": "a04c9743-43b7-451f-90d9-ff7a3201f4e3",
"metadata": {},
"outputs": [],
"source": [
"def free_memory():\n",
"    gc.collect()\n",
"    torch.cuda.empty_cache()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "0af5bfb5-5d40-4bcc-a3f5-4c5520f5398b",
"metadata": {},
"outputs": [],
"source": [
"set_seed(42)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "7a3990ca-6cd7-47de-813c-442802520487",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Memory allocated: 0.000 GB\n"
]
}
],
"source": [
"print(f\"Memory allocated: {malloc_in_gb():.3f} GB\")"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "8fcce7e1-f68e-497a-bb61-fcf7d0780717",
"metadata": {},
"outputs": [],
"source": [
"model = create_model(\"DEBUG\")"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "4995b7d8-9f80-4752-8224-7a74cfa22f76",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(torch.float32, device(type='cpu'))"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"vocab_size = model.model.embed_tokens.weight.size(0)\n",
"model.dtype, model.device"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "d65fc367-2907-422b-90a8-547f4f3c5bb7",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Memory allocated: 0.000 GB\n"
]
}
],
"source": [
"print(f\"Memory allocated: {malloc_in_gb():.3f} GB\")"
]
},
{
"cell_type": "code",
"execution_count": 74,
"id": "b90ed1a0-67f1-4c70-8b49-7b5ece6b96a2",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Memory allocated: 2.311 GB\n"
]
}
],
"source": [
"model.to(\"cuda\", torch.bfloat16);\n",
"print(f\"Memory allocated: {malloc_in_gb():.3f} GB\")"
]
},
{
"cell_type": "code",
"execution_count": 75,
"id": "96be6449-a666-4168-b3df-c9f48968973b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Memory allocated: 0.009 GB\n"
]
}
],
"source": [
"model = None\n",
"free_memory()\n",
"print(f\"Memory allocated: {malloc_in_gb():.3f} GB\")"
]
},
{
"cell_type": "code",
"execution_count": 76,
"id": "daffded3-7536-453c-86ac-877f677e955e",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Memory allocated: 2.311 GB\n"
]
}
],
"source": [
"model = create_model()\n",
"model.to(\"cuda\", torch.bfloat16);\n",
"print(f\"Memory allocated: {malloc_in_gb():.3f} GB\")"
]
},
{
"cell_type": "code",
"execution_count": 77,
"id": "72909c76-0a7d-406b-9c4f-e848e866b92a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Memory allocated: 0.009 GB\n"
]
}
],
"source": [
"model = None\n",
"free_memory()\n",
"print(f\"Memory allocated: {malloc_in_gb():.3f} GB\")"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "51c5f80a-5745-4cd3-8ee9-5df7f3572062",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Memory allocated: 0.018 GB\n"
]
}
],
"source": [
"# assume packed sequences where max_seqlen = sl\n",
"inputs = [torch.randint(0, vocab_size, (1, sl)) for sl in [512, 1024, 2048, 4096]]\n",
"print(f\"Memory allocated: {malloc_in_gb():.3f} GB\")"
]
},
{
"cell_type": "code",
"execution_count": 115,
"id": "5b8d61f6-4d49-4689-8e8e-b4d044d03720",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Memory allocated (model): 2.311 GB\n",
"Memory allocated torch.Size([1, 512]): 3.605 GB\n",
"Memory allocated (model): 2.311 GB\n",
"Memory allocated torch.Size([1, 1024]): 4.899 GB\n",
"Memory allocated (model): 2.311 GB\n",
"Memory allocated torch.Size([1, 2048]): 7.484 GB\n",
"Memory allocated (model): 2.311 GB\n",
"Memory allocated torch.Size([1, 4096]): 12.683 GB\n"
]
}
],
"source": [
"for x in inputs:\n",
"    model = create_model()\n",
"    model.to(\"cuda\", torch.bfloat16);\n",
"    print(f\"Memory allocated (model): {malloc_in_gb():.3f} GB\")\n",
"    output = model(x.to(\"cuda\"))\n",
"    print(f\"Memory allocated {x.size()}: {malloc_in_gb():.3f} GB\")\n",
"    output, model = None, None\n",
"    free_memory()"
]
},
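{
"cell_type": "markdown",
"id": "note-logits-size",
"metadata": {},
"source": [
"Note these readings are taken while `output` is still alive and the forward runs without `torch.no_grad()`, so they include activations retained for a potential backward pass. As a rough lower bound (my estimate): `LlamaForCausalLM` upcasts logits to float32, so the logits tensor alone is `sl * vocab_size * 4` bytes; with the default 32k vocab that is already ~0.5 GB at `sl=4096`:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "code-logits-size",
"metadata": {},
"outputs": [],
"source": [
"# Sketch: size of the float32 logits tensor alone, per sequence length.\n",
"for sl in [512, 1024, 2048, 4096]:\n",
"    print(f\"sl={sl}: logits ~{sl*32000*4/1e9:.2f} GB\")"
]
},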
{
"cell_type": "code",
"execution_count": 80,
"id": "cbc77ccd-05bf-417e-bab5-11e9370e2b7c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Memory allocated: 0.009 GB\n"
]
}
],
"source": [
"print(f\"Memory allocated: {malloc_in_gb():.3f} GB\")"
]
},
{
"cell_type": "markdown",
"id": "48fb26df-a649-43da-bd14-f095e9913ab4",
"metadata": {},
"source": [
"### LoRA"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "4d24d2f1-25bb-432c-be9f-401bd0ff561f",
"metadata": {},
"outputs": [],
"source": [
"def create_lora_model(model_size=\"1B\"):\n",
"    model_size_config = get_model_size_config(model_size)\n",
"    # build a randomly initialized Llama model from a config (no weights are downloaded)\n",
"    config = LlamaConfig()\n",
"    config.update(model_size_config)\n",
"    model = LlamaForCausalLM(config)\n",
"    peft_config = LoraConfig(\n",
"        task_type=TaskType.CAUSAL_LM, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1\n",
"    )\n",
"    model = get_peft_model(model, peft_config)\n",
"    return model"
]
},
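{
"cell_type": "markdown",
"id": "note-lora-trainable",
"metadata": {},
"source": [
"PEFT can summarize how small the trainable fraction is; a quick check (my addition) using the standard `print_trainable_parameters` helper:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "code-lora-trainable",
"metadata": {},
"outputs": [],
"source": [
"# Sketch: LoRA with r=8 trains only a tiny fraction of the 1B model.\n",
"m = create_lora_model(\"1B\")\n",
"m.print_trainable_parameters()\n",
"m = None"
]
},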
{
"cell_type": "code",
"execution_count": 82,
"id": "6b8fb52e-a2b2-4774-9ec4-28642d86dbd9",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Memory allocated: 0.009 GB\n"
]
}
],
"source": [
"print(f\"Memory allocated: {malloc_in_gb():.3f} GB\")"
]
},
{
"cell_type": "code",
"execution_count": 83,
"id": "45b7f215-31e6-4470-9e16-59ffdbc437f1",
"metadata": {},
"outputs": [],
"source": [
"lora_model = create_lora_model()"
]
},
{
"cell_type": "code",
"execution_count": 84,
"id": "c1eab94a-e0ef-4b90-81ce-75dfeadd289d",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"for n, p in lora_model.named_parameters():\n",
"    if p.device.type == \"meta\":\n",
"        print(n, p.requires_grad, p.device)"
]
},
{
"cell_type": "code",
"execution_count": 85,
"id": "38806e17-b7ec-4e29-9d26-a5901cac3d32",
"metadata": {},
"outputs": [],
"source": [
"trainable_params, untrainable_params = [], []\n",
"for n, p in lora_model.named_parameters():\n",
"    if p.requires_grad: trainable_params.append(n)\n",
"    if not p.requires_grad: untrainable_params.append(n)"
]
},
{
"cell_type": "code",
"execution_count": 86,
"id": "71c2bc73-69d9-4b30-a249-0f79a1781240",
"metadata": {},
"outputs": [],
"source": [
"# trainable_params"
]
},
{
"cell_type": "code",
"execution_count": 87,
"id": "3d793005-bdc6-4739-a423-247fcdfe3be8",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Memory allocated: 0.009 GB\n"
]
}
],
"source": [
"print(f\"Memory allocated: {malloc_in_gb():.3f} GB\")"
]
},
{
"cell_type": "code",
"execution_count": 88,
"id": "9347c1ce-8ec4-4adf-8cf7-ead79664c75d",
"metadata": {},
"outputs": [],
"source": [
"lora_model = None"
]
},
{
"cell_type": "code",
"execution_count": 114,
"id": "7dbcd854-77a7-4917-9de5-4c1c44797e2c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Memory allocated (model): 2.315 GB\n",
"Memory allocated torch.Size([1, 512]): 3.443 GB\n",
"Memory allocated (model): 2.315 GB\n",
"Memory allocated torch.Size([1, 1024]): 4.572 GB\n",
"Memory allocated (model): 2.315 GB\n",
"Memory allocated torch.Size([1, 2048]): 6.826 GB\n",
"Memory allocated (model): 2.315 GB\n",
"Memory allocated torch.Size([1, 4096]): 11.364 GB\n"
]
}
],
"source": [
"for x in inputs:\n",
"    lora_model = create_lora_model()\n",
"    lora_model.to(\"cuda\", torch.bfloat16);\n",
"    print(f\"Memory allocated (model): {malloc_in_gb():.3f} GB\")\n",
"    output = lora_model(x.to(\"cuda\"))\n",
"    print(f\"Memory allocated {x.size()}: {malloc_in_gb():.3f} GB\")\n",
"    output, lora_model = None, None\n",
"    free_memory()"
]
},
{
"cell_type": "code",
"execution_count": 90,
"id": "41e23d9b-288f-4974-9e7e-71d87e14995a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Memory allocated: 0.009 GB\n"
]
}
],
"source": [
"print(f\"Memory allocated: {malloc_in_gb():.3f} GB\")"
]
},
{
"cell_type": "markdown",
"id": "31e01883-2551-4784-93a3-e9ca651feb36",
"metadata": {},
"source": [
"### QLoRA"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "fe1306d6-051a-42cb-a313-21c4849d70aa",
"metadata": {},
"outputs": [],
"source": [
"def replace_with_bnb_4bit_linear(\n",
"    model,\n",
"    modules_to_not_convert=None,\n",
"    current_key_name=None,\n",
"    quantization_config=None,\n",
"    has_been_replaced=False,\n",
"    quant_storage=torch.uint8,\n",
"):\n",
"    \"\"\"\n",
"    Private method that wraps the recursion for module replacement.\n",
"\n",
"    Returns the converted model and a boolean that indicates if the conversion has been successful or not.\n",
"    \"\"\"\n",
"    if modules_to_not_convert is None:\n",
"        modules_to_not_convert = []\n",
"    for name, module in model.named_children():\n",
"        if current_key_name is None:\n",
"            current_key_name = []\n",
"        current_key_name.append(name)\n",
"\n",
"        if isinstance(module, (nn.Linear, Conv1D)) and name not in modules_to_not_convert:\n",
"            # Check that the full (dotted) key is not in `modules_to_not_convert` either\n",
"            if not any(key in \".\".join(current_key_name) for key in modules_to_not_convert):\n",
"                if isinstance(module, Conv1D):\n",
"                    in_features, out_features = module.weight.shape\n",
"                else:\n",
"                    in_features = module.in_features\n",
"                    out_features = module.out_features\n",
"\n",
"                model._modules[name] = bnb.nn.Linear4bit(\n",
"                    in_features,\n",
"                    out_features,\n",
"                    module.bias is not None,\n",
"                    quantization_config.bnb_4bit_compute_dtype,\n",
"                    compress_statistics=quantization_config.bnb_4bit_use_double_quant,\n",
"                    quant_type=quantization_config.bnb_4bit_quant_type,\n",
"                    quant_storage=quant_storage,\n",
"                )\n",
"                has_been_replaced = True\n",
"                # Store the module class in case we need to transpose the weight later\n",
"                model._modules[name].source_cls = type(module)\n",
"                # Force requires grad to False to avoid unexpected errors\n",
"                model._modules[name].requires_grad_(False)\n",
"        if len(list(module.children())) > 0:\n",
"            # propagate quant_storage so nested modules use the same packing dtype\n",
"            _, has_been_replaced = replace_with_bnb_4bit_linear(\n",
"                module,\n",
"                modules_to_not_convert,\n",
"                current_key_name,\n",
"                quantization_config,\n",
"                has_been_replaced=has_been_replaced,\n",
"                quant_storage=quant_storage,\n",
"            )\n",
"        # Remove the last key for recursion\n",
"        current_key_name.pop(-1)\n",
"    return model, has_been_replaced"
]
},
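{
"cell_type": "markdown",
"id": "note-replace-usage",
"metadata": {},
"source": [
"A minimal standalone usage sketch (my addition): quantize a toy module in place while leaving `lm_head` untouched, exactly as `create_qlora_model` does below."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "code-replace-usage",
"metadata": {},
"outputs": [],
"source": [
"# Sketch: replace nn.Linear layers with bnb.nn.Linear4bit, skipping lm_head.\n",
"toy = nn.Sequential()\n",
"toy.add_module(\"proj\", nn.Linear(64, 64, bias=False))\n",
"toy.add_module(\"lm_head\", nn.Linear(64, 64, bias=False))\n",
"qcfg = BitsAndBytesConfig(load_in_4bit=True,\n",
"                          bnb_4bit_quant_type=\"nf4\",\n",
"                          bnb_4bit_use_double_quant=False,\n",
"                          bnb_4bit_compute_dtype=torch.bfloat16)\n",
"toy, replaced = replace_with_bnb_4bit_linear(toy, modules_to_not_convert=[\"lm_head\"],\n",
"                                             quantization_config=qcfg)\n",
"# expect: True Linear4bit Linear\n",
"print(replaced, type(toy.proj).__name__, type(toy.lm_head).__name__)"
]
},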
{
"cell_type": "code",
"execution_count": 15,
"id": "b5f71873-662f-40e8-be83-cc2ef63cd561",
"metadata": {},
"outputs": [],
"source": [
"def create_qlora_model(model_size=\"1B\", with_lora=False):\n",
"    model_size_config = get_model_size_config(model_size)\n",
"\n",
"    # build a randomly initialized Llama model from a config (no weights are downloaded)\n",
"    config = LlamaConfig()\n",
"    config.update(model_size_config)\n",
"    model = LlamaForCausalLM(config)\n",
"    qconfig = BitsAndBytesConfig(load_in_4bit=True,\n",
"                                 bnb_4bit_quant_type=\"nf4\",\n",
"                                 bnb_4bit_use_double_quant=False,\n",
"                                 bnb_4bit_compute_dtype=torch.bfloat16)\n",
"    model, has_been_replaced = replace_with_bnb_4bit_linear(model, modules_to_not_convert=[\"lm_head\"], quantization_config=qconfig)\n",
"    assert has_been_replaced\n",
"    if with_lora:\n",
"        peft_config = LoraConfig(\n",
"            task_type=TaskType.CAUSAL_LM, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1\n",
"        )\n",
"        model = get_peft_model(model, peft_config)\n",
"    return model"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "fa4464d7-b904-49d8-bbb4-afa3e4e08d91",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Memory allocated: 0.000 GB\n"
]
}
],
"source": [
"print(f\"Memory allocated: {malloc_in_gb():.3f} GB\")"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "7751158a-b150-4b7e-859b-c2fe1f951450",
"metadata": {},
"outputs": [],
"source": [
"# qlora_model = create_qlora_model()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "062f1d49-97f8-4808-9b89-557fa0c31657",
"metadata": {},
"outputs": [],
"source": [
"# trainable_params, untrainable_params = [], []\n",
"# for n, p in qlora_model.named_parameters():\n",
"#     if p.requires_grad: trainable_params.append(n)\n",
"#     if not p.requires_grad: untrainable_params.append(n)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "0fa11aa3-6db3-4328-bfb6-ba88be877e16",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# for n, p in qlora_model.named_parameters():\n",
"#     print(n, p.device)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "0ded9a20-2774-48a6-82f2-8c139eca5aec",
"metadata": {},
"outputs": [],
"source": [
"# print(f\"Memory allocated: {malloc_in_gb():.3f} GB\")"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "ba3790ce-e7a5-4141-b01b-6b490d616dd8",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# for n, p in qlora_model.named_parameters():\n",
"#     if p.device.type == \"meta\":\n",
"#         print(n, p.requires_grad, p.device)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "5a761ec8-6010-4367-b2ce-fa02f41a43c4",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Memory allocated (model): 0.856 GB\n",
"> \u001b[0;32m/home/paperspace/workdir/git/bitsandbytes_fork/bitsandbytes/autograd/_functions.py\u001b[0m(568)\u001b[0;36mmatmul_4bit\u001b[0;34m()\u001b[0m\n",
"\u001b[0;32m 566 \u001b[0;31m \u001b[0;32massert\u001b[0m \u001b[0mquant_state\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0;32m 567 \u001b[0;31m \u001b[0;32mimport\u001b[0m \u001b[0mpdb\u001b[0m\u001b[0;34m;\u001b[0m \u001b[0mpdb\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mset_trace\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0;32m--> 568 \u001b[0;31m \u001b[0;32mif\u001b[0m \u001b[0mA\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnumel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0mA\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mA\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrequires_grad\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0;32m 569 \u001b[0;31m \u001b[0;32mif\u001b[0m \u001b[0mA\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m%\u001b[0m \u001b[0mquant_state\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mblocksize\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0;32m 570 \u001b[0;31m \u001b[0mwarn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mf'Some matrices hidden dimension is not a multiple of {quant_state.blocksize} and efficient inference kernels are not supported for these (slow). Matrix input size found: {A.shape}'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0m\n"
]
},
{
"name": "stdin",
"output_type": "stream",
"text": [
"ipdb> A.shape, A.dtype\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"(torch.Size([1, 512, 2048]), torch.bfloat16)\n"
]
},
{
"name": "stdin",
"output_type": "stream",
"text": [
"ipdb> B.shape, B.dtype\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"(torch.Size([1, 2097152]), torch.uint8)\n"
]
},
{
"name": "stdin",
"output_type": "stream",
"text": [
"ipdb> A.numel() == A.shape[-1] \n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"False\n"
]
},
{
"name": "stdin",
"output_type": "stream",
"text": [
"ipdb> n\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"> \u001b[0;32m/home/paperspace/workdir/git/bitsandbytes_fork/bitsandbytes/autograd/_functions.py\u001b[0m(578)\u001b[0;36mmatmul_4bit\u001b[0;34m()\u001b[0m\n",
"\u001b[0;32m 574 \u001b[0;31m \u001b[0;32mif\u001b[0m \u001b[0mbias\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0;32m 575 \u001b[0;31m \u001b[0mout\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0mbias\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0;32m 576 \u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0;32m 577 \u001b[0;31m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0;32m--> 578 \u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mMatMul4Bit\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mA\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mB\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbias\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mquant_state\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0m\n"
]
},
{
"name": "stdin",
"output_type": "stream",
"text": [
"ipdb> exit\n"
]
}
],
"source": [
"# NF4 quantized\n",
"for x in inputs:\n",
"    qlora_model = create_qlora_model(with_lora=False)\n",
"    qlora_model.to(\"cuda\", torch.bfloat16);\n",
"    print(f\"Memory allocated (model): {malloc_in_gb():.3f} GB\")\n",
"    output = qlora_model(x.to(\"cuda\"))\n",
"    print(f\"Memory allocated {x.size()}: {malloc_in_gb():.3f} GB\")\n",
"    output, qlora_model = None, None\n",
"    free_memory()"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "2eafab62-d3bd-4f01-a96e-fb0454010414",
"metadata": {},
"outputs": [],
"source": [
"qlora_model = create_qlora_model(\"DEBUG\", with_lora=False)\n",
"qlora_model.to(\"cuda\", torch.bfloat16);"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "eacb822f-b461-4cf0-8d79-79558dcf0ac3",
"metadata": {},
"outputs": [],
"source": [
"m = qlora_model.model.layers[0].self_attn.q_proj"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "6d75d2cf-24d0-49c4-8968-086bc03c4412",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"m.weight.bnb_quantized"
]
},
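{
"cell_type": "markdown",
"id": "note-quant-state",
"metadata": {},
"source": [
"After the move to CUDA the weight is a flat packed buffer, and the metadata needed to dequantize it lives in `quant_state`. A sketch (my addition; attribute names as in recent bitsandbytes releases, e.g. `blocksize` as seen in the debugger frame above):"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "code-quant-state",
"metadata": {},
"outputs": [],
"source": [
"# Sketch: inspect the packed 4-bit storage and its quantization metadata.\n",
"print(m.weight.shape, m.weight.dtype)          # packed uint8 buffer\n",
"qs = m.weight.quant_state\n",
"print(qs.shape, qs.blocksize, qs.quant_type)   # original shape + block info"
]
},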
{
"cell_type": "code",
"execution_count": 26,
"id": "f7f7c497-3a73-4603-b13e-edcd4dc62d67",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"2.0"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"2048*2048/2097152"
]
},
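{
"cell_type": "markdown",
"id": "note-packing-ratio",
"metadata": {},
"source": [
"This ratio is the 4-bit packing at work: the 1B config's `q_proj` holds `2048 * 2048 = 4,194,304` weights, and NF4 packs two 4-bit values into each `uint8` storage element, giving the `(1, 2097152)` buffer seen for `B` in the debugger above; hence `4,194,304 / 2,097,152 = 2.0`."
]
},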
{
"cell_type": "code",
"execution_count": 109,
"id": "d8c5b09f-6283-4680-8ce0-332dadcb93af",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Memory allocated: 0.009 GB\n"
]
}
],
"source": [
"print(f\"Memory allocated: {malloc_in_gb():.3f} GB\")"
]
},
{
"cell_type": "code",
"execution_count": 112,
"id": "eee59d02-d87b-48b9-af24-f9988e2fab54",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Memory allocated (model): 0.868 GB\n",
"Memory allocated torch.Size([1, 512]): 2.195 GB\n",
"Memory allocated (model): 0.868 GB\n",
"Memory allocated torch.Size([1, 1024]): 3.523 GB\n",
"Memory allocated (model): 0.868 GB\n",
"Memory allocated torch.Size([1, 2048]): 6.176 GB\n",
"Memory allocated (model): 0.868 GB\n",
"Memory allocated torch.Size([1, 4096]): 11.510 GB\n"
]
}
],
"source": [
"# NF4 quantized + LoRA\n",
"for x in inputs:\n",
"    qlora_model = create_qlora_model(with_lora=True)\n",
"    qlora_model.to(\"cuda\", torch.bfloat16);\n",
"    print(f\"Memory allocated (model): {malloc_in_gb():.3f} GB\")\n",
"    output = qlora_model(x.to(\"cuda\"))\n",
"    print(f\"Memory allocated {x.size()}: {malloc_in_gb():.3f} GB\")\n",
"    output, qlora_model = None, None\n",
"    free_memory()"
]
},
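{
"cell_type": "markdown",
"id": "note-summary-table",
"metadata": {},
"source": [
"Summary of the forward-pass measurements above (1B model, batch size 1):\n",
"\n",
"| seq len | bf16 (2.311 GB weights) | bf16 + LoRA (2.315 GB) | NF4 + LoRA (0.868 GB) |\n",
"|---|---|---|---|\n",
"| 512 | 3.605 GB | 3.443 GB | 2.195 GB |\n",
"| 1024 | 4.899 GB | 4.572 GB | 3.523 GB |\n",
"| 2048 | 7.484 GB | 6.826 GB | 6.176 GB |\n",
"| 4096 | 12.683 GB | 11.364 GB | 11.510 GB |\n",
"\n",
"NF4 cuts the weight footprint by ~1.4 GB, but activations dominate at longer sequence lengths, so the gap narrows; at 4096 the NF4 + LoRA forward even edges past the bf16 + LoRA one, plausibly due to dequantization buffers."
]
},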
{
"cell_type": "code",
"execution_count": 111,
"id": "28fc68af-b2e7-4c81-8481-b6dbc8797a78",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Memory allocated: 0.009 GB\n"
]
}
],
"source": [
"print(f\"Memory allocated: {malloc_in_gb():.3f} GB\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.5"
}
},
"nbformat": 4,
"nbformat_minor": 5
} |