QLORA Memory Experiments
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "d30779b0-0df2-445a-829d-fc3b243c462c",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/paperspace/workdir/git/bitsandbytes_fork/bitsandbytes/cuda_setup/main.py:107: UserWarning: \n",
"\n",
"================================================================================\n",
"WARNING: Manual override via BNB_CUDA_VERSION env variable detected!\n",
"BNB_CUDA_VERSION=XXX can be used to load a bitsandbytes version that is different from the PyTorch CUDA version.\n",
"If this was unintended set the BNB_CUDA_VERSION variable to an empty string: export BNB_CUDA_VERSION=\n",
"If you use the manual override make sure the right libcudart.so is in your LD_LIBRARY_PATH\n",
"For example by adding the following to your .bashrc: export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:<path_to_cuda_dir/lib64\n",
"Loading CUDA version: BNB_CUDA_VERSION=123\n",
"================================================================================\n",
"\n",
"\n",
" warn((f'\\n\\n{\"=\"*80}\\n'\n",
"/home/paperspace/miniconda3/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
}
],
"source": [
"import bitsandbytes as bnb\n",
"\n",
"import torch\n",
"import torch.nn as nn\n",
"import torch.nn.functional as F\n",
"from transformers import AutoModelForCausalLM\n",
"from transformers.utils.quantization_config import BitsAndBytesConfig\n",
"from transformers.pytorch_utils import Conv1D\n",
"\n",
"import transformers\n",
"from transformers import LlamaConfig, LlamaForCausalLM\n",
"from transformers.integrations.bitsandbytes import replace_with_bnb_linear\n",
"from transformers.utils.quantization_config import BitsAndBytesConfig\n",
"from transformers.models.llama.modeling_llama import LlamaDecoderLayer\n",
"\n",
"from peft.tuners.lora.config import LoraConfig\n",
"from peft.mapping import get_peft_model\n",
"from peft.utils.peft_types import *\n",
"\n",
"import gc\n",
"import inspect\n",
"from accelerate.utils import set_seed"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "b8a001c3-4941-44dc-97b0-dd9f67c5148a",
"metadata": {},
"outputs": [],
"source": [
"transformers.logging.set_verbosity_warning()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "8bda461b-c894-4c8b-8d43-a3023a9570bb",
"metadata": {},
"outputs": [],
"source": [
"def malloc_in_gb():\n",
" return torch.cuda.memory_allocated()/1e9"
]
},
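{
"cell_type": "markdown",
"id": "0a1b2c3d-note-memory-stats",
"metadata": {},
"source": [
"`malloc_in_gb` reports only the bytes held by live tensors. A minimal sketch (an addition, not part of the original experiments) that also reports the peak and the caching allocator's reserved pool, which can differ substantially:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1b2c3d4e-code-memory-stats",
"metadata": {},
"outputs": [],
"source": [
"# Sketch: complements malloc_in_gb() with peak and reserved figures.\n",
"def memory_stats_in_gb():\n",
"    return {\n",
"        \"allocated\": torch.cuda.memory_allocated() / 1e9,           # live tensors\n",
"        \"peak_allocated\": torch.cuda.max_memory_allocated() / 1e9,  # high-water mark\n",
"        \"reserved\": torch.cuda.memory_reserved() / 1e9,             # allocator pool\n",
"    }"
]
},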
{
"cell_type": "code",
"execution_count": 5,
"id": "18e63dde-9528-4315-88df-4c7bea0db6ac",
"metadata": {},
"outputs": [],
"source": [
"def get_model_size_config(model_size):\n",
" if model_size == \"DEBUG\":\n",
" model_size_config = dict(hidden_size=128,\n",
" num_hidden_layers=2,\n",
" num_attention_heads=2,\n",
" num_key_value_heads=2,\n",
" intermediate_size=256)\n",
" elif model_size == \"60M\":\n",
" model_size_config = dict(hidden_size=512,\n",
" num_hidden_layers=4,\n",
" num_attention_heads=4,\n",
" num_key_value_heads=4,\n",
" intermediate_size=1024)\n",
" elif model_size == \"120M\":\n",
" model_size_config = dict(hidden_size=768,\n",
" num_hidden_layers=12,\n",
" num_attention_heads=12,\n",
" num_key_value_heads=12,\n",
" intermediate_size=1536)\n",
" elif model_size == \"290M\":\n",
" model_size_config = dict(hidden_size=1024,\n",
" num_hidden_layers=12,\n",
" num_attention_heads=16,\n",
" num_key_value_heads=16,\n",
" intermediate_size=4096)\n",
" elif model_size == \"1B\":\n",
" model_size_config = dict(hidden_size=2048,\n",
" num_hidden_layers=24,\n",
" num_attention_heads=16,\n",
" num_key_value_heads=16,\n",
" intermediate_size=4096)\n",
" elif model_size == \"7B\":\n",
" model_size_config = {}\n",
" return model_size_config"
]
},
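{
"cell_type": "markdown",
"id": "8c9d0e1f-note-param-estimate",
"metadata": {},
"source": [
"A rough parameter-count check for the presets above (an illustrative sketch, not from the original; exact totals also depend on `LlamaConfig` defaults such as `vocab_size=32000`):"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9d0e1f2a-code-param-estimate",
"metadata": {},
"outputs": [],
"source": [
"# Approximate Llama parameter count from the size presets.\n",
"def approx_params(hidden_size, num_hidden_layers, intermediate_size, vocab_size=32000, **_):\n",
"    attn = 4 * hidden_size * hidden_size       # q/k/v/o projections\n",
"    mlp = 3 * hidden_size * intermediate_size  # gate/up/down projections\n",
"    embed = 2 * vocab_size * hidden_size       # embed_tokens + lm_head\n",
"    return num_hidden_layers * (attn + mlp) + embed\n",
"\n",
"for size in [\"60M\", \"120M\", \"290M\", \"1B\"]:\n",
"    print(size, f\"~{approx_params(**get_model_size_config(size))/1e6:.0f}M params\")"
]
},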
{
"cell_type": "code",
"execution_count": 6,
"id": "8bae5ba6-f4cb-44a7-9191-89bab9e930f5",
"metadata": {},
"outputs": [],
"source": [
"def create_model(model_size=\"1B\"):\n",
" model_size_config = get_model_size_config(model_size)\n",
" # download model weights and config files.\n",
" config = LlamaConfig()\n",
" config.update(model_size_config)\n",
" model = LlamaForCausalLM(config)\n",
" return model"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "a04c9743-43b7-451f-90d9-ff7a3201f4e3",
"metadata": {},
"outputs": [],
"source": [
"def free_memory():\n",
" gc.collect()\n",
" torch.cuda.empty_cache()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "0af5bfb5-5d40-4bcc-a3f5-4c5520f5398b",
"metadata": {},
"outputs": [],
"source": [
"set_seed(42)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "7a3990ca-6cd7-47de-813c-442802520487",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Memory allocated: 0.000 GB\n"
]
}
],
"source": [
"print(f\"Memory allocated: {malloc_in_gb():.3f} GB\")"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "8fcce7e1-f68e-497a-bb61-fcf7d0780717",
"metadata": {},
"outputs": [],
"source": [
"model = create_model(\"DEBUG\")"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "4995b7d8-9f80-4752-8224-7a74cfa22f76",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(torch.float32, device(type='cpu'))"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"vocab_size = model.model.embed_tokens.weight.size(0)\n",
"model.dtype, model.device"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "d65fc367-2907-422b-90a8-547f4f3c5bb7",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Memory allocated: 0.000 GB\n"
]
}
],
"source": [
"print(f\"Memory allocated: {malloc_in_gb():.3f} GB\")"
]
},
{
"cell_type": "code",
"execution_count": 74,
"id": "b90ed1a0-67f1-4c70-8b49-7b5ece6b96a2",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Memory allocated: 2.311 GB\n"
]
}
],
"source": [
"model.to(\"cuda\", torch.bfloat16);\n",
"print(f\"Memory allocated: {malloc_in_gb():.3f} GB\")"
]
},
{
"cell_type": "code",
"execution_count": 75,
"id": "96be6449-a666-4168-b3df-c9f48968973b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Memory allocated: 0.009 GB\n"
]
}
],
"source": [
"model = None\n",
"free_memory()\n",
"print(f\"Memory allocated: {malloc_in_gb():.3f} GB\")"
]
},
{
"cell_type": "code",
"execution_count": 76,
"id": "daffded3-7536-453c-86ac-877f677e955e",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Memory allocated: 2.311 GB\n"
]
}
],
"source": [
"model = create_model()\n",
"model.to(\"cuda\", torch.bfloat16);\n",
"print(f\"Memory allocated: {malloc_in_gb():.3f} GB\")"
]
},
{
"cell_type": "code",
"execution_count": 77,
"id": "72909c76-0a7d-406b-9c4f-e848e866b92a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Memory allocated: 0.009 GB\n"
]
}
],
"source": [
"model = None\n",
"free_memory()\n",
"print(f\"Memory allocated: {malloc_in_gb():.3f} GB\")"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "51c5f80a-5745-4cd3-8ee9-5df7f3572062",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Memory allocated: 0.018 GB\n"
]
}
],
"source": [
"# assume packed sequences where max_seqlen = sl\n",
"inputs = [torch.randint(0, vocab_size, (1, sl)) for sl in [512,1024,2048,4096]]\n",
"print(f\"Memory allocated: {malloc_in_gb():.3f} GB\")"
]
},
{
"cell_type": "code",
"execution_count": 115,
"id": "5b8d61f6-4d49-4689-8e8e-b4d044d03720",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Memory allocated (model): 2.311 GB\n",
"Memory allocated torch.Size([1, 512]): 3.605 GB\n",
"Memory allocated (model): 2.311 GB\n",
"Memory allocated torch.Size([1, 1024]): 4.899 GB\n",
"Memory allocated (model): 2.311 GB\n",
"Memory allocated torch.Size([1, 2048]): 7.484 GB\n",
"Memory allocated (model): 2.311 GB\n",
"Memory allocated torch.Size([1, 4096]): 12.683 GB\n"
]
}
],
"source": [
"for x in inputs:\n",
" model = create_model()\n",
" model.to(\"cuda\", torch.bfloat16);\n",
" print(f\"Memory allocated (model): {malloc_in_gb():.3f} GB\")\n",
" output = model(x.to(\"cuda\"))\n",
" print(f\"Memory allocated {x.size()}: {malloc_in_gb():.3f} GB\")\n",
" output, model = None, None\n",
" free_memory()"
]
},
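{
"cell_type": "markdown",
"id": "4e5f6a7b-note-activation-scaling",
"metadata": {},
"source": [
"The bf16 model itself is a constant 2.311 GB, so the remainder is activations: roughly 1.29 GB at 512 tokens, 2.59 GB at 1024, 5.17 GB at 2048 and 10.37 GB at 4096. Activation memory therefore grows close to linearly with sequence length in this run, and dominates the model weights well before 4096 tokens."
]
},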
{
"cell_type": "code",
"execution_count": 80,
"id": "cbc77ccd-05bf-417e-bab5-11e9370e2b7c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Memory allocated: 0.009 GB\n"
]
}
],
"source": [
"print(f\"Memory allocated: {malloc_in_gb():.3f} GB\")"
]
},
{
"cell_type": "markdown",
"id": "48fb26df-a649-43da-bd14-f095e9913ab4",
"metadata": {},
"source": [
"### LoRA"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "4d24d2f1-25bb-432c-be9f-401bd0ff561f",
"metadata": {},
"outputs": [],
"source": [
"def create_lora_model(model_size=\"1B\"):\n",
" model_size_config = get_model_size_config(model_size)\n",
" # download model weights and config files.\n",
" config = LlamaConfig()\n",
" config.update(model_size_config)\n",
" model = LlamaForCausalLM(config)\n",
" peft_config = LoraConfig(\n",
" task_type=TaskType.CAUSAL_LM, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1\n",
" )\n",
" model = get_peft_model(model, peft_config)\n",
" return model"
]
},
{
"cell_type": "code",
"execution_count": 82,
"id": "6b8fb52e-a2b2-4774-9ec4-28642d86dbd9",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Memory allocated: 0.009 GB\n"
]
}
],
"source": [
"print(f\"Memory allocated: {malloc_in_gb():.3f} GB\")"
]
},
{
"cell_type": "code",
"execution_count": 83,
"id": "45b7f215-31e6-4470-9e16-59ffdbc437f1",
"metadata": {},
"outputs": [],
"source": [
"lora_model = create_lora_model()"
]
},
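{
"cell_type": "markdown",
"id": "2c3d4e5f-note-lora-fraction",
"metadata": {},
"source": [
"PEFT can report the trainable fraction directly; with `r=8` adapters it should be well under 1% of all parameters (a quick check added here, not in the original run):"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3d4e5f6a-code-lora-fraction",
"metadata": {},
"outputs": [],
"source": [
"# Only the rank-8 LoRA adapter matrices should be trainable.\n",
"lora_model.print_trainable_parameters()"
]
},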
{
"cell_type": "code",
"execution_count": 84,
"id": "c1eab94a-e0ef-4b90-81ce-75dfeadd289d",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"for n, p in lora_model.named_parameters():\n",
" if p.device.type == \"meta\":\n",
" print(n, p.requires_grad, p.device)"
]
},
{
"cell_type": "code",
"execution_count": 85,
"id": "38806e17-b7ec-4e29-9d26-a5901cac3d32",
"metadata": {},
"outputs": [],
"source": [
"trainable_params, untrainable_params = [],[] \n",
"for n, p in lora_model.named_parameters():\n",
" if p.requires_grad: trainable_params.append(n)\n",
" if not p.requires_grad: untrainable_params.append(n)"
]
},
{
"cell_type": "code",
"execution_count": 86,
"id": "71c2bc73-69d9-4b30-a249-0f79a1781240",
"metadata": {},
"outputs": [],
"source": [
"# trainable_params"
]
},
{
"cell_type": "code",
"execution_count": 87,
"id": "3d793005-bdc6-4739-a423-247fcdfe3be8",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Memory allocated: 0.009 GB\n"
]
}
],
"source": [
"print(f\"Memory allocated: {malloc_in_gb():.3f} GB\")"
]
},
{
"cell_type": "code",
"execution_count": 88,
"id": "9347c1ce-8ec4-4adf-8cf7-ead79664c75d",
"metadata": {},
"outputs": [],
"source": [
"lora_model = None"
]
},
{
"cell_type": "code",
"execution_count": 114,
"id": "7dbcd854-77a7-4917-9de5-4c1c44797e2c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Memory allocated (model): 2.315 GB\n",
"Memory allocated torch.Size([1, 512]): 3.443 GB\n",
"Memory allocated (model): 2.315 GB\n",
"Memory allocated torch.Size([1, 1024]): 4.572 GB\n",
"Memory allocated (model): 2.315 GB\n",
"Memory allocated torch.Size([1, 2048]): 6.826 GB\n",
"Memory allocated (model): 2.315 GB\n",
"Memory allocated torch.Size([1, 4096]): 11.364 GB\n"
]
}
],
"source": [
"for x in inputs:\n",
" lora_model = create_lora_model()\n",
" lora_model.to(\"cuda\", torch.bfloat16);\n",
" print(f\"Memory allocated (model): {malloc_in_gb():.3f} GB\")\n",
" output = lora_model(x.to(\"cuda\"))\n",
" print(f\"Memory allocated {x.size()}: {malloc_in_gb():.3f} GB\")\n",
" output, lora_model = None, None\n",
" free_memory()"
]
},
{
"cell_type": "code",
"execution_count": 90,
"id": "41e23d9b-288f-4974-9e7e-71d87e14995a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Memory allocated: 0.009 GB\n"
]
}
],
"source": [
"print(f\"Memory allocated: {malloc_in_gb():.3f} GB\")"
]
},
{
"cell_type": "markdown",
"id": "31e01883-2551-4784-93a3-e9ca651feb36",
"metadata": {},
"source": [
"### QLoRA"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "fe1306d6-051a-42cb-a313-21c4849d70aa",
"metadata": {},
"outputs": [],
"source": [
"def replace_with_bnb_4bit_linear(\n",
" model,\n",
" modules_to_not_convert=None,\n",
" current_key_name=None,\n",
" quantization_config=None,\n",
" has_been_replaced=False,\n",
" quant_storage=torch.uint8\n",
"):\n",
" \"\"\"\n",
" Private method that wraps the recursion for module replacement.\n",
"\n",
" Returns the converted model and a boolean that indicates if the conversion has been successfull or not.\n",
" \"\"\"\n",
" for name, module in model.named_children():\n",
" if current_key_name is None:\n",
" current_key_name = []\n",
" current_key_name.append(name)\n",
"\n",
" if (isinstance(module, nn.Linear) or isinstance(module, Conv1D)) and name not in modules_to_not_convert:\n",
" # Check if the current key is not in the `modules_to_not_convert`\n",
" if not any(key in \".\".join(current_key_name) for key in modules_to_not_convert):\n",
" # with init_empty_weights():\n",
" if isinstance(module, Conv1D):\n",
" in_features, out_features = module.weight.shape\n",
" else:\n",
" in_features = module.in_features\n",
" out_features = module.out_features\n",
"\n",
" model._modules[name] = bnb.nn.Linear4bit(\n",
" in_features,\n",
" out_features,\n",
" module.bias is not None,\n",
" quantization_config.bnb_4bit_compute_dtype,\n",
" compress_statistics=quantization_config.bnb_4bit_use_double_quant,\n",
" quant_type=quantization_config.bnb_4bit_quant_type,\n",
" quant_storage=quant_storage\n",
" )\n",
" has_been_replaced = True\n",
" # Store the module class in case we need to transpose the weight later\n",
" model._modules[name].source_cls = type(module)\n",
" # Force requires grad to False to avoid unexpected errors\n",
" model._modules[name].requires_grad_(False)\n",
" if len(list(module.children())) > 0:\n",
" _, has_been_replaced = replace_with_bnb_4bit_linear(\n",
" module,\n",
" modules_to_not_convert,\n",
" current_key_name,\n",
" quantization_config,\n",
" has_been_replaced=has_been_replaced,\n",
" )\n",
" # Remove the last key for recursion\n",
" current_key_name.pop(-1)\n",
" return model, has_been_replaced"
]
},
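{
"cell_type": "markdown",
"id": "5f6a7b8c-note-quant-storage",
"metadata": {},
"source": [
"`bnb.nn.Linear4bit` packs two 4-bit weights into each element of the `quant_storage` dtype (`torch.uint8` here), so a 2048×2048 projection collapses to 2048*2048/2 = 2,097,152 bytes plus per-block quantization constants. This is the `(1, 2097152)` uint8 tensor that appears as `B` in the pdb session further below."
]
},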
{
"cell_type": "code",
"execution_count": 15,
"id": "b5f71873-662f-40e8-be83-cc2ef63cd561",
"metadata": {},
"outputs": [],
"source": [
"def create_qlora_model(model_size=\"1B\", with_lora=False):\n",
" \n",
" model_size_config = get_model_size_config(model_size)\n",
" \n",
" # download model weights and config files.\n",
" config = LlamaConfig()\n",
" config.update(model_size_config)\n",
" model = LlamaForCausalLM(config)\n",
" qconfig = BitsAndBytesConfig(load_in_4bit=True, \n",
" bnb_4bit_quant_type=\"nf4\",\n",
" bnb_4bit_use_double_quant=False,\n",
" bnb_4bit_compute_dtype=torch.bfloat16)\n",
" model, has_been_replaced = replace_with_bnb_4bit_linear(model, modules_to_not_convert=[\"lm_head\"], quantization_config=qconfig)\n",
" assert has_been_replaced\n",
" if with_lora:\n",
" peft_config = LoraConfig(\n",
" task_type=TaskType.CAUSAL_LM, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1\n",
" )\n",
" model = get_peft_model(model, peft_config)\n",
" return model"
]
},
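{
"cell_type": "markdown",
"id": "6a7b8c9d-note-nf4-footprint",
"metadata": {},
"source": [
"Rough expectation for the 1B preset: quantized linear layers store ~0.5 byte per weight, while `lm_head` (excluded via `modules_to_not_convert`) and the embeddings stay in bf16. The NF4 model should therefore land well under half of the 2.311 GB bf16 footprint; the runs below measure ~0.86 GB."
]
},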
{
"cell_type": "code",
"execution_count": 16,
"id": "fa4464d7-b904-49d8-bbb4-afa3e4e08d91",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Memory allocated: 0.000 GB\n"
]
}
],
"source": [
"print(f\"Memory allocated: {malloc_in_gb():.3f} GB\")"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "7751158a-b150-4b7e-859b-c2fe1f951450",
"metadata": {},
"outputs": [],
"source": [
"# qlora_model = create_qlora_model()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "062f1d49-97f8-4808-9b89-557fa0c31657",
"metadata": {},
"outputs": [],
"source": [
"# trainable_params, untrainable_params = [],[] \n",
"# for n, p in qlora_model.named_parameters():\n",
"# if p.requires_grad: trainable_params.append(n)\n",
"# if not p.requires_grad: untrainable_params.append(n)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "0fa11aa3-6db3-4328-bfb6-ba88be877e16",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# for n, p in qlora_model.named_parameters():\n",
"# print(n, p.device)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "0ded9a20-2774-48a6-82f2-8c139eca5aec",
"metadata": {},
"outputs": [],
"source": [
"# print(f\"Memory allocated: {malloc_in_gb():.3f} GB\")"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "ba3790ce-e7a5-4141-b01b-6b490d616dd8",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# for n, p in qlora_model.named_parameters():\n",
"# if p.device.type == \"meta\":\n",
"# print(n, p.requires_grad, p.device)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "5a761ec8-6010-4367-b2ce-fa02f41a43c4",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Memory allocated (model): 0.856 GB\n",
"> \u001b[0;32m/home/paperspace/workdir/git/bitsandbytes_fork/bitsandbytes/autograd/_functions.py\u001b[0m(568)\u001b[0;36mmatmul_4bit\u001b[0;34m()\u001b[0m\n",
"\u001b[0;32m 566 \u001b[0;31m \u001b[0;32massert\u001b[0m \u001b[0mquant_state\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0;32m 567 \u001b[0;31m \u001b[0;32mimport\u001b[0m \u001b[0mpdb\u001b[0m\u001b[0;34m;\u001b[0m \u001b[0mpdb\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mset_trace\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0;32m--> 568 \u001b[0;31m \u001b[0;32mif\u001b[0m \u001b[0mA\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnumel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0mA\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mA\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrequires_grad\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0;32m 569 \u001b[0;31m \u001b[0;32mif\u001b[0m \u001b[0mA\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m%\u001b[0m \u001b[0mquant_state\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mblocksize\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0;32m 570 \u001b[0;31m \u001b[0mwarn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mf'Some matrices hidden dimension is not a multiple of {quant_state.blocksize} and efficient inference kernels are not supported for these (slow). Matrix input size found: {A.shape}'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0m\n"
]
},
{
"name": "stdin",
"output_type": "stream",
"text": [
"ipdb> A.shape, A.dtype\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"(torch.Size([1, 512, 2048]), torch.bfloat16)\n"
]
},
{
"name": "stdin",
"output_type": "stream",
"text": [
"ipdb> B.shape, B.dtype\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"(torch.Size([1, 2097152]), torch.uint8)\n"
]
},
{
"name": "stdin",
"output_type": "stream",
"text": [
"ipdb> A.numel() == A.shape[-1] \n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"False\n"
]
},
{
"name": "stdin",
"output_type": "stream",
"text": [
"ipdb> n\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"> \u001b[0;32m/home/paperspace/workdir/git/bitsandbytes_fork/bitsandbytes/autograd/_functions.py\u001b[0m(578)\u001b[0;36mmatmul_4bit\u001b[0;34m()\u001b[0m\n",
"\u001b[0;32m 574 \u001b[0;31m \u001b[0;32mif\u001b[0m \u001b[0mbias\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0;32m 575 \u001b[0;31m \u001b[0mout\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0mbias\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0;32m 576 \u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0;32m 577 \u001b[0;31m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0;32m--> 578 \u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mMatMul4Bit\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mA\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mB\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbias\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mquant_state\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0m\n"
]
},
{
"name": "stdin",
"output_type": "stream",
"text": [
"ipdb> exit\n"
]
}
],
"source": [
"# NF4 quantized\n",
"for x in inputs:\n",
" qlora_model = create_qlora_model(with_lora=False)\n",
" qlora_model.to(\"cuda\", torch.bfloat16);\n",
" print(f\"Memory allocated (model): {malloc_in_gb():.3f} GB\")\n",
" output = qlora_model(x.to(\"cuda\"))\n",
" print(f\"Memory allocated {x.size()}: {malloc_in_gb():.3f} GB\")\n",
" output, qlora_model = None, None\n",
" free_memory()"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "2eafab62-d3bd-4f01-a96e-fb0454010414",
"metadata": {},
"outputs": [],
"source": [
"qlora_model = create_qlora_model(\"DEBUG\", with_lora=False)\n",
"qlora_model.to(\"cuda\", torch.bfloat16);"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "eacb822f-b461-4cf0-8d79-79558dcf0ac3",
"metadata": {},
"outputs": [],
"source": [
"m = qlora_model.model.layers[0].self_attn.q_proj"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "6d75d2cf-24d0-49c4-8968-086bc03c4412",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"m.weight.bnb_quantized"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "f7f7c497-3a73-4603-b13e-edcd4dc62d67",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"2.0"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"2048*2048/2097152"
]
},
{
"cell_type": "code",
"execution_count": 109,
"id": "d8c5b09f-6283-4680-8ce0-332dadcb93af",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Memory allocated: 0.009 GB\n"
]
}
],
"source": [
"print(f\"Memory allocated: {malloc_in_gb():.3f} GB\")"
]
},
{
"cell_type": "code",
"execution_count": 112,
"id": "eee59d02-d87b-48b9-af24-f9988e2fab54",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Memory allocated (model): 0.868 GB\n",
"Memory allocated torch.Size([1, 512]): 2.195 GB\n",
"Memory allocated (model): 0.868 GB\n",
"Memory allocated torch.Size([1, 1024]): 3.523 GB\n",
"Memory allocated (model): 0.868 GB\n",
"Memory allocated torch.Size([1, 2048]): 6.176 GB\n",
"Memory allocated (model): 0.868 GB\n",
"Memory allocated torch.Size([1, 4096]): 11.510 GB\n"
]
}
],
"source": [
"# NF4 quantized + LORA\n",
"for x in inputs:\n",
" qlora_model = create_qlora_model(with_lora=True)\n",
" qlora_model.to(\"cuda\", torch.bfloat16);\n",
" print(f\"Memory allocated (model): {malloc_in_gb():.3f} GB\")\n",
" output = qlora_model(x.to(\"cuda\"))\n",
" print(f\"Memory allocated {x.size()}: {malloc_in_gb():.3f} GB\")\n",
" output, qlora_model = None, None\n",
" free_memory()"
]
},
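{
"cell_type": "markdown",
"id": "7b8c9d0e-note-summary",
"metadata": {},
"source": [
"Summary for the 1B preset: model memory is 2.311 GB (bf16), 2.315 GB (bf16 + LoRA) and 0.868 GB (NF4 + LoRA), so NF4 cuts weight memory by roughly 2.7×. Peak totals at 4096 tokens are 12.683 GB, 11.364 GB and 11.510 GB respectively: activations dominate at long sequence lengths, so quantization alone shrinks the end-to-end footprint only modestly."
]
},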
{
"cell_type": "code",
"execution_count": 111,
"id": "28fc68af-b2e7-4c81-8481-b6dbc8797a78",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Memory allocated: 0.009 GB\n"
]
}
],
"source": [
"print(f\"Memory allocated: {malloc_in_gb():.3f} GB\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.5"
}
},
"nbformat": 4,
"nbformat_minor": 5
}