Custom PyTorch fine-tuning of the Gemma 7B 4-bit model through Unsloth AI's `FastLanguageModel`
{
"cells": [
{
"cell_type": "markdown",
"id": "c196026d-4e1f-4ed7-9a5d-b8cb48b327fe",
"metadata": {},
"source": [
"> https://github.com/huggingface/transformers/pull/15622 - optimizer issue"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "91b63f14-f4c4-42ad-a552-a9557f1434ff",
"metadata": {},
"outputs": [],
"source": [
"!! nvidia-smi | grep -B 0 \"W\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8e08f700-d729-43f0-bc14-f18c79ab28ef",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"os.environ[\"CUDA_DEVICE_ORDER\"] = \"PCI_BUS_ID\"\n",
"os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\"\n",
"os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n",
"os.environ[\"ACCELERATE_MIXED_PRECISION\"] = \"fp16\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fefa89c7-cdfc-48e2-b17e-afd9c293d54b",
"metadata": {},
"outputs": [],
"source": [
"from unsloth import FastLanguageModel\n",
"import torch\n",
"from torch import nn\n",
"from trl import DataCollatorForCompletionOnlyLM\n",
"import numpy as np\n",
"from torch.utils.data import DataLoader\n",
"import bitsandbytes as bnb\n",
"from torch.optim import lr_scheduler\n",
"from functools import partial\n",
"from transformers import get_scheduler\n",
"from transformers.trainer_pt_utils import get_parameter_names  # needed in section 2.1\n",
"from tqdm.auto import tqdm\n",
"from accelerate import Accelerator\n",
"from peft import PeftModel"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4671b113-478b-416f-8d86-c6ea546dc99a",
"metadata": {},
"outputs": [],
"source": [
"# unsloth notebook template\n",
"\n",
"max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!\n",
"dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+\n",
"load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.\n",
"\n",
"# 4bit pre quantized models we support for 4x faster downloading + no OOMs.\n",
"fourbit_models = [\n",
"    \"unsloth/mistral-7b-bnb-4bit\",\n",
"    \"unsloth/mistral-7b-instruct-v0.2-bnb-4bit\",\n",
"    \"unsloth/llama-2-7b-bnb-4bit\",\n",
"    \"unsloth/gemma-7b-bnb-4bit\",\n",
"    \"unsloth/gemma-7b-it-bnb-4bit\", # Instruct version of Gemma 7b\n",
"    \"unsloth/gemma-2b-bnb-4bit\",\n",
"    \"unsloth/gemma-2b-it-bnb-4bit\", # Instruct version of Gemma 2b\n",
"] # More models at https://huggingface.co/unsloth\n",
"\n",
"model, tokenizer = FastLanguageModel.from_pretrained(\n",
"    model_name = \"unsloth/gemma-7b-bnb-4bit\", # Choose ANY! eg teknium/OpenHermes-2.5-Mistral-7B\n",
"    max_seq_length = max_seq_length,\n",
"    dtype = dtype,\n",
"    load_in_4bit = load_in_4bit,\n",
"    # token = \"hf_...\", # use one if using gated models like meta-llama/Llama-2-7b-hf\n",
")\n",
"\n",
"model = FastLanguageModel.get_peft_model(\n",
"    model,\n",
"    r = 4, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128\n",
"    target_modules = [\"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\",\n",
"                      \"gate_proj\", \"up_proj\", \"down_proj\",],\n",
"    lora_alpha = 8,\n",
"    lora_dropout = 0, # Supports any, but = 0 is optimized\n",
"    bias = \"none\", # Supports any, but = \"none\" is optimized\n",
"    use_gradient_checkpointing = True,\n",
"    random_state = 3407,\n",
"    use_rslora = False, # We support rank stabilized LoRA\n",
"    loftq_config = None, # And LoftQ\n",
")"
]
},
{
"cell_type": "markdown",
"id": "dbe1356f-8b1a-4b56-acd3-749f03815e55",
"metadata": {},
"source": [
"> `FastLanguageModel.get_peft_model` also calls `prepare_model_for_kbit_training`\n",
">\n",
"> and sets gradient checkpointing (GC) to True\n",
">\n",
"> Afterwards, `model` is an instance of `PeftModel`"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0b1c6912-bd20-46f4-b502-d6049fab59a1",
"metadata": {},
"outputs": [],
"source": [
"(model.is_gradient_checkpointing, model.quantization_method, model.dtype,\n",
" model.is_loaded_in_4bit, model.config.use_cache)"
]
},
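{
"cell_type": "markdown",
"id": "a1b2c3d4-0000-4aaa-9abc-000000000001",
"metadata": {},
"source": [
"> Sanity check (added for illustration): confirm that `model` is now a `PeftModel`\n",
"> and count the trainable LoRA parameters; uses only the `PeftModel` import above."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a1b2c3d4-0000-4aaa-9abc-000000000002",
"metadata": {},
"outputs": [],
"source": [
"# Added for illustration: get_peft_model wraps the base model in a PeftModel,\n",
"# leaving only the LoRA adapter weights trainable.\n",
"print(isinstance(model, PeftModel))\n",
"trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)\n",
"total = sum(p.numel() for p in model.parameters())\n",
"print(f\"trainable: {trainable:,} / {total:,} ({100 * trainable / total:.4f}%)\")"
]
},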
{
"cell_type": "markdown",
"id": "c316f3ad-2caa-4b60-8d41-0a801fda0c87",
"metadata": {},
"source": [
"> `use_cache = True` is a bug when gradient checkpointing is enabled;\n",
">\n",
"> we need to force it to `False`"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "29b9ec4c-4832-4ad4-9698-b7f7e5a6feb4",
"metadata": {},
"outputs": [],
"source": [
"model.config.use_cache = False"
]
},
{
"cell_type": "markdown",
"id": "4d3b1588-5e43-4ddd-a5fa-16cfa4a8ed10",
"metadata": {},
"source": [
"> Alpaca template"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "29877503-c054-4f4f-8103-2b497e422028",
"metadata": {},
"outputs": [],
"source": [
"# unsloth notebook template\n",
"alpaca_prompt = \"\"\"Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n",
"\n",
"### Instruction:\n",
"{}\n",
"\n",
"### Input:\n",
"{}\n",
"\n",
"### Response:\n",
"{}\"\"\"\n",
"\n",
"EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN\n",
"def formatting_prompts_func(examples):\n",
"    instructions = examples[\"instruction\"]\n",
"    inputs = examples[\"input\"]\n",
"    outputs = examples[\"output\"]\n",
"    texts = []\n",
"    for instruction, input, output in zip(instructions, inputs, outputs):\n",
"        # Must add EOS_TOKEN, otherwise your generation will go on forever!\n",
"        text = alpaca_prompt.format(instruction, input, output) + EOS_TOKEN\n",
"        texts.append(text)\n",
"    return { \"text\" : texts, }\n",
"\n",
"from datasets import load_dataset\n",
"dataset = load_dataset(\"yahma/alpaca-cleaned\", split = \"train\")\n",
"dataset = dataset.map(formatting_prompts_func, batched = True,)"
]
},
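{
"cell_type": "markdown",
"id": "a1b2c3d4-0000-4aaa-9abc-000000000003",
"metadata": {},
"source": [
"> Illustrative peek (added): print the first formatted example to verify the Alpaca\n",
"> template and the trailing EOS token."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a1b2c3d4-0000-4aaa-9abc-000000000004",
"metadata": {},
"outputs": [],
"source": [
"# Added for illustration: the dataset still carries a \"text\" column here,\n",
"# so we can eyeball one formatted prompt and confirm it ends with EOS.\n",
"print(dataset[0][\"text\"][:500])\n",
"print(\"ends with EOS:\", dataset[0][\"text\"].endswith(EOS_TOKEN))"
]
},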
{
"cell_type": "markdown",
"id": "656e6103-b2ab-4a61-81a6-d9cd2d90f396",
"metadata": {},
"source": [
"> The following section replicates Trainer's behavior for\n",
">\n",
"> 1. data processing, e.g., collator, dataloader, etc.\n",
">\n",
"> 2. the optimizer, e.g., args, parameter groups, etc.\n",
">"
]
},
{
"cell_type": "markdown",
"id": "b7316e29-6630-462f-987c-93f8f3fc747c",
"metadata": {},
"source": [
"> 1.1 tokenization - no padding, but with truncation"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e14cca56-1e89-41d9-bd3c-ee4704e996e6",
"metadata": {},
"outputs": [],
"source": [
"def tokenize_function(example):\n",
"    # tokenize the incoming batch (not the whole dataset)\n",
"    outputs = tokenizer(example[\"text\"],\n",
"                        padding=False,\n",
"                        truncation=True,\n",
"                        max_length=max_seq_length,\n",
"                        return_overflowing_tokens=False,\n",
"                        return_length=False)\n",
"    return {\"input_ids\": outputs[\"input_ids\"], \"attention_mask\": outputs[\"attention_mask\"]}\n",
"\n",
"dataset = dataset.map(tokenize_function,\n",
"                      batched=True,\n",
"                      remove_columns=dataset.column_names,\n",
"                      num_proc=2,\n",
"                      batch_size=1000)"
]
},
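{
"cell_type": "markdown",
"id": "a1b2c3d4-0000-4aaa-9abc-000000000005",
"metadata": {},
"source": [
"> Illustrative check (added): rough sequence-length statistics on a sample of the\n",
"> tokenized dataset; nothing should exceed `max_seq_length`."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a1b2c3d4-0000-4aaa-9abc-000000000006",
"metadata": {},
"outputs": [],
"source": [
"# Added for illustration: truncation above caps every example at max_seq_length.\n",
"sample = dataset[:2000][\"input_ids\"]\n",
"lens = [len(ids) for ids in sample]\n",
"print(f\"min={min(lens)}, max={max(lens)}, mean={sum(lens) / len(lens):.1f}\")"
]
},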
{
"cell_type": "markdown",
"id": "835e0f3b-f98d-4e03-8b90-c0662e240c65",
"metadata": {},
"source": [
"> split the dataset into training and test sets"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e7823522-93e1-41c1-85de-048c7a88c6b5",
"metadata": {},
"outputs": [],
"source": [
"dataset = dataset.train_test_split(test_size=0.1)"
]
},
{
"cell_type": "markdown",
"id": "ccdef770-7a4a-4f82-a69d-a96d51138764",
"metadata": {},
"source": [
"> 1.2 data collator - this collator automatically excludes non-assistant tokens from the loss\n",
">\n",
"> note: it does not exclude `<eos>` from the loss,\n",
"> and it does not mask any tokens from attention\n",
"\n",
"> 1.3 create the data loaders"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "85804fcc-548a-453d-86c9-902c33fbc318",
"metadata": {},
"outputs": [],
"source": [
"collator = DataCollatorForCompletionOnlyLM(\"### Response:\", tokenizer=tokenizer)"
]
},
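{
"cell_type": "markdown",
"id": "a1b2c3d4-0000-4aaa-9abc-000000000007",
"metadata": {},
"source": [
"> Minimal sanity check (added): collate one tokenized example and confirm that labels\n",
"> before the `### Response:` marker are set to `-100` (excluded from the loss), while\n",
"> the attention mask stays all ones."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a1b2c3d4-0000-4aaa-9abc-000000000008",
"metadata": {},
"outputs": [],
"source": [
"# Added for illustration: DataCollatorForCompletionOnlyLM masks everything up to\n",
"# the response template in `labels`, so only the completion contributes to the loss.\n",
"batch = collator([dataset[\"train\"][0]])\n",
"labels = batch[\"labels\"][0]\n",
"n_masked = int((labels == -100).sum())\n",
"print(f\"{n_masked} of {labels.numel()} label positions are masked (-100)\")\n",
"print(\"attention fully unmasked:\", bool(batch[\"attention_mask\"][0].all()))"
]
},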
{
"cell_type": "markdown",
"id": "f4522849-f208-4874-aff3-7e14868e180b",
"metadata": {},
"source": [
"> add `pin_memory` for faster host-to-GPU data transfer"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "57f1ffbb-ea96-4f37-b8da-d21114eb7538",
"metadata": {},
"outputs": [],
"source": [
"seed = 42\n",
"torch.cuda.manual_seed(seed)\n",
"torch.manual_seed(seed)\n",
"np.random.seed(seed)\n",
"train_dataloader = DataLoader(dataset['train'], batch_size=2, shuffle=True, collate_fn=collator, pin_memory=True)\n",
"test_dataloader = DataLoader(dataset['test'], batch_size=2, shuffle=True, collate_fn=collator, pin_memory=True)"
]
},
{
"cell_type": "markdown",
"id": "61ba4745-11aa-43ad-a6cd-1f34d2cd4e27",
"metadata": {},
"source": [
"> 2. 8-bit AdamW optimizer and lambda linear scheduler\n",
">\n",
"> Note: there is no difference between AdamW/AdamW32bit/AdamW8bit other than\n",
">\n",
"> the `optim_bits` they pass (see the sketch below)"
]
},
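{
"cell_type": "markdown",
"id": "a1b2c3d4-0000-4aaa-9abc-000000000009",
"metadata": {},
"source": [
"> Hedged sketch (added): a tiny demonstration that `AdamW8bit` is simply `AdamW` with\n",
"> `optim_bits=8`. Note that, as far as we understand, bitsandbytes keeps tensors below\n",
"> `min_8bit_size` (4096 by default) in 32-bit optimizer state regardless."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a1b2c3d4-0000-4aaa-9abc-00000000000a",
"metadata": {},
"outputs": [],
"source": [
"# Added for illustration: both constructions configure the same AdamW update;\n",
"# they differ only in the optim_bits passed through.\n",
"dummy = nn.Parameter(torch.randn(8))\n",
"opt_8bit = bnb.optim.AdamW8bit([dummy], lr=2e-4)\n",
"opt_generic = bnb.optim.AdamW([dummy], lr=2e-4, optim_bits=8)\n",
"print(type(opt_8bit).__name__, type(opt_generic).__name__)"
]
},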
{
"cell_type": "markdown",
"id": "fc5c8e43-d9cf-41c6-823b-68a8b43a9c93",
"metadata": {},
"source": [
"> 2.1 define the set of parameters that receive weight decay"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6d6ae549-ba5e-470f-b7d4-752d5ca46f6a",
"metadata": {},
"outputs": [],
"source": [
"# replicate Trainer: decay everything except layer-norm weights and biases\n",
"decay_parameters = get_parameter_names(model, [nn.LayerNorm])\n",
"decay_parameters = [name for name in decay_parameters if \"bias\" not in name]"
]
},
{
"cell_type": "markdown",
"id": "4a70aff7-53bb-443d-8ab7-328f3b5442bf",
"metadata": {},
"source": [
"> 2.2 define the optimizer parameter groups and instantiate the optimizer"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "41674641-8b70-4358-bb4f-3026ca2ceeba",
"metadata": {},
"outputs": [],
"source": [
"learning_rate = 2e-4\n",
"beta1, beta2 = 0.9, 0.999\n",
"optim_bits = 8\n",
"is_paged = False\n",
"epsilon = 1e-8\n",
"weight_decay = 1e-2\n",
"\n",
"optimizer_cls = bnb.optim.AdamW\n",
"optimizer_kwargs = {\n",
"    \"lr\": learning_rate,\n",
"    \"betas\": (beta1, beta2),\n",
"    \"eps\": epsilon,\n",
"    \"optim_bits\": optim_bits,\n",
"    \"is_paged\": is_paged,\n",
"}\n",
"\n",
"optimizer_grouped_parameters = [\n",
"    {\n",
"        \"params\": [\n",
"            p for n, p in model.named_parameters() if (n in decay_parameters and p.requires_grad)\n",
"        ],\n",
"        \"weight_decay\": weight_decay,\n",
"    },\n",
"    {\n",
"        \"params\": [\n",
"            p for n, p in model.named_parameters() if (n not in decay_parameters and p.requires_grad)\n",
"        ],\n",
"        \"weight_decay\": 0.0,\n",
"    },\n",
"]\n",
"\n",
"optimizer = optimizer_cls(optimizer_grouped_parameters, **optimizer_kwargs)"
]
},
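{
"cell_type": "markdown",
"id": "a1b2c3d4-0000-4aaa-9abc-00000000000b",
"metadata": {},
"source": [
"> Quick check (added): count the trainable parameters in each group; with\n",
"> `bias = \"none\"`, essentially all LoRA weights should land in the decay group."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a1b2c3d4-0000-4aaa-9abc-00000000000c",
"metadata": {},
"outputs": [],
"source": [
"# Added for illustration: sanity-check the decay / no-decay split defined above.\n",
"for i, group in enumerate(optimizer_grouped_parameters):\n",
"    n = sum(p.numel() for p in group[\"params\"])\n",
"    print(f\"group {i}: weight_decay={group['weight_decay']}, params={n:,}\")"
]
},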
{
"cell_type": "markdown",
"id": "1acc434e-b4aa-4042-99f1-7476cb9c82d6",
"metadata": {},
"source": [
"! This is one of the trickiest issues identified in the HF Transformers codebase\n",
"> 2.3 apply `32bit` for the embedding layer\n",
">\n",
"> this might be irrelevant here, as QLoRA does not touch the embedding layer\n",
">\n",
"> Note: see the issues in the transformers repo -\n",
">\n",
"> > https://github.com/huggingface/transformers/issues/14819#issuecomment-1016017746\n",
"> > https://github.com/huggingface/transformers/pull/15622\n",
"> >\n",
"> manually register the 32-bit override, following the source code in `transformers/src/transformers/trainer.py`\n",
"> > https://github.com/huggingface/transformers/blob/main/src/transformers/trainer.py#L1052\n",
"> >\n",
"> Note: according to the author of `bitsandbytes`, with `\"weight\"` referencing the parameter name,\n",
"> it no longer matters whether this registration happens before or after moving the model to the GPU.\n",
"> And we always load the model directly onto the GPU by default"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e5bcfb15-947b-4829-bac2-b980d5920d56",
"metadata": {},
"outputs": [],
"source": [
"manager = bnb.optim.GlobalOptimManager.get_instance()\n",
"\n",
"skipped = 0\n",
"for module in model.modules():\n",
"    if isinstance(module, nn.Embedding):\n",
"        skipped += sum({p.data_ptr(): p.numel() for p in module.parameters()}.values())\n",
"        print(f\"skipped {module}: {skipped/2**20}M params\")\n",
"        manager.register_module_override(module, \"weight\", {\"optim_bits\": 32})\n",
"        print(f\"bitsandbytes: will optimize {module} in fp32\")\n",
"print(f\"skipped: {skipped/2**20}M params\")\n",
"\n",
"print('\\n\\n\\n', optimizer.mng.module_weight_config_triple)"
]
},
{
"cell_type": "markdown",
"id": "11010d0f-92ab-4a9b-b6d2-38ab394d3dcc",
"metadata": {},
"source": [
"> 2.4 define the scheduler - replicate Trainer's default, i.e., `LambdaLR`"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ff65641f-47b5-4948-b2dc-6ae7f5c73c35",
"metadata": {},
"outputs": [],
"source": [
"num_training_steps = 60\n",
"num_warmup_steps = 5\n",
"name = \"linear\"\n",
"scheduler = get_scheduler(name=name,\n",
"                          optimizer=optimizer,\n",
"                          num_warmup_steps=num_warmup_steps,\n",
"                          num_training_steps=num_training_steps\n",
"                          )"
]
},
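{
"cell_type": "markdown",
"id": "a1b2c3d4-0000-4aaa-9abc-00000000000d",
"metadata": {},
"source": [
"> Illustration (added): replay the same linear schedule on a throwaway optimizer to\n",
"> see the warmup and linear decay, without touching the real optimizer's state."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a1b2c3d4-0000-4aaa-9abc-00000000000e",
"metadata": {},
"outputs": [],
"source": [
"# Added for illustration: get_scheduler(\"linear\", ...) returns a LambdaLR that\n",
"# ramps the LR up over num_warmup_steps, then decays it linearly to zero.\n",
"_p = nn.Parameter(torch.zeros(1))\n",
"_opt = torch.optim.SGD([_p], lr=learning_rate)\n",
"_sched = get_scheduler(name=name,\n",
"                       optimizer=_opt,\n",
"                       num_warmup_steps=num_warmup_steps,\n",
"                       num_training_steps=num_training_steps)\n",
"lrs = []\n",
"for _ in range(num_training_steps):\n",
"    _opt.step()\n",
"    _sched.step()\n",
"    lrs.append(_sched.get_last_lr()[0])\n",
"print([f\"{lr:.2e}\" for lr in lrs[:8]], \"...\", f\"{lrs[-1]:.2e}\")"
]
},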
{
"cell_type": "markdown",
"id": "78dd7bca-cf70-45ff-a197-ea42f613cad0",
"metadata": {},
"source": [
"> 2.5 prepare the accelerator\n",
">\n",
"> Note: specify mixed precision - however, this might not help reduce memory"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e65005c7-869d-4161-9bd2-cdf0768586d5",
"metadata": {},
"outputs": [],
"source": [
"accelerator = Accelerator(mixed_precision=\"fp16\", gradient_accumulation_steps=4)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e228483a-f038-40d2-9a2f-3f0e86608eaf",
"metadata": {},
"outputs": [],
"source": [
"model, optimizer, train_dataloader, scheduler = accelerator.prepare(\n",
"    model, optimizer, train_dataloader, scheduler\n",
")"
]
},
{
"cell_type": "markdown",
"id": "597aa69e-07f0-4aea-85e7-ded6934a7f7d",
"metadata": {},
"source": [
"> The following section is the main training loop\n",
">\n",
"> it is merely for demonstration - we are not chasing model loss or accuracy,\n",
">\n",
"> and we completely ignore evaluation in this demo"
]
},
{
"cell_type": "markdown",
"id": "bd07d7b3-0120-4d78-8cff-cb3cfad874d0",
"metadata": {},
"source": [
"> 3.1 get initial memory stats"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "22526057-117d-432b-b7ee-80fa5aca0adf",
"metadata": {},
"outputs": [],
"source": [
"# Unsloth template continues\n",
"gpu_stats = torch.cuda.get_device_properties(0)\n",
"start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)\n",
"max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)\n",
"print(f\"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.\")\n",
"print(f\"{start_gpu_memory} GB of memory reserved.\")"
]
},
{
"cell_type": "markdown",
"id": "979b781b-0e70-445a-8833-a5f38adccc75",
"metadata": {},
"source": [
"> 3.2 `with accelerator.accumulate(model):` saves memory for some reason, even when\n",
">\n",
"> `gradient_accumulation_steps = 1`, which is equivalent to no accumulation\n",
">\n",
"> Note that gradient checkpointing is what substantially cuts memory consumption"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c7dfd5e4-df3b-4c32-9b95-c97359825eb4",
"metadata": {},
"outputs": [],
"source": [
"num_epochs = 1\n",
"total_loss = 0\n",
"\n",
"progress_bar = tqdm(range(num_training_steps))\n",
"for epoch in range(num_epochs):\n",
"    model.train()\n",
"    step_cnt = 0\n",
"    for batch in train_dataloader:\n",
"        # wrapping in accumulate() even without actually accumulating;\n",
"        # this saves memory between iterations\n",
"        with accelerator.accumulate(model):\n",
"            outputs = model(**batch)\n",
"            loss = outputs.loss\n",
"            total_loss += loss.detach().float()\n",
"            accelerator.backward(loss)\n",
"\n",
"            optimizer.step()\n",
"            scheduler.step()\n",
"            optimizer.zero_grad()\n",
"            progress_bar.update(1)\n",
"\n",
"        print(f'iter: {step_cnt}, running_loss: {loss.item()}')\n",
"\n",
"        # stop after exactly num_training_steps optimizer steps\n",
"        step_cnt += 1\n",
"        if step_cnt >= num_training_steps:\n",
"            break"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "12aec925-91ec-49a4-9492-0e35e1945397",
"metadata": {},
"outputs": [],
"source": [
"# modified unsloth notebook template\n",
"used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)\n",
"used_memory_for_lora = round(used_memory - start_gpu_memory, 3)\n",
"used_percentage = round(used_memory / max_memory * 100, 3)\n",
"lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)\n",
"\n",
"print(f\"Peak reserved memory = {used_memory} GB.\")\n",
"print(f\"Peak reserved memory for training = {used_memory_for_lora} GB.\")\n",
"print(f\"Peak reserved memory % of max memory = {used_percentage} %.\")\n",
"print(f\"Peak reserved memory for training % of max memory = {lora_percentage} %.\")"
]
},
{
"cell_type": "markdown",
"id": "31cf165d-1216-4dc8-8006-0f3cae62ba27",
"metadata": {},
"source": [
"> 4. the following template is from `unsloth AI` for inference\n",
">\n",
"> Note: this is irrelevant to the demo in this notebook;\n",
"> we keep it here for completeness"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d47e51c9-9270-49de-8f77-dc07eab8ac8a",
"metadata": {},
"outputs": [],
"source": [
"# unsloth AI template\n",
"# alpaca_prompt = Copied from above\n",
"FastLanguageModel.for_inference(model) # Enable native 2x faster inference\n",
"inputs = tokenizer(\n",
"[\n",
"    alpaca_prompt.format(\n",
"        \"Continue the Fibonacci sequence.\", # instruction\n",
"        \"1, 1, 2, 3, 5, 8\", # input\n",
"        \"\", # output - leave this blank for generation!\n",
"    )\n",
"], return_tensors = \"pt\").to(\"cuda\")\n",
"\n",
"outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)\n",
"tokenizer.batch_decode(outputs)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "177d182c-8bb2-41db-851f-aa3c82755b78",
"metadata": {},
"outputs": [],
"source": [
"# unsloth AI template\n",
"# alpaca_prompt = Copied from above\n",
"FastLanguageModel.for_inference(model) # Enable native 2x faster inference\n",
"inputs = tokenizer(\n",
"[\n",
"    alpaca_prompt.format(\n",
"        \"Continue the Fibonacci sequence.\", # instruction\n",
"        \"1, 1, 2, 3, 5, 8\", # input\n",
"        \"\", # output - leave this blank for generation!\n",
"    )\n",
"], return_tensors = \"pt\").to(\"cuda\")\n",
"\n",
"from transformers import TextStreamer\n",
"text_streamer = TextStreamer(tokenizer)\n",
"_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "16339efa-e41a-4d2a-b376-f3beecfa5bd8",
"metadata": {},
"outputs": [],
"source": [
"model.save_pretrained(\"lora_model\")"
]
},
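{
"cell_type": "markdown",
"id": "a1b2c3d4-0000-4aaa-9abc-00000000000f",
"metadata": {},
"source": [
"> Reloading note (added, mirroring the stock Unsloth template): the folder written by\n",
"> `save_pretrained` contains only the LoRA adapters; they can be reloaded with\n",
"> `FastLanguageModel.from_pretrained`. Guarded with `if False` so it does not run here."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a1b2c3d4-0000-4aaa-9abc-000000000010",
"metadata": {},
"outputs": [],
"source": [
"# Added for illustration (follows the stock Unsloth notebook template).\n",
"if False:\n",
"    model, tokenizer = FastLanguageModel.from_pretrained(\n",
"        model_name = \"lora_model\", # folder written by save_pretrained above\n",
"        max_seq_length = max_seq_length,\n",
"        dtype = dtype,\n",
"        load_in_4bit = load_in_4bit,\n",
"    )\n",
"    FastLanguageModel.for_inference(model)"
]
},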
{
"cell_type": "code",
"execution_count": null,
"id": "f648f48d-ee1a-4149-964e-a9810d390c5d",
"metadata": {},
"outputs": [],
"source": [
"exit(0)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2c33a77b-3e62-43ac-a3a0-061812797f0b",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.18"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
This notebook demos how to unbox the `SFTTrainer` class from the HF implementation in the `trl` library. The current status still shows roughly 15% more memory consumption on a T4. Thanks @unslothai for providing the newly implemented `FastLanguageModel` and a free notebook template to the community.