base-vs-instruction-fine-tuned.ipynb
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/coldfire84/56d9f426d71cc6c6422b25309293eba7/base-vs-instruction-fine-tuned.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "GZnoKg_OA07d"
},
"source": [
"# Base Model vs Instruction-Tuned Comparison"
]
},
{
"cell_type": "markdown",
"source": [
"A larger-scale fine-tuning example, using Llama3, can be seen here: https://colab.research.google.com/drive/1BqXRVQlGvdR5DJvvos8xdAL_y_eLgkSd?usp=drive_open"
],
"metadata": {
"id": "I4_gaM6vlDj4"
}
},
{
"cell_type": "markdown",
"metadata": {
"id": "W4IGMoUS9G9c"
},
"source": [
"Install prerequisites and log in to Hugging Face:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "jHhWq4yvmrhT"
},
"outputs": [],
"source": [
"%%capture\n",
"# Upgrade libraries (needed when using a GPU backend/supported model)\n",
"!pip install -U transformers accelerate\n",
"!pip install -U datasets"
]
},
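{
"cell_type": "markdown",
"metadata": {},
"source": [
"Optional sanity check (added sketch, not part of the original flow): confirm whether a GPU runtime is actually available, since everything below silently falls back to CPU otherwise:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"# True on a Colab GPU runtime (e.g. T4); False means everything runs on CPU\n",
"print(f\"CUDA available: {torch.cuda.is_available()}\")"
]
},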
{
"cell_type": "markdown",
"metadata": {
"id": "3xwda9g-o6BJ"
},
"source": [
"Prepare the environment (imports and configuration):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "HMMUjp1b-40s"
},
"outputs": [],
"source": [
"%%capture\n",
"import torch\n",
"import datasets\n",
"import random\n",
"from transformers import pipeline, set_seed, AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, AutoModelForSequenceClassification\n",
"# Log in to Hugging Face using the HF_TOKEN secret stored in Colab\n",
"from huggingface_hub import login\n",
"from google.colab import userdata\n",
"token = userdata.get('HF_TOKEN')\n",
"login(token)"
]
},
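{
"cell_type": "markdown",
"metadata": {},
"source": [
"Note (added sketch): `google.colab.userdata` only exists on Colab. If you run this notebook elsewhere, one illustrative alternative is to read the token interactively:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative alternative for non-Colab environments: prompt for the token\n",
"# interactively instead of reading it from Colab's userdata store\n",
"from getpass import getpass\n",
"from huggingface_hub import login\n",
"login(getpass('Hugging Face token: '))"
]
},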
{
"cell_type": "markdown",
"metadata": {
"id": "b3fbrr4R_D9i"
},
"source": [
"Load the base `EleutherAI/pythia-70m` model and run an initial instruction-style prompt:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "W-3Ksdn7_Bfr"
},
"outputs": [],
"source": [
"%%capture\n",
"# Helper function to enable model testing\n",
"def testbasemodel(model_name, args):\n",
"    model = AutoModelForCausalLM.from_pretrained(\n",
"        model_name,\n",
"        # On a model/environment with a GPU, uncomment the options below:\n",
"        #device_map=\"auto\",\n",
"        #torch_dtype=\"auto\",\n",
"        #trust_remote_code=True,\n",
"    )\n",
"    # Get the appropriate tokenizer automatically, using HF AutoTokenizer\n",
"    tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
"    pipe = pipeline(\n",
"        \"text-generation\",\n",
"        model=model,\n",
"        tokenizer=tokenizer,\n",
"    )\n",
"    output = pipe(\"Does Lamini support generating code?\", **args)\n",
"    return output[0]['generated_text']\n",
"# Base Model\n",
"base_model_name = \"EleutherAI/pythia-70m\"\n",
"base_generation_args = {\n",
"    \"max_length\": 50,\n",
"    \"truncation\": True,\n",
"}\n",
"base_output = testbasemodel(base_model_name, base_generation_args)"
]
},
{
"cell_type": "markdown",
"source": [
"Review base model output:"
],
"metadata": {
"id": "_2zZH6dfTmQj"
}
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "cRyGpzktYoR7"
},
"outputs": [],
"source": [
"print(f'Base model output ({base_model_name}): {base_output}')"
]
},
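{
"cell_type": "markdown",
"metadata": {},
"source": [
"Optional (added sketch): `testbasemodel` decodes greedily by default. Rerunning it with sampling enabled shows how decoding settings alone change the raw completion; the values below are illustrative:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Rerun the helper with sampling enabled (illustrative decoding settings)\n",
"sampled_args = {\n",
"    \"max_length\": 50,\n",
"    \"truncation\": True,\n",
"    \"do_sample\": True,\n",
"    \"temperature\": 0.7,\n",
"    \"top_p\": 0.9,\n",
"}\n",
"print(testbasemodel(base_model_name, sampled_args))"
]
},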
{
"cell_type": "markdown",
"source": [
"Prepare the training dataset:"
],
"metadata": {
"id": "RdrTbR_wHpCR"
}
},
{
"cell_type": "code",
"source": [
"def get_tokenize_function(tokenizer, max_length):\n",
"    \"\"\"\n",
"    Returns a function to tokenize text data using the provided tokenizer with the specified maximum length.\n",
"    This assumes NO prompt template is required; adjust as needed for your base model.\n",
"    \"\"\"\n",
"    def tokenize_function(examples):\n",
"        # Determine which key(s) are present and should be concatenated\n",
"        keys = ['question', 'answer', 'input', 'output', 'text']\n",
"        texts = []\n",
"        for idx in range(len(examples[next(iter(examples))])):  # Loop through the batch\n",
"            text = ''\n",
"            for key in keys:\n",
"                if key in examples:\n",
"                    text += examples[key][idx]\n",
"            texts.append(text)\n",
"        # Tokenize all texts in the batch with padding and truncation\n",
"        tokenized_inputs = tokenizer(\n",
"            texts,\n",
"            max_length=max_length,\n",
"            padding='max_length',\n",
"            truncation=True,\n",
"            return_tensors='np'\n",
"        )\n",
"        # Causal LM objective: labels are a copy of the input IDs\n",
"        tokenized_inputs[\"labels\"] = tokenized_inputs[\"input_ids\"]\n",
"        return tokenized_inputs\n",
"\n",
"    return tokenize_function\n",
"\n",
"def load_and_tokenize_data(tokenizer, config):\n",
"    random.seed(42)  # Set a seed for reproducibility\n",
"    # Load the dataset\n",
"    if config[\"datasets\"][\"use_hf\"]:\n",
"        dataset = datasets.load_dataset(config[\"datasets\"][\"path\"])\n",
"    else:\n",
"        # Load from a local JSON file; without a split argument this returns a\n",
"        # DatasetDict (with a 'train' split), so the loop below works for both branches\n",
"        dataset = datasets.load_dataset(\"json\", data_files=config[\"datasets\"][\"path\"])\n",
"    # Process each split available in the dataset\n",
"    tokenize_func = get_tokenize_function(tokenizer, config[\"model\"]['max_length'])\n",
"    for split in dataset.keys():\n",
"        dataset[split] = dataset[split].map(tokenize_func, batched=True)\n",
"        dataset[split].set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])\n",
"    return dataset\n",
"\n",
"def decode_tokens(tokenizer, input_ids):\n",
"    \"\"\"\n",
"    Decodes token IDs to readable text using the provided tokenizer.\n",
"    \"\"\"\n",
"    return tokenizer.decode(input_ids, skip_special_tokens=True)\n",
"\n",
"def display_dataset_info(datasets):\n",
"    for split_name, split_data in datasets.items():\n",
"        print(f\"--- {split_name.upper()} SPLIT ---\")\n",
"        print(f\"Number of examples: {len(split_data)}\")\n",
"        print(\"Columns:\", split_data.column_names)\n",
"        print(\"Features:\", split_data.features)\n",
"        print(\"\\n\")\n",
"\n",
"def display_sample_data(datasets, tokenizer, num_samples=5):\n",
"    for split_name, split_data in datasets.items():\n",
"        print(f\"--- {split_name.upper()} SPLIT SAMPLES ---\")\n",
"        sample_data = split_data.shuffle(seed=42).select(range(min(num_samples, len(split_data))))\n",
"        for idx, example in enumerate(sample_data):\n",
"            print(f\"Sample {idx+1}:\")\n",
"            decoded_text = decode_tokens(tokenizer, example['input_ids'])\n",
"            print(f\"Decoded Text: {decoded_text}\")\n",
"            print({key: val for key, val in example.items() if key in ['input_ids', 'attention_mask', 'labels']})\n",
"            print(\"\\n\")\n",
"\n",
"def check_dataset_shapes(dataset):\n",
"    for example in dataset:\n",
"        input_shape = example['input_ids'].shape\n",
"        label_shape = example['labels'].shape\n",
"        if input_shape != label_shape:\n",
"            print(f\"Mismatch found! Input shape: {input_shape}, Label shape: {label_shape}\")\n",
"\n",
"# Configuration settings\n",
"dataset_path = \"lamini/lamini_docs\"\n",
"use_hf = True\n",
"training_config = {\n",
"    \"model\": {\n",
"        \"pretrained_name\": base_model_name,\n",
"        \"max_length\": 2048\n",
"    },\n",
"    \"datasets\": {\n",
"        \"use_hf\": use_hf,\n",
"        \"path\": dataset_path\n",
"    },\n",
"    \"verbose\": True\n",
"}\n",
"# Load tokenizer and reuse the EOS token for padding\n",
"tokenizer = AutoTokenizer.from_pretrained(training_config['model']['pretrained_name'])\n",
"tokenizer.pad_token = tokenizer.eos_token\n",
"# Load, tokenize, and split the dataset\n",
"training_datasets = load_and_tokenize_data(tokenizer, training_config)\n",
"train_dataset = training_datasets.get('train', None)\n",
"test_dataset = training_datasets.get('test', None)\n",
"validation_dataset = training_datasets.get('validation', None)\n",
"print(\"Datasets are loaded and tokenized.\")"
],
"metadata": {
"id": "iOjZcNT9xx__"
},
"execution_count": null,
"outputs": []
},
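{
"cell_type": "markdown",
"metadata": {},
"source": [
"Aside (added sketch): `tokenize_function` copies `input_ids` straight into `labels`, so the loss is also computed on padding tokens. A common refinement is to mask padded positions with `-100`, which the cross-entropy loss ignores. `mask_pad_labels` below is a hypothetical helper, not applied in this notebook:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"\n",
"def mask_pad_labels(tokenized_inputs):\n",
"    # Copy input_ids into labels, then set padded positions (attention_mask == 0)\n",
"    # to -100 so the cross-entropy loss ignores them during training\n",
"    labels = np.array(tokenized_inputs[\"input_ids\"]).copy()\n",
"    labels[np.array(tokenized_inputs[\"attention_mask\"]) == 0] = -100\n",
"    tokenized_inputs[\"labels\"] = labels\n",
"    return tokenized_inputs"
]
},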
{
"cell_type": "markdown",
"source": [
"Check dataset shapes, review dataset information, and inspect sample data:"
],
"metadata": {
"id": "PrRGfa2bPJRg"
}
},
{
"cell_type": "code",
"source": [
"check_dataset_shapes(train_dataset)\n",
"display_dataset_info(training_datasets)\n",
"display_sample_data(training_datasets, tokenizer, num_samples=3)"
],
"metadata": {
"id": "iStUR87yO9Gp"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"Train the base model:"
],
"metadata": {
"id": "0depRbjrTNVm"
}
},
{
"cell_type": "code",
"source": [
"def train(model, max_steps, output_dir):\n",
"    # Setup training arguments\n",
"    training_args = TrainingArguments(\n",
"        learning_rate=1.0e-5,\n",
"        num_train_epochs=1,\n",
"        max_steps=max_steps,\n",
"        per_device_train_batch_size=1,\n",
"        output_dir=output_dir,\n",
"        overwrite_output_dir=True,\n",
"        disable_tqdm=False,\n",
"        eval_steps=10,\n",
"        save_steps=50,\n",
"        warmup_steps=1,\n",
"        per_device_eval_batch_size=1,\n",
"        evaluation_strategy=\"steps\",\n",
"        logging_strategy=\"steps\",\n",
"        logging_steps=1,\n",
"        optim=\"adafactor\",\n",
"        gradient_accumulation_steps=4,\n",
"        gradient_checkpointing=False,\n",
"        load_best_model_at_end=True,\n",
"        save_total_limit=1,\n",
"        metric_for_best_model=\"eval_loss\",\n",
"        greater_is_better=False\n",
"    )\n",
"\n",
"    # Could use SFTTrainer from trl; requires rework of dataset and trainer args\n",
"    # See: https://huggingface.co/docs/trl/sft_trainer\n",
"    trainer = Trainer(\n",
"        model=model,\n",
"        args=training_args,\n",
"        train_dataset=train_dataset,\n",
"        eval_dataset=test_dataset\n",
"    )\n",
"\n",
"    # Start training\n",
"    training_output = trainer.train()\n",
"    save_dir = f'{output_dir}/final'\n",
"    trainer.save_model(save_dir)\n",
"    tokenizer.save_pretrained(save_dir)\n",
"    print(\"Saved model to:\", save_dir)\n",
"\n",
"# Setup\n",
"model = AutoModelForCausalLM.from_pretrained(base_model_name)\n",
"max_steps = 100\n",
"output_dir = f\"{base_model_name}_lamini_{max_steps}_steps\"\n",
"# Train\n",
"train(model, max_steps, output_dir)"
],
"metadata": {
"id": "scdR2kTUTPrw"
},
"execution_count": null,
"outputs": []
},
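{
"cell_type": "markdown",
"metadata": {},
"source": [
"Optional (added sketch): rather than eyeballing generations, you can quantify the effect of fine-tuning by computing the evaluation loss (and perplexity) of the saved model on the test split; variable names follow those defined above:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import math\n",
"# Reload the fine-tuned weights and measure loss on the held-out test split\n",
"eval_model = AutoModelForCausalLM.from_pretrained(f'{output_dir}/final')\n",
"eval_trainer = Trainer(\n",
"    model=eval_model,\n",
"    args=TrainingArguments(output_dir='eval_tmp', per_device_eval_batch_size=1, report_to='none'),\n",
"    eval_dataset=test_dataset,\n",
")\n",
"metrics = eval_trainer.evaluate()\n",
"# Perplexity is exp(cross-entropy loss); lower is better\n",
"print(f\"Eval loss: {metrics['eval_loss']:.3f}, perplexity: {math.exp(metrics['eval_loss']):.1f}\")"
]
},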
{
"cell_type": "markdown",
"metadata": {
"id": "b326BR8a_Enw"
},
"source": [
"Run the same instruction through the fine-tuned model so its output can be compared with the base model:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "bTP3iEVVYliA"
},
"outputs": [],
"source": [
"%%capture\n",
"# Helper function to enable model testing\n",
"def test_finetuned_model(model_path, tokenizer, args):\n",
"    \"\"\"\n",
"    Loads a fine-tuned model and generates text based on the input prompt.\n",
"    \"\"\"\n",
"    # Load the fine-tuned model\n",
"    model = AutoModelForCausalLM.from_pretrained(model_path)\n",
"    # Set up a text-generation pipeline\n",
"    pipe = pipeline(\n",
"        \"text-generation\",\n",
"        model=model,\n",
"        tokenizer=tokenizer,\n",
"    )\n",
"    # Generate text based on a prompt\n",
"    prompt = \"Does Lamini support generating code?\"\n",
"    output = pipe(prompt, **args)\n",
"    return output[0]['generated_text']\n",
"\n",
"# Setup\n",
"trained_model_path = f'{output_dir}/final'\n",
"generation_args = {\n",
"    \"max_length\": 50,\n",
"    \"truncation\": True,\n",
"}\n",
"# Test Fine-Tuned Model\n",
"tokenizer = AutoTokenizer.from_pretrained(trained_model_path)\n",
"fine_tuned_output = test_finetuned_model(trained_model_path, tokenizer, generation_args)"
]
},
{
"cell_type": "code",
"source": [
"print(\"Fine-tuned model output:\", fine_tuned_output)"
],
"metadata": {
"id": "OF_OSsHbd9oy"
},
"execution_count": null,
"outputs": []
},
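{
"cell_type": "markdown",
"metadata": {},
"source": [
"Finally (added sketch), print both completions together for a direct side-by-side comparison:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Side-by-side comparison of the two completions for the same prompt\n",
"print(f'Base ({base_model_name}):\\n{base_output}\\n')\n",
"print(f'Fine-tuned ({trained_model_path}):\\n{fine_tuned_output}')"
]
}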
],
"metadata": {
"accelerator": "GPU",
"colab": {
"gpuType": "T4",
"provenance": [],
"authorship_tag": "ABX9TyPg0akpeqBhjPRclaWNkyat",
"include_colab_link": true
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
},
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 0
}