base-vs-instruction-fine-tuned.ipynb
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/coldfire84/56d9f426d71cc6c6422b25309293eba7/base-vs-instruction-fine-tuned.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "GZnoKg_OA07d"
},
"source": [
"# Base Model vs Instruction-Tuned Comparison"
]
},
{
"cell_type": "markdown",
"source": [
"A larger-scale fine-tuning example, using Llama3, can be seen here: https://colab.research.google.com/drive/1BqXRVQlGvdR5DJvvos8xdAL_y_eLgkSd?usp=drive_open"
],
"metadata": {
"id": "I4_gaM6vlDj4"
}
},
{
"cell_type": "markdown",
"metadata": {
"id": "W4IGMoUS9G9c"
},
"source": [
"Install pre-requisites and login tio HuggingFace:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "jHhWq4yvmrhT"
},
"outputs": [],
"source": [
"%%capture\n",
"# Uncomment if using with GPU backend/ supported model\n",
"!pip install -U transformers accelerate\n",
"!pip install -U datasets"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "3xwda9g-o6BJ"
},
"source": [
"Prepare environment: imports and configuration:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "HMMUjp1b-40s"
},
"outputs": [],
"source": [
"%%capture\n",
"import torch\n",
"import datasets\n",
"import random\n",
"from transformers import pipeline, set_seed, AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, AutoModelForSequenceClassification\n",
"# Login to HF\n",
"from huggingface_hub import login\n",
"from google.colab import userdata\n",
"token = userdata.get('HF_TOKEN')\n",
"login(token)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "b3fbrr4R_D9i"
},
"source": [
"Prepare base `EleutherAI/pythia-70m` and stage initial instruction:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "W-3Ksdn7_Bfr"
},
"outputs": [],
"source": [
"%%capture\n",
"# Helper function to enable model testing\n",
"def testbasemodel(model_name, args):\n",
" model = AutoModelForCausalLM.from_pretrained(\n",
" model_name,\n",
" # On a Model and Environment that has a GPU...\n",
" #device_map=\"auto\",\n",
" #torch_dtype=\"auto\",\n",
" #trust_remote_code=True,\n",
" )\n",
" # Get appropriate tokenizer automatically, using HF AutoTokenizer\n",
" tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
" pipe = pipeline(\n",
" \"text-generation\",\n",
" model=model,\n",
" tokenizer=tokenizer,\n",
" )\n",
" output = pipe(\"Does Lamini support generating code?\", **args)\n",
" return output[0]['generated_text']\n",
"# Base Model\n",
"base_model_name=\"EleutherAI/pythia-70m\"\n",
"base_generation_args = {\n",
" \"max_length\": 50,\n",
" \"truncation\": True,\n",
"}\n",
"base_output = testbasemodel(base_model_name, base_generation_args)"
]
},
{
"cell_type": "markdown",
"source": [
"Review base model output:"
],
"metadata": {
"id": "_2zZH6dfTmQj"
}
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "cRyGpzktYoR7"
},
"outputs": [],
"source": [
"print(f'Base model output ({base_model_name}): {base_output} ')"
]
},
{
"cell_type": "markdown",
"source": [
"Prepare training dataset:"
],
"metadata": {
"id": "RdrTbR_wHpCR"
}
},
{
"cell_type": "code",
"source": [
"def get_tokenize_function(tokenizer, max_length):\n",
" \"\"\"\n",
" Returns a function to tokenize text data using provided tokenizer with specified maximum length.\n",
" This assumes NO prompt template requirement, you may need to adjust accordingly/ needed by your base model.\n",
" \"\"\"\n",
" def tokenize_function(examples):\n",
" # Determine which key(s) are present and should be concatenated\n",
" keys = ['question', 'answer', 'input', 'output', 'text']\n",
" texts = []\n",
" for idx in range(len(examples[next(iter(examples))])): # Loop through the batch\n",
" text = ''\n",
" for key in keys:\n",
" if key in examples:\n",
" text += examples[key][idx]\n",
" texts.append(text)\n",
" # Tokenize all texts in the batch with padding and truncation\n",
" tokenized_inputs = tokenizer(\n",
" texts,\n",
" max_length=max_length,\n",
" padding='max_length',\n",
" truncation=True,\n",
" return_tensors='np'\n",
" )\n",
" tokenized_inputs[\"labels\"] = tokenized_inputs[\"input_ids\"]\n",
" return tokenized_inputs\n",
"\n",
" return tokenize_function\n",
"\n",
"def load_and_tokenize_data(tokenizer, config):\n",
" random.seed(42) # Set a seed for reproducibility\n",
" # Load the dataset\n",
" if config[\"datasets\"][\"use_hf\"]:\n",
" dataset = datasets.load_dataset(config[\"datasets\"][\"path\"])\n",
" else:\n",
" # Load from local JSON file with a specified split if not using Hugging Face datasets\n",
" dataset = datasets.load_dataset(\"json\", data_files=config[\"datasets\"][\"path\"], split='train')\n",
" # Process each split available in the dataset\n",
" tokenize_func = get_tokenize_function(tokenizer, config[\"model\"]['max_length'])\n",
" for split in dataset.keys():\n",
" dataset[split] = dataset[split].map(tokenize_func, batched=True)\n",
" dataset[split].set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])\n",
" return dataset\n",
"\n",
"def decode_tokens(tokenizer, input_ids):\n",
" \"\"\"\n",
" Decodes token IDs to readable text using the provided tokenizer.\n",
" \"\"\"\n",
" return tokenizer.decode(input_ids, skip_special_tokens=True)\n",
"\n",
"def display_dataset_info(datasets):\n",
" for split_name, split_data in datasets.items():\n",
" print(f\"--- {split_name.upper()} SPLIT ---\")\n",
" print(f\"Number of examples: {len(split_data)}\")\n",
" print(\"Columns:\", split_data.column_names)\n",
" print(\"Features:\", split_data.features)\n",
" print(\"\\n\")\n",
"\n",
"def display_sample_data(datasets, tokenizer, num_samples=5):\n",
" for split_name, split_data in datasets.items():\n",
" print(f\"--- {split_name.upper()} SPLIT SAMPLES ---\")\n",
" sample_data = split_data.shuffle(seed=42).select(range(min(num_samples, len(split_data))))\n",
" for idx, example in enumerate(sample_data):\n",
" print(f\"Sample {idx+1}:\")\n",
" decoded_text = decode_tokens(tokenizer, example['input_ids'])\n",
" print(f\"Decoded Text: {decoded_text}\")\n",
" print({key: val for key, val in example.items() if key in ['input_ids', 'attention_mask', 'labels']})\n",
" print(\"\\n\")\n",
"\n",
"def check_dataset_shapes(dataset):\n",
" for example in dataset:\n",
" input_shape = example['input_ids'].shape\n",
" label_shape = example['labels'].shape\n",
" if input_shape != label_shape:\n",
" print(f\"Mismatch found! Input shape: {input_shape}, Label shape: {label_shape}\")\n",
"\n",
"\n",
"# Configuration settings\n",
"dataset_path = \"lamini/lamini_docs\"\n",
"use_hf = True\n",
"training_config = {\n",
" \"model\": {\n",
" \"pretrained_name\": base_model_name,\n",
" \"max_length\" : 2048\n",
" },\n",
" \"datasets\": {\n",
" \"use_hf\": use_hf,\n",
" \"path\": dataset_path\n",
" },\n",
" \"verbose\": True\n",
"}\n",
"# Load tokenizer\n",
"tokenizer = AutoTokenizer.from_pretrained(training_config['model']['pretrained_name'])\n",
"tokenizer.pad_token = tokenizer.eos_token\n",
"# Load, tokenize, and split dataset\n",
"training_datasets = load_and_tokenize_data(tokenizer, training_config)\n",
"train_dataset, test_dataset, validation_dataset = training_datasets.get('train', None), training_datasets.get('test', None), training_datasets.get('validation', None)\n",
"print(\"Datasets are loaded and tokenized.\")\n",
"\n",
"\n"
],
"metadata": {
"id": "iOjZcNT9xx__"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"Check sample data/ review dataset information:"
],
"metadata": {
"id": "PrRGfa2bPJRg"
}
},
{
"cell_type": "code",
"source": [
"check_dataset_shapes(train_dataset)\n",
"display_dataset_info(training_datasets)\n",
"display_sample_data(training_datasets, tokenizer, num_samples=3)"
],
"metadata": {
"id": "iStUR87yO9Gp"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"Train the base model:"
],
"metadata": {
"id": "0depRbjrTNVm"
}
},
{
"cell_type": "code",
"source": [
"def train(model, max_steps, output_dir):\n",
" # Setup training arguments\n",
" training_args = TrainingArguments(\n",
" learning_rate=1.0e-5,\n",
" num_train_epochs=1,\n",
" max_steps=max_steps,\n",
" per_device_train_batch_size=1,\n",
" output_dir=output_dir,\n",
" overwrite_output_dir=True,\n",
" disable_tqdm=False,\n",
" eval_steps=10,\n",
" save_steps=50,\n",
" warmup_steps=1,\n",
" per_device_eval_batch_size=1,\n",
" evaluation_strategy=\"steps\",\n",
" logging_strategy=\"steps\",\n",
" logging_steps=1,\n",
" optim=\"adafactor\",\n",
" gradient_accumulation_steps=4,\n",
" gradient_checkpointing=False,\n",
" load_best_model_at_end=True,\n",
" save_total_limit=1,\n",
" metric_for_best_model=\"eval_loss\",\n",
" greater_is_better=False\n",
" )\n",
"\n",
" # Could use SFTrainer from trl, requires rework of dataset and trainer args\n",
" # See: https://huggingface.co/docs/trl/sft_trainer\n",
" trainer = Trainer(\n",
" model=model,\n",
" args=training_args,\n",
" train_dataset=train_dataset,\n",
" eval_dataset=test_dataset\n",
" )\n",
"\n",
" # Start training\n",
" training_output = trainer.train()\n",
" save_dir = f'{output_dir}/final'\n",
" trainer.save_model(save_dir)\n",
" tokenizer.save_pretrained(save_dir)\n",
" print(\"Saved model to:\", save_dir)\n",
"\n",
"# Setup\n",
"model = AutoModelForCausalLM.from_pretrained(base_model_name)\n",
"max_steps = 100\n",
"output_dir = f\"{base_model_name}_lamini_{max_steps}_steps\"\n",
"# Train\n",
"train(model, max_steps, output_dir)"
],
"metadata": {
"id": "scdR2kTUTPrw"
},
"execution_count": null,
"outputs": []
},
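{
"cell_type": "markdown",
"metadata": {},
"source": [
"An illustrative `SFTTrainer` alternative, referenced in the comment in the training cell above. This is a sketch, not a drop-in replacement: `SFTTrainer` tokenizes the raw dataset itself (so it consumes the untokenized question/answer columns), `trl` is not installed by the earlier install cell, and the exact keyword arguments (`dataset_text_field`, `max_seq_length`, `tokenizer`) vary between `trl` releases."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative sketch only: assumes a trl 0.8-era SFTTrainer API where\n",
"# dataset_text_field / max_seq_length / tokenizer are passed directly;\n",
"# newer trl releases move these settings into SFTConfig\n",
"!pip install -U trl\n",
"from trl import SFTTrainer\n",
"# SFTTrainer tokenizes internally, so load the raw (untokenized) dataset\n",
"# and build a single 'text' column from the question/answer pairs\n",
"raw_dataset = datasets.load_dataset(dataset_path)\n",
"raw_dataset = raw_dataset.map(lambda ex: {\"text\": ex[\"question\"] + ex[\"answer\"]})\n",
"sft_trainer = SFTTrainer(\n",
" model=AutoModelForCausalLM.from_pretrained(base_model_name),\n",
" tokenizer=tokenizer,\n",
" train_dataset=raw_dataset[\"train\"],\n",
" eval_dataset=raw_dataset[\"test\"],\n",
" dataset_text_field=\"text\",\n",
" max_seq_length=512,\n",
" args=TrainingArguments(\n",
" output_dir=f\"{base_model_name}_lamini_sft\",\n",
" max_steps=100,\n",
" per_device_train_batch_size=1,\n",
" learning_rate=1.0e-5\n",
" )\n",
")\n",
"# sft_trainer.train() # uncomment to run the alternative fine-tune"
]
},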
{
"cell_type": "markdown",
"metadata": {
"id": "b326BR8a_Enw"
},
"source": [
"Compare instruction output against the fine-tuned model:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "bTP3iEVVYliA"
},
"outputs": [],
"source": [
"%%capture\n",
"# Helper function to enable model testing\n",
"def test_finetuned_model(model_path, tokenizer, args):\n",
" \"\"\"\n",
" Loads a fine-tuned model and generates text based on the input prompt.\n",
" \"\"\"\n",
" # Load the fine-tuned model\n",
" model = AutoModelForCausalLM.from_pretrained(model_path)\n",
" # Set up a text-generation pipeline\n",
" pipe = pipeline(\n",
" \"text-generation\",\n",
" model=model,\n",
" tokenizer=tokenizer,\n",
" )\n",
" # Generate text based on a prompt\n",
" prompt = \"Does Lamini support generating code?\"\n",
" output = pipe(prompt, **args)\n",
" return output[0]['generated_text']\n",
"\n",
"# Setup\n",
"trained_model_path = f'{output_dir}/final'\n",
"generation_args = {\n",
" \"max_length\": 50,\n",
" \"truncation\": True,\n",
"}\n",
"# Test Fine-Tuned Model\n",
"tokenizer = AutoTokenizer.from_pretrained(trained_model_path)\n",
"fine_tuned_output = test_finetuned_model(trained_model_path, tokenizer, generation_args)"
]
},
{
"cell_type": "code",
"source": [
"print(\"Fine-tuned model output:\", fine_tuned_output)"
],
"metadata": {
"id": "OF_OSsHbd9oy"
},
"execution_count": null,
"outputs": []
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"gpuType": "T4",
"provenance": [],
"authorship_tag": "ABX9TyPg0akpeqBhjPRclaWNkyat",
"include_colab_link": true
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
},
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 0
}