base-vs-instruction-fine-tuned.ipynb
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/coldfire84/56d9f426d71cc6c6422b25309293eba7/base-vs-instruction-fine-tuned.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "GZnoKg_OA07d"
},
"source": [
"# Base Model vs Instruction-Tuned Comparison"
]
},
{
"cell_type": "markdown",
"source": [
"A larger-scale fine-tuning example, using Llama3, can be seen here: https://colab.research.google.com/drive/1BqXRVQlGvdR5DJvvos8xdAL_y_eLgkSd?usp=drive_open"
],
"metadata": {
"id": "I4_gaM6vlDj4"
}
},
{
"cell_type": "markdown",
"metadata": {
"id": "W4IGMoUS9G9c"
},
"source": [
"Install prerequisites and log in to Hugging Face:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "jHhWq4yvmrhT"
},
"outputs": [],
"source": [
"%%capture\n",
"# Upgrade libraries (needed when using a GPU backend/supported model)\n",
"!pip install -U transformers accelerate\n",
"!pip install -U datasets"
]
},
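{
"cell_type": "markdown",
"metadata": {},
"source": [
"Optional sanity check (added sketch, not part of the original flow): confirm whether a GPU runtime is actually available, since everything below silently falls back to CPU otherwise:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"# True on a Colab GPU runtime (e.g. T4); False means everything runs on CPU\n",
"print(f\"CUDA available: {torch.cuda.is_available()}\")"
]
},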
{
"cell_type": "markdown",
"metadata": {
"id": "3xwda9g-o6BJ"
},
"source": [
"Prepare the environment (imports and configuration):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "HMMUjp1b-40s"
},
"outputs": [],
"source": [
"%%capture\n",
"import torch\n",
"import datasets\n",
"import random\n",
"from transformers import pipeline, set_seed, AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, AutoModelForSequenceClassification\n",
"# Log in to Hugging Face using the HF_TOKEN secret stored in Colab\n",
"from huggingface_hub import login\n",
"from google.colab import userdata\n",
"token = userdata.get('HF_TOKEN')\n",
"login(token)"
]
},
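{
"cell_type": "markdown",
"metadata": {},
"source": [
"Note (added sketch): `google.colab.userdata` only exists on Colab. If you run this notebook elsewhere, one illustrative alternative is to read the token interactively:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative alternative for non-Colab environments: prompt for the token\n",
"# interactively instead of reading it from Colab's userdata store\n",
"from getpass import getpass\n",
"from huggingface_hub import login\n",
"login(getpass('Hugging Face token: '))"
]
},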
{
"cell_type": "markdown",
"metadata": {
"id": "b3fbrr4R_D9i"
},
"source": [
"Load the base `EleutherAI/pythia-70m` model and run an initial instruction-style prompt:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "W-3Ksdn7_Bfr"
},
"outputs": [],
"source": [
"%%capture\n",
"# Helper function to enable model testing\n",
"def testbasemodel(model_name, args):\n",
"    model = AutoModelForCausalLM.from_pretrained(\n",
"        model_name,\n",
"        # On a model/environment with a GPU, uncomment the options below:\n",
"        #device_map=\"auto\",\n",
"        #torch_dtype=\"auto\",\n",
"        #trust_remote_code=True,\n",
"    )\n",
"    # Get the appropriate tokenizer automatically, using HF AutoTokenizer\n",
"    tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
"    pipe = pipeline(\n",
"        \"text-generation\",\n",
"        model=model,\n",
"        tokenizer=tokenizer,\n",
"    )\n",
"    output = pipe(\"Does Lamini support generating code?\", **args)\n",
"    return output[0]['generated_text']\n",
"# Base Model\n",
"base_model_name = \"EleutherAI/pythia-70m\"\n",
"base_generation_args = {\n",
"    \"max_length\": 50,\n",
"    \"truncation\": True,\n",
"}\n",
"base_output = testbasemodel(base_model_name, base_generation_args)"
]
},
{
"cell_type": "markdown",
"source": [
"Review base model output:"
],
"metadata": {
"id": "_2zZH6dfTmQj"
}
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "cRyGpzktYoR7"
},
"outputs": [],
"source": [
"print(f'Base model output ({base_model_name}): {base_output}')"
]
},
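{
"cell_type": "markdown",
"metadata": {},
"source": [
"Optional (added sketch): `testbasemodel` decodes greedily by default. Rerunning it with sampling enabled shows how decoding settings alone change the raw completion; the values below are illustrative:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Rerun the helper with sampling enabled (illustrative decoding settings)\n",
"sampled_args = {\n",
"    \"max_length\": 50,\n",
"    \"truncation\": True,\n",
"    \"do_sample\": True,\n",
"    \"temperature\": 0.7,\n",
"    \"top_p\": 0.9,\n",
"}\n",
"print(testbasemodel(base_model_name, sampled_args))"
]
},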
{
"cell_type": "markdown",
"source": [
"Prepare the training dataset:"
],
"metadata": {
"id": "RdrTbR_wHpCR"
}
},
{
"cell_type": "code",
"source": [
"def get_tokenize_function(tokenizer, max_length):\n",
"    \"\"\"\n",
"    Returns a function to tokenize text data using the provided tokenizer with the specified maximum length.\n",
"    This assumes NO prompt template is required; adjust as needed for your base model.\n",
"    \"\"\"\n",
"    def tokenize_function(examples):\n",
"        # Determine which key(s) are present and should be concatenated\n",
"        keys = ['question', 'answer', 'input', 'output', 'text']\n",
"        texts = []\n",
"        for idx in range(len(examples[next(iter(examples))])):  # Loop through the batch\n",
"            text = ''\n",
"            for key in keys:\n",
"                if key in examples:\n",
"                    text += examples[key][idx]\n",
"            texts.append(text)\n",
"        # Tokenize all texts in the batch with padding and truncation\n",
"        tokenized_inputs = tokenizer(\n",
"            texts,\n",
"            max_length=max_length,\n",
"            padding='max_length',\n",
"            truncation=True,\n",
"            return_tensors='np'\n",
"        )\n",
"        # Causal LM objective: labels are a copy of the input IDs\n",
"        tokenized_inputs[\"labels\"] = tokenized_inputs[\"input_ids\"]\n",
"        return tokenized_inputs\n",
"\n",
"    return tokenize_function\n",
"\n",
"def load_and_tokenize_data(tokenizer, config):\n",
"    random.seed(42)  # Set a seed for reproducibility\n",
"    # Load the dataset\n",
"    if config[\"datasets\"][\"use_hf\"]:\n",
"        dataset = datasets.load_dataset(config[\"datasets\"][\"path\"])\n",
"    else:\n",
"        # Load from a local JSON file; without a split argument this returns a\n",
"        # DatasetDict (with a 'train' split), so the loop below works for both branches\n",
"        dataset = datasets.load_dataset(\"json\", data_files=config[\"datasets\"][\"path\"])\n",
"    # Process each split available in the dataset\n",
"    tokenize_func = get_tokenize_function(tokenizer, config[\"model\"]['max_length'])\n",
"    for split in dataset.keys():\n",
"        dataset[split] = dataset[split].map(tokenize_func, batched=True)\n",
"        dataset[split].set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])\n",
"    return dataset\n",
"\n",
"def decode_tokens(tokenizer, input_ids):\n",
"    \"\"\"\n",
"    Decodes token IDs to readable text using the provided tokenizer.\n",
"    \"\"\"\n",
"    return tokenizer.decode(input_ids, skip_special_tokens=True)\n",
"\n",
"def display_dataset_info(datasets):\n",
"    for split_name, split_data in datasets.items():\n",
"        print(f\"--- {split_name.upper()} SPLIT ---\")\n",
"        print(f\"Number of examples: {len(split_data)}\")\n",
"        print(\"Columns:\", split_data.column_names)\n",
"        print(\"Features:\", split_data.features)\n",
"        print(\"\\n\")\n",
"\n",
"def display_sample_data(datasets, tokenizer, num_samples=5):\n",
"    for split_name, split_data in datasets.items():\n",
"        print(f\"--- {split_name.upper()} SPLIT SAMPLES ---\")\n",
"        sample_data = split_data.shuffle(seed=42).select(range(min(num_samples, len(split_data))))\n",
"        for idx, example in enumerate(sample_data):\n",
"            print(f\"Sample {idx+1}:\")\n",
"            decoded_text = decode_tokens(tokenizer, example['input_ids'])\n",
"            print(f\"Decoded Text: {decoded_text}\")\n",
"            print({key: val for key, val in example.items() if key in ['input_ids', 'attention_mask', 'labels']})\n",
"            print(\"\\n\")\n",
"\n",
"def check_dataset_shapes(dataset):\n",
"    for example in dataset:\n",
"        input_shape = example['input_ids'].shape\n",
"        label_shape = example['labels'].shape\n",
"        if input_shape != label_shape:\n",
"            print(f\"Mismatch found! Input shape: {input_shape}, Label shape: {label_shape}\")\n",
"\n",
"# Configuration settings\n",
"dataset_path = \"lamini/lamini_docs\"\n",
"use_hf = True\n",
"training_config = {\n",
"    \"model\": {\n",
"        \"pretrained_name\": base_model_name,\n",
"        \"max_length\": 2048\n",
"    },\n",
"    \"datasets\": {\n",
"        \"use_hf\": use_hf,\n",
"        \"path\": dataset_path\n",
"    },\n",
"    \"verbose\": True\n",
"}\n",
"# Load tokenizer and reuse the EOS token for padding\n",
"tokenizer = AutoTokenizer.from_pretrained(training_config['model']['pretrained_name'])\n",
"tokenizer.pad_token = tokenizer.eos_token\n",
"# Load, tokenize, and split the dataset\n",
"training_datasets = load_and_tokenize_data(tokenizer, training_config)\n",
"train_dataset = training_datasets.get('train', None)\n",
"test_dataset = training_datasets.get('test', None)\n",
"validation_dataset = training_datasets.get('validation', None)\n",
"print(\"Datasets are loaded and tokenized.\")"
],
"metadata": {
"id": "iOjZcNT9xx__"
},
"execution_count": null,
"outputs": []
},
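{
"cell_type": "markdown",
"metadata": {},
"source": [
"Aside (added sketch): `tokenize_function` copies `input_ids` straight into `labels`, so the loss is also computed on padding tokens. A common refinement is to mask padded positions with `-100`, which the cross-entropy loss ignores. `mask_pad_labels` below is a hypothetical helper, not applied in this notebook:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"\n",
"def mask_pad_labels(tokenized_inputs):\n",
"    # Copy input_ids into labels, then set padded positions (attention_mask == 0)\n",
"    # to -100 so the cross-entropy loss ignores them during training\n",
"    labels = np.array(tokenized_inputs[\"input_ids\"]).copy()\n",
"    labels[np.array(tokenized_inputs[\"attention_mask\"]) == 0] = -100\n",
"    tokenized_inputs[\"labels\"] = labels\n",
"    return tokenized_inputs"
]
},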
{
"cell_type": "markdown",
"source": [
"Check dataset shapes, review dataset information, and inspect sample data:"
],
"metadata": {
"id": "PrRGfa2bPJRg"
}
},
{
"cell_type": "code",
"source": [
"check_dataset_shapes(train_dataset)\n",
"display_dataset_info(training_datasets)\n",
"display_sample_data(training_datasets, tokenizer, num_samples=3)"
],
"metadata": {
"id": "iStUR87yO9Gp"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"Train the base model:"
],
"metadata": {
"id": "0depRbjrTNVm"
}
},
{
"cell_type": "code",
"source": [
"def train(model, max_steps, output_dir):\n",
"    # Setup training arguments\n",
"    training_args = TrainingArguments(\n",
"        learning_rate=1.0e-5,\n",
"        num_train_epochs=1,\n",
"        max_steps=max_steps,\n",
"        per_device_train_batch_size=1,\n",
"        output_dir=output_dir,\n",
"        overwrite_output_dir=True,\n",
"        disable_tqdm=False,\n",
"        eval_steps=10,\n",
"        save_steps=50,\n",
"        warmup_steps=1,\n",
"        per_device_eval_batch_size=1,\n",
"        evaluation_strategy=\"steps\",\n",
"        logging_strategy=\"steps\",\n",
"        logging_steps=1,\n",
"        optim=\"adafactor\",\n",
"        gradient_accumulation_steps=4,\n",
"        gradient_checkpointing=False,\n",
"        load_best_model_at_end=True,\n",
"        save_total_limit=1,\n",
"        metric_for_best_model=\"eval_loss\",\n",
"        greater_is_better=False\n",
"    )\n",
"\n",
"    # Could use SFTTrainer from trl; requires rework of dataset and trainer args\n",
"    # See: https://huggingface.co/docs/trl/sft_trainer\n",
"    trainer = Trainer(\n",
"        model=model,\n",
"        args=training_args,\n",
"        train_dataset=train_dataset,\n",
"        eval_dataset=test_dataset\n",
"    )\n",
"\n",
"    # Start training\n",
"    training_output = trainer.train()\n",
"    save_dir = f'{output_dir}/final'\n",
"    trainer.save_model(save_dir)\n",
"    tokenizer.save_pretrained(save_dir)\n",
"    print(\"Saved model to:\", save_dir)\n",
"\n",
"# Setup\n",
"model = AutoModelForCausalLM.from_pretrained(base_model_name)\n",
"max_steps = 100\n",
"output_dir = f\"{base_model_name}_lamini_{max_steps}_steps\"\n",
"# Train\n",
"train(model, max_steps, output_dir)"
],
"metadata": {
"id": "scdR2kTUTPrw"
},
"execution_count": null,
"outputs": []
},
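{
"cell_type": "markdown",
"metadata": {},
"source": [
"Optional (added sketch): rather than eyeballing generations, you can quantify the effect of fine-tuning by computing the evaluation loss (and perplexity) of the saved model on the test split; variable names follow those defined above:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import math\n",
"# Reload the fine-tuned weights and measure loss on the held-out test split\n",
"eval_model = AutoModelForCausalLM.from_pretrained(f'{output_dir}/final')\n",
"eval_trainer = Trainer(\n",
"    model=eval_model,\n",
"    args=TrainingArguments(output_dir='eval_tmp', per_device_eval_batch_size=1, report_to='none'),\n",
"    eval_dataset=test_dataset,\n",
")\n",
"metrics = eval_trainer.evaluate()\n",
"# Perplexity is exp(cross-entropy loss); lower is better\n",
"print(f\"Eval loss: {metrics['eval_loss']:.3f}, perplexity: {math.exp(metrics['eval_loss']):.1f}\")"
]
},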
{
"cell_type": "markdown",
"metadata": {
"id": "b326BR8a_Enw"
},
"source": [
"Run the same instruction through the fine-tuned model so its output can be compared with the base model:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "bTP3iEVVYliA"
},
"outputs": [],
"source": [
"%%capture\n",
"# Helper function to enable model testing\n",
"def test_finetuned_model(model_path, tokenizer, args):\n",
"    \"\"\"\n",
"    Loads a fine-tuned model and generates text based on the input prompt.\n",
"    \"\"\"\n",
"    # Load the fine-tuned model\n",
"    model = AutoModelForCausalLM.from_pretrained(model_path)\n",
"    # Set up a text-generation pipeline\n",
"    pipe = pipeline(\n",
"        \"text-generation\",\n",
"        model=model,\n",
"        tokenizer=tokenizer,\n",
"    )\n",
"    # Generate text based on a prompt\n",
"    prompt = \"Does Lamini support generating code?\"\n",
"    output = pipe(prompt, **args)\n",
"    return output[0]['generated_text']\n",
"\n",
"# Setup\n",
"trained_model_path = f'{output_dir}/final'\n",
"generation_args = {\n",
"    \"max_length\": 50,\n",
"    \"truncation\": True,\n",
"}\n",
"# Test Fine-Tuned Model\n",
"tokenizer = AutoTokenizer.from_pretrained(trained_model_path)\n",
"fine_tuned_output = test_finetuned_model(trained_model_path, tokenizer, generation_args)"
]
},
{
"cell_type": "code",
"source": [
"print(\"Fine-tuned model output:\", fine_tuned_output)"
],
"metadata": {
"id": "OF_OSsHbd9oy"
},
"execution_count": null,
"outputs": []
},
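{
"cell_type": "markdown",
"metadata": {},
"source": [
"Finally (added sketch), print both completions together for a direct side-by-side comparison:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Side-by-side comparison of the two completions for the same prompt\n",
"print(f'Base ({base_model_name}):\\n{base_output}\\n')\n",
"print(f'Fine-tuned ({trained_model_path}):\\n{fine_tuned_output}')"
]
}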
],
"metadata": {
"accelerator": "GPU",
"colab": {
"gpuType": "T4",
"provenance": [],
"authorship_tag": "ABX9TyPg0akpeqBhjPRclaWNkyat",
"include_colab_link": true
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
},
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 0
}