Last active
January 10, 2024 14:05
-
-
Save vic4key/062cbba779e64f60e6cd8c9dbdf49029 to your computer and use it in GitHub Desktop.
openai-fine-tuning-estimate-cost.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Estimate the cost of fine-tuning GPT models
# Common | |
''' requirements.txt | |
httpx | |
numpy | |
openai | |
tiktoken | |
urllib3 | |
''' | |
import json | |
import tiktoken # for token counting | |
import numpy as np | |
from collections import defaultdict | |
def print_header(title: str):
    """Print a section banner for *title*.

    The title is wrapped as `` [title] `` and centered in an 80-column
    field padded with ``*``. A newline string is passed to ``print`` before
    and after the banner so the section is visually separated from the
    surrounding output.
    """
    banner = f" [{title}] ".center(80, "*")
    print("\n", banner, "\n")
# Arguments
# Model name, training price and maximum tokens per training example.
ft_model_name, ft_model_price, ft_model_max_token_limit = ("gpt-3.5-turbo", 0.0080, 4096) # https://openai.com/pricing#fine-tuning-models
ft_training_file = "gpt_3_5_turbo_finetuning.jsonl"
ft_num_epochs = 3

# Data loading
print_header("Data Loading")

data_path = RF"data/{ft_training_file}"

# Load the dataset: one JSON document per line (JSONL).
# Blank / whitespace-only lines are skipped so that stray empty lines in the
# file do not abort the run with a JSONDecodeError.
with open(data_path, "r", encoding="utf-8") as f:
    dataset = [json.loads(line) for line in f if line.strip()]

# Nothing below is meaningful without at least one example, so stop with a
# clear message instead of failing later with an IndexError.
if not dataset:
    raise SystemExit(f"No examples found in '{data_path}'.")

# Initial dataset stats
print("Num examples:", len(dataset))
print("First example:")
first_example = dataset[0]
first_messages = first_example.get("messages") if isinstance(first_example, dict) else None
if isinstance(first_messages, list):
    for message in first_messages:
        print(message)
else:
    # The first record is malformed; show it as-is and let the format
    # validation below report what is wrong with it.
    print(first_example)

# Format validation
print_header("Format Validation")

# Format error checks: count each kind of problem across the whole dataset.
format_errors = defaultdict(int)

for ex in dataset:
    # Every example must be a JSON object.
    if not isinstance(ex, dict):
        format_errors["data_type"] += 1
        continue

    # Every example must carry a non-empty "messages" entry.
    messages = ex.get("messages", None)
    if not messages:
        format_errors["missing_messages_list"] += 1
        continue

    for message in messages:
        if "role" not in message or "content" not in message:
            format_errors["message_missing_key"] += 1

        if any(k not in ("role", "content", "name", "function_call") for k in message):
            format_errors["message_unrecognized_key"] += 1

        if message.get("role", None) not in ("system", "user", "assistant", "function"):
            format_errors["unrecognized_role"] += 1

        content = message.get("content", None)
        function_call = message.get("function_call", None)

        # Flagged when both content and function_call are empty, or when
        # content is not a string.
        if (not content and not function_call) or not isinstance(content, str):
            format_errors["missing_content"] += 1

    # Each example needs at least one assistant message.
    if not any(message.get("role", None) == "assistant" for message in messages):
        format_errors["example_missing_assistant_message"] += 1

if format_errors:
    print("Found errors:")
    for k, v in format_errors.items():
        print(f"{k}: {v}")
else:
    print("No errors found")

# Token Counting Utilities
# print_header("Token Counting Utilities")

# Tokenizer shared by the token-counting helpers below.
encoding = tiktoken.get_encoding("cl100k_base")

# not exact!
# simplified from https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb
def num_tokens_from_messages(messages, tokens_per_message=3, tokens_per_name=1):
    """Return an approximate token count for a list of chat messages.

    Args:
        messages: iterable of message dicts; every value in each dict is
            passed to ``encoding.encode`` and its token count is added.
        tokens_per_message: fixed overhead added once per message.
        tokens_per_name: extra overhead added when a message has a "name" key.

    Returns:
        The summed token count, plus a constant 3 added once per call.
    """
    # Constant overhead added once for the whole conversation.
    total = 3
    for msg in messages:
        total += tokens_per_message
        # Every field value (role, content, name, ...) is tokenized and counted.
        total += sum(len(encoding.encode(text)) for text in msg.values())
        if "name" in msg:
            total += tokens_per_name
    return total
def num_assistant_tokens_from_messages(messages):
    """Return the number of tokens in the content of assistant messages.

    Only messages whose "role" is "assistant" contribute; their "content"
    is tokenized with the module-level ``encoding`` and the lengths summed.
    """
    return sum(
        len(encoding.encode(msg["content"]))
        for msg in messages
        if msg["role"] == "assistant"
    )
def print_distribution(values, name):
    """Print summary statistics for a sequence of numbers.

    Prints a heading containing *name*, then min/max, mean/median and the
    5th/95th percentiles of *values*.

    Args:
        values: sequence of numbers (may be empty).
        name: label shown in the heading.
    """
    print(f"\n#### Distribution of {name}:")
    # min()/max() raise on an empty sequence, so report and bail out early.
    if len(values) == 0:
        print("no values")
        return
    print(f"min / max: {min(values)}, {max(values)}")
    print(f"mean / median: {np.mean(values)}, {np.median(values)}")
    # The label advertises p5 / p95, so use the 0.05 and 0.95 quantiles
    # (previously 0.1 / 0.9, which are the 10th / 90th percentiles).
    print(f"p5 / p95: {np.quantile(values, 0.05)}, {np.quantile(values, 0.95)}")
# Data Warnings and Token Counts
print_header("Data Warnings and Token Counts")

# Warnings and tokens counts, collected per example.
n_missing_system = 0
n_missing_user = 0
n_messages = []
convo_lens = []
assistant_message_lens = []

for ex in dataset:
    messages = ex["messages"]
    if not any(message["role"] == "system" for message in messages):
        n_missing_system += 1
    if not any(message["role"] == "user" for message in messages):
        n_missing_user += 1
    n_messages.append(len(messages))
    convo_lens.append(num_tokens_from_messages(messages))
    assistant_message_lens.append(num_assistant_tokens_from_messages(messages))

print("Num examples missing system message:", n_missing_system)
print("Num examples missing user message:", n_missing_user)
print_distribution(n_messages, "num_messages_per_example")
print_distribution(convo_lens, "num_total_tokens_per_example")
print_distribution(assistant_message_lens, "num_assistant_tokens_per_example")
n_too_long = sum(length > ft_model_max_token_limit for length in convo_lens)
print(f"\n{n_too_long} examples may be over the {ft_model_max_token_limit} token limit, they will be truncated during fine-tuning")

# Cost Estimation
print_header("Cost Estimation")

# Pricing and default n_epochs estimate
MAX_TOKENS_PER_EXAMPLE = ft_model_max_token_limit

MIN_TARGET_EXAMPLES = 1 # 100 # TODO: Why 100?
MAX_TARGET_EXAMPLES = 10**6 # 25000 # TODO: Why 25000?
MIN_DEFAULT_EPOCHS = 1
MAX_DEFAULT_EPOCHS = 25 # TODO: Why 25?
TARGET_EPOCHS = ft_num_epochs

# Start from the requested epoch count and adjust it only when
# (examples * epochs) falls outside the [MIN, MAX]_TARGET_EXAMPLES window.
n_epochs = TARGET_EPOCHS
n_train_examples = len(dataset)
if n_train_examples * TARGET_EPOCHS < MIN_TARGET_EXAMPLES:
    n_epochs = min(MAX_DEFAULT_EPOCHS, MIN_TARGET_EXAMPLES // n_train_examples)
elif n_train_examples * TARGET_EPOCHS > MAX_TARGET_EXAMPLES:
    n_epochs = max(MIN_DEFAULT_EPOCHS, MAX_TARGET_EXAMPLES // n_train_examples)

# Tokens beyond the per-example limit are not counted, so cap each example.
n_billing_tokens_in_dataset = sum(min(MAX_TOKENS_PER_EXAMPLE, length) for length in convo_lens)
n_total_billing_tokens = n_epochs * n_billing_tokens_in_dataset

# ft_model_price (0.0080 for gpt-3.5-turbo) is the price in USD per 1,000
# training tokens as listed on the OpenAI pricing page, not per token, so the
# token count has to be scaled down before multiplying.
TOKENS_PER_PRICE_UNIT = 1000
estimated_cost = n_total_billing_tokens / TOKENS_PER_PRICE_UNIT * ft_model_price

print(f"Dataset has ~{n_billing_tokens_in_dataset} tokens that will be charged for during training")
print(f"By default, you'll train for {n_epochs} epochs on this dataset")
print(f"By default, you'll be charged for ~{n_total_billing_tokens} tokens")
print()
print(f"Fine-tuning the model '{ft_model_name}' with the training file '{ft_training_file}' ~{n_billing_tokens_in_dataset} tokens by {n_epochs} epochs.")
print(f"Estimated amount of the cost ≈ ${estimated_cost:.4f} USD.")

'''
(openai-env) openai-env>python fine-tuning-estimate-cost.py

******************************** [Data Loading] ********************************

Num examples: 3
First example:
{'role': 'system', 'content': 'You are a helpful assistant.'}
{'role': 'user', 'content': '...'}
{'role': 'assistant', 'content': '...'}

***************************** [Format Validation] ******************************

No errors found

*********************** [Data Warnings and Token Counts] ***********************

Num examples missing system message: 0
Num examples missing user message: 0

#### Distribution of num_messages_per_example:
min / max: 3, 3
mean / median: 3.0, 3.0
p5 / p95: 3.0, 3.0

#### Distribution of num_total_tokens_per_example:
min / max: 35, 42
mean / median: 39.333333333333336, 41.0
p5 / p95: 36.2, 41.8

#### Distribution of num_assistant_tokens_per_example:
min / max: 9, 15
mean / median: 12.666666666666666, 14.0
p5 / p95: 10.0, 14.8

0 examples may be over the 4096 token limit, they will be truncated during fine-tuning

****************************** [Cost Estimation] *******************************

Dataset has ~118 tokens that will be charged for during training
By default, you'll train for 3 epochs on this dataset
By default, you'll be charged for ~354 tokens

Fine-tuning the model 'gpt-3.5-turbo' with the training file 'gpt_3_5_turbo_finetuning.jsonl' ~118 tokens by 3 epochs.
Estimated amount of the cost ≈ $0.0028 USD.
'''
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment