Faster version of https://github.com/togethercomputer/finetuning/blob/main/3-eval.py
# PART 3 – an evaluation script to test the accuracy of the fine-tuned model vs the base model and a top open-source model.
# We will use Dataformer for making API requests to Together AI. It respects rate limits and supports caching, which saves both time and money.
# pip install dataformer@git+https://github.com/DataformerAI/dataformer.git
import json
import os
import time

from dataformer.llms import AsyncLLM
from dotenv import load_dotenv

load_dotenv()

start_time = time.time()

api_key = os.environ.get("TOGETHER_API_KEY")
max_requests_per_minute = 300

base_model = "meta-llama/Llama-3-8b-chat-hf"
top_oss_model = "meta-llama/Llama-3-70b-chat-hf"
finetuned_model = "YOUR_FINETUNED_MODEL_ID"

llm = AsyncLLM(api_provider="together", max_requests_per_minute=max_requests_per_minute, api_key=api_key)

evaluator_model = "meta-llama/Llama-3-70b-chat-hf"
eval_dataset = "EvalDataset-100.json"
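# The eval dataset is expected to be a JSON list of objects, each with an
# "instruction" (the prompt sent to the models) and an "output" (the
# ground-truth answer), matching the fields read below.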
# 1. Get all responses for the eval dataset
with open(eval_dataset, "r", encoding="utf-8") as eval_file:
    eval_data = json.load(eval_file)

results = []


def initial_request_list(model, eval_data):
    return [
        {"model": model, "messages": [{"role": "user", "content": example["instruction"]}], "max_tokens": 1500}
        for example in eval_data
    ]


request_list = (
    initial_request_list(base_model, eval_data)
    + initial_request_list(top_oss_model, eval_data)
    + initial_request_list(finetuned_model, eval_data)
)
completions = llm.generate(request_list)
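# Each item returned by llm.generate() is assumed to pair the original
# request (index 0) with the raw API response (index 1); that is the layout
# the indexing below relies on.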
# Initialize empty lists for each model's completions
baseModelCompletions = []
topOSSModelCompletions = []
finetunedModelCompletions = []

# Iterate over completions and categorize them based on the model name
for completion in completions:
    model_name = completion[0]["model"]
    message_content = completion[1]["choices"][0]["message"]["content"]
    if model_name == base_model:
        baseModelCompletions.append(message_content)
    elif model_name == top_oss_model:
        topOSSModelCompletions.append(message_content)
    elif model_name == finetuned_model:
        finetunedModelCompletions.append(message_content)
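# Note: pairing these per-model lists with eval_data by index below assumes
# llm.generate() preserves the order of the requests within each model.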
for idx, example in enumerate(eval_data):
    results.append(
        {
            "groundTruthAnswer": example["output"],
            "baseModelAnswer": baseModelCompletions[idx],
            "topOSSModelAnswer": topOSSModelCompletions[idx],
            "fineTunedModelAnswer": finetunedModelCompletions[idx],
        }
    )
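# results now holds, for each eval example, the ground-truth answer alongside
# one answer from each of the three models; it is written to results.json
# below so the raw answers can be inspected by hand.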
# 2. Send the responses from all three models to Llama-3-70B to grade them on accuracy
with open("results.json", "w", encoding="utf-8") as results_file:
    json.dump(results, results_file, indent=4)

baseModelCount = 0
topOSSModelCount = 0
fineTunedModelCount = 0
badResponses = 0
numErrors = 0
# Function to prepare the request list
def prepare_request_list(model_answer_key, evaluator_model):
    return [
        {
            "model": evaluator_model,
            "metadata": {"mak": model_answer_key},
            "messages": [
                {
                    "role": "system",
                    "content": "You will be given a ground truth answer and a model answer. Please output ACCURATE if the model answer matches the ground truth answer or INACCURATE otherwise. Please only return ACCURATE or INACCURATE. It is very important for my job that you do this.",
                },
                {
                    "role": "user",
                    "content": f"""
                    <GroundTruthAnswer>
                    {result["groundTruthAnswer"]}
                    </GroundTruthAnswer>

                    <ModelAnswer>
                    {result[model_answer_key]}
                    </ModelAnswer>
                    """,
                },
            ],
        }
        for result in results
    ]
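# Each grading request carries a "metadata" entry ({"mak": ...}) naming which
# model's answer is being judged, so the evaluations can be routed back to
# the right model after the batched llm.generate() call.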
# Function to count the results
def count_results(evaluations):
    global badResponses
    count = 0
    for evaluation in evaluations:
        eval_answer = evaluation[1]["choices"][0]["message"]["content"]
        if eval_answer == "ACCURATE":
            count += 1
        elif eval_answer != "INACCURATE":
            badResponses += 1
    return count
request_list = (
    prepare_request_list("baseModelAnswer", evaluator_model)
    + prepare_request_list("topOSSModelAnswer", evaluator_model)
    + prepare_request_list("fineTunedModelAnswer", evaluator_model)
)
evaluations = llm.generate(request_list)

base_model_evaluations = []
top_oss_model_evaluations = []
finetuned_model_evaluations = []
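# The request metadata is assumed to come back as the last element of each
# evaluation item, so evaluation[-1]["mak"] recovers which model's answer
# was graded.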
for evaluation in evaluations:
    mak = evaluation[-1]["mak"]
    if mak == "baseModelAnswer":
        base_model_evaluations.append(evaluation)
    elif mak == "topOSSModelAnswer":
        top_oss_model_evaluations.append(evaluation)
    elif mak == "fineTunedModelAnswer":
        finetuned_model_evaluations.append(evaluation)
baseModelCount = count_results(base_model_evaluations)
topOSSModelCount = count_results(top_oss_model_evaluations)
fineTunedModelCount = count_results(finetuned_model_evaluations)

print("Base model count: ", baseModelCount)
print("Top OSS model count: ", topOSSModelCount)
print("Fine-tuned model count: ", fineTunedModelCount)
print("Bad responses count: ", badResponses)
print("Number of errors: ", numErrors)

print("\n=== Results – accuracy (%) ===\n")
print("Base model (Llama-3-8b): ", f"{baseModelCount / len(results) * 100}%")
print("Top OSS model (Llama-3-70b): ", f"{topOSSModelCount / len(results) * 100}%")
print("Fine-tuned model: ", f"{fineTunedModelCount / len(results) * 100}%")

end_time = time.time()  # Stop the timer
total_time = end_time - start_time  # Calculate total time
print(f"\nTotal execution time: {total_time} seconds")