# PART 3 – an evaluation script to test the accuracy of the fine-tuned model vs. the base model.
# We will use Dataformer for making API requests to Together AI. It respects rate limits and supports caching, which speeds things up and saves money.
# pip install dataformer@git+https://github.com/DataformerAI/dataformer.git
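# The Together AI API key is read from the environment via python-dotenv.
# A minimal .env file placed next to this script would look like this
# (illustrative value; the variable name matches the os.environ.get call below):
#
#   TOGETHER_API_KEY=your-together-api-key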
import json
import os
import time

from dataformer.llms import AsyncLLM
from dotenv import load_dotenv

load_dotenv()

start_time = time.time()
api_key = os.environ.get("TOGETHER_API_KEY")
max_requests_per_minute = 300
base_model = "meta-llama/Llama-3-8b-chat-hf"
top_oss_model = "meta-llama/Llama-3-70b-chat-hf"
finetuned_model = "YOUR_FINETUNED_MODEL_ID"
llm = AsyncLLM(api_provider="together", max_requests_per_minute=max_requests_per_minute, api_key=api_key)
evaluator_model = "meta-llama/Llama-3-70b-chat-hf"
eval_dataset = "EvalDataset-100.json"
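# Assumed structure of EvalDataset-100.json, inferred from the keys used below: each record
# provides an "instruction" to prompt the models and an "output" used as the ground truth.
# Example (values illustrative):
#
# [
#   {"instruction": "What is 2 + 2?", "output": "4"},
#   ...
# ]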
# 1. Get all responses for the eval dataset
with open(eval_dataset, "r", encoding="utf-8") as eval_file:
    eval_data = json.load(eval_file)
results = []
def initial_request_list(model, eval_data):
    return [
        {"model": model, "messages": [{"role": "user", "content": example["instruction"]}], "max_tokens": 1500}
        for example in eval_data
    ]
request_list = (
    initial_request_list(base_model, eval_data)
    + initial_request_list(top_oss_model, eval_data)
    + initial_request_list(finetuned_model, eval_data)
)
completions = llm.generate(request_list)
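# Note: the indexing below assumes Dataformer returns one entry per request, with the original
# request dict (including "model") at index 0 and the OpenAI-style response at index 1.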
# Initialize empty lists for each model's completions
baseModelCompletions = []
topOSSModelCompletions = []
finetunedModelCompletions = []
# Iterate over completions and categorize them based on the model name
for completion in completions:
    model_name = completion[0]['model']
    message_content = completion[1]['choices'][0]['message']['content']
    if model_name == base_model:
        baseModelCompletions.append(message_content)
    elif model_name == top_oss_model:
        topOSSModelCompletions.append(message_content)
    elif model_name == finetuned_model:
        finetunedModelCompletions.append(message_content)
for idx, example in enumerate(eval_data):
    results.append(
        {
            "groundTruthAnswer": example["output"],
            "baseModelAnswer": baseModelCompletions[idx],
            "topOSSModelAnswer": topOSSModelCompletions[idx],
            "fineTunedModelAnswer": finetunedModelCompletions[idx],
        }
    )
# 2. Send the responses from the base, top OSS & fine-tuned models to Llama-3-70B to grade them on accuracy
with open("results.json", "w", encoding="utf-8") as results_file:
    json.dump(results, results_file, indent=4)
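# Each entry in results.json pairs the ground-truth answer with the three model answers.
# Example entry (values illustrative):
#
# {
#   "groundTruthAnswer": "4",
#   "baseModelAnswer": "The answer is 4.",
#   "topOSSModelAnswer": "4",
#   "fineTunedModelAnswer": "4"
# }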
baseModelCount = 0
topOSSModelCount = 0
fineTunedModelCount = 0
badResponses = 0
numErrors = 0
# Function to prepare the request list
def prepare_request_list(model_answer_key, evaluator_model):
    return [
        {
            "model": evaluator_model,
            "metadata": {"mak": model_answer_key},
            "messages": [
                {
                    "role": "system",
                    "content": "You will be given a ground truth answer and a model answer. Please output ACCURATE if the model answer matches the ground truth answer or INACCURATE otherwise. Please only return ACCURATE or INACCURATE. It is very important for my job that you do this.",
                },
                {
                    "role": "user",
                    "content": f"""
                    <GroundTruthAnswer>
                    {result["groundTruthAnswer"]}
                    </GroundTruthAnswer>
                    <ModelAnswer>
                    {result[model_answer_key]}
                    </ModelAnswer>
                    """,
                },
            ],
        }
        for result in results
    ]
# Function to count the results
def count_results(evaluations):
    global badResponses
    count = 0
    for evaluation in evaluations:
        eval_answer = evaluation[1]["choices"][0]["message"]["content"]
        if eval_answer == "ACCURATE":
            count += 1
        elif eval_answer != "INACCURATE":
            badResponses += 1
    return count
request_list = (
    prepare_request_list("baseModelAnswer", evaluator_model)
    + prepare_request_list("topOSSModelAnswer", evaluator_model)
    + prepare_request_list("fineTunedModelAnswer", evaluator_model)
)
evaluations = llm.generate(request_list)
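# Note: the "metadata" dict attached to each request is assumed to come back as the last element
# of each evaluation entry, which is how responses are routed back to the right model below.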
base_model_evaluations = []
top_oss_model_evaluations = []
finetuned_model_evaluations = []
for evaluation in evaluations:
    mak = evaluation[-1]['mak']
    if mak == "baseModelAnswer":
        base_model_evaluations.append(evaluation)
    elif mak == "topOSSModelAnswer":
        top_oss_model_evaluations.append(evaluation)
    elif mak == "fineTunedModelAnswer":
        finetuned_model_evaluations.append(evaluation)
baseModelCount = count_results(base_model_evaluations)
topOSSModelCount = count_results(top_oss_model_evaluations)
fineTunedModelCount = count_results(finetuned_model_evaluations)
print("Base model count: ", baseModelCount)
print("Top OSS model count: ", topOSSModelCount)
print("Fine-tuned model count: ", fineTunedModelCount)
print("Bad responses count: ", badResponses)
print("Number of errors: ", numErrors)
print("\n=== Results – accuracy (%) ===\n")
print("Base model (Llama-3-8b): ", f"{baseModelCount / len(results) *100 }%")
print(
"Top OSS model (Llama-3-70b): ",
f"{topOSSModelCount / len(results) * 100}%",
)
print("Fine-tuned model: ", f"{fineTunedModelCount / len(results) *100 }%")
end_time = time.time() # Stop the timer
total_time = end_time - start_time # Calculate total time
print(f"\nTotal execution time: {total_time} seconds")