Peilun-Li/geval-logprob.ipynb

## geval-logprob.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "e19e1743-b8f8-4670-b965-95f1a5496902",
   "metadata": {},
   "outputs": [],
   "source": [
    "import deepeval\n",
    "\n",
    "import pandas as pd\n",
    "from deepeval import evaluate\n",
    "from deepeval.test_case import LLMTestCase\n",
    "from deepeval.metrics import GEval\n",
    "from deepeval.test_case import LLMTestCaseParams\n",
    "from openai import OpenAI\n",
    "from deepeval.metrics.g_eval.template import GEvalTemplate\n",
    "from deepeval.utils import trimAndLoadJson\n",
    "\n",
    "client = OpenAI()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "90550ff6-7b6e-40d4-8f95-68835cf81f41",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'0.20.92'"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show which deepeval version is installed in this kernel\n",
    "deepeval.__version__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "3b838063-e367-4f31-8122-485fe3125077",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Agent: Good afternoon! This is Alex from Sunshine Realty. How may I assist you today?\n",
      "\n",
      "Buyer: Hi Alex, my name is Jordan. I'm looking to buy a house in the Greater Seattle area and I came across your contact online.\n",
      "\n",
      "Agent: Wonderful to hear from you, Jordan! Seattle is a great choice. Do you have any particular neighborhoods in mind?\n",
      "\n",
      "Buyer: I've been looking at a few areas like Bellevue, Kirkland, and Queen Anne. I'm looking for a place that's family-friendly since I have two kids.\n",
      "\n",
      "Agent: Those are excellent choices, each with its unique charm and amenities. What type of home are you looking for, and what's your budget?\n",
      "\n",
      "Buyer: I'm interested in a single-family home, preferably with at least three bedrooms and a nice backyard. My budget is around $800,000 to $1,000,000.\n",
      "\n",
      "Agent: That's a healthy budget for those areas. We can certainly find something that meets your needs. How soon are you looking to move?\n",
      "\n",
      "Buyer: Ideally, I'd like to move within the next six months. I'm starting a new job in the area and would like to settle in as soon as possible.\n",
      "\n",
      "Agent: Six months gives us a good timeframe to work with. Have you been pre-approved for a mortgage yet?\n",
      "\n",
      "Buyer: Not yet, but I have a meeting with my bank next week to discuss the pre-approval process.\n",
      "\n",
      "Agent: That's a great first step. Getting pre-approved will give you a better idea of what you can afford and makes you a more attractive buyer to sellers.\n",
      "\n",
      "Buyer: I've also been doing some research on schools in the area. Do you have any insights on the school districts?\n",
      "\n",
      "Agent: Absolutely, the Bellevue School District is highly rated, and both Kirkland and Queen Anne have excellent schools as well. I can provide you with more detailed information on each district if you'd like.\n",
      "\n",
      "Buyer: That would be great, thanks. I also work from home, so having a space that can be used as a home office is important.\n",
      "\n",
      "Agent: Understood. Many homes in these areas have extra rooms or spaces that can be easily converted into a home office. I'll make sure to include that in our search criteria.\n",
      "\n",
      "Buyer: Perfect. What are the next steps from here?\n",
      "\n",
      "Agent: I'll start by sending you a list of current listings that match your criteria. We can then schedule some viewings for you to get a feel for the homes and neighborhoods.\n",
      "\n",
      "Buyer: Sounds good. How do I get the listings?\n",
      "\n",
      "Agent: I'll email them to you. Can I get the best email address to send those to?\n",
      "\n",
      "Buyer: Sure, it's jordan@email.com.\n",
      "\n",
      "Agent: Great, I'll send those over shortly. In the meantime, feel free to reach out if you have any questions or if you see a listing elsewhere that you're curious about.\n",
      "\n",
      "Buyer: Will do. Thank you for your help, Alex. I'm looking forward to working with you.\n",
      "\n",
      "Agent: It's my pleasure, Jordan. I'm here to help you find the perfect home for your family. We'll be in touch soon. Have a great day!\n",
      "\n",
      "Buyer: You too, goodbye.\n",
      "\n",
      "Agent: Goodbye!\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Synthetic agent/buyer call transcript generated by GPT; printed below for reference\n",
    "call_transcript = \"\"\"\n",
    "Agent: Good afternoon! This is Alex from Sunshine Realty. How may I assist you today?\n",
    "\n",
    "Buyer: Hi Alex, my name is Jordan. I'm looking to buy a house in the Greater Seattle area and I came across your contact online.\n",
    "\n",
    "Agent: Wonderful to hear from you, Jordan! Seattle is a great choice. Do you have any particular neighborhoods in mind?\n",
    "\n",
    "Buyer: I've been looking at a few areas like Bellevue, Kirkland, and Queen Anne. I'm looking for a place that's family-friendly since I have two kids.\n",
    "\n",
    "Agent: Those are excellent choices, each with its unique charm and amenities. What type of home are you looking for, and what's your budget?\n",
    "\n",
    "Buyer: I'm interested in a single-family home, preferably with at least three bedrooms and a nice backyard. My budget is around $800,000 to $1,000,000.\n",
    "\n",
    "Agent: That's a healthy budget for those areas. We can certainly find something that meets your needs. How soon are you looking to move?\n",
    "\n",
    "Buyer: Ideally, I'd like to move within the next six months. I'm starting a new job in the area and would like to settle in as soon as possible.\n",
    "\n",
    "Agent: Six months gives us a good timeframe to work with. Have you been pre-approved for a mortgage yet?\n",
    "\n",
    "Buyer: Not yet, but I have a meeting with my bank next week to discuss the pre-approval process.\n",
    "\n",
    "Agent: That's a great first step. Getting pre-approved will give you a better idea of what you can afford and makes you a more attractive buyer to sellers.\n",
    "\n",
    "Buyer: I've also been doing some research on schools in the area. Do you have any insights on the school districts?\n",
    "\n",
    "Agent: Absolutely, the Bellevue School District is highly rated, and both Kirkland and Queen Anne have excellent schools as well. I can provide you with more detailed information on each district if you'd like.\n",
    "\n",
    "Buyer: That would be great, thanks. I also work from home, so having a space that can be used as a home office is important.\n",
    "\n",
    "Agent: Understood. Many homes in these areas have extra rooms or spaces that can be easily converted into a home office. I'll make sure to include that in our search criteria.\n",
    "\n",
    "Buyer: Perfect. What are the next steps from here?\n",
    "\n",
    "Agent: I'll start by sending you a list of current listings that match your criteria. We can then schedule some viewings for you to get a feel for the homes and neighborhoods.\n",
    "\n",
    "Buyer: Sounds good. How do I get the listings?\n",
    "\n",
    "Agent: I'll email them to you. Can I get the best email address to send those to?\n",
    "\n",
    "Buyer: Sure, it's jordan@email.com.\n",
    "\n",
    "Agent: Great, I'll send those over shortly. In the meantime, feel free to reach out if you have any questions or if you see a listing elsewhere that you're curious about.\n",
    "\n",
    "Buyer: Will do. Thank you for your help, Alex. I'm looking forward to working with you.\n",
    "\n",
    "Agent: It's my pleasure, Jordan. I'm here to help you find the perfect home for your family. We'll be in touch soon. Have a great day!\n",
    "\n",
    "Buyer: You too, goodbye.\n",
    "\n",
    "Agent: Goodbye!\n",
    "\"\"\"\n",
    "print(call_transcript)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "c3dc80ff-5201-4081-a207-b8413ba1f275",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Relevance - selection of important content from the source. Given input as the summarized source and actual_output as the summary, the summary should include only important information from the source document. Penalize summaries which contained redundancies and excess information.\n",
      "['Compare the actual_output with the input to identify if the summary contains the most important content from the source.', 'Check for any redundancies in the actual_output that repeat information unnecessarily.', 'Identify any excess information in the actual_output that was not present or deemed important in the input.', 'Ensure that the actual_output does not omit crucial information from the input that affects the understanding of the topic.']\n"
     ]
    }
   ],
   "source": [
    "# G-Eval metric whose criteria treat `input` as the source and `actual_output` as its summary\n",
    "relevance_metric = GEval(\n",
    "    name=\"Relevance\",\n",
    "    criteria='Relevance - selection of important content from the source. Given input as the summarized source and actual_output as the summary, the summary should include only important information from the source document. Penalize summaries which contained redundancies and excess information.',\n",
    "    evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT],\n",
    "    async_mode=False,\n",
    ")\n",
    "\n",
    "# Generate the evaluation steps once and store them on the metric, then show criteria and steps.\n",
    "# NOTE(review): _generate_evaluation_steps is a private deepeval method - confirm it still exists when upgrading.\n",
    "relevance_metric.evaluation_steps = relevance_metric._generate_evaluation_steps()\n",
    "print(relevance_metric.criteria)\n",
    "print(relevance_metric.evaluation_steps)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "7428aafa-d548-4d6e-aed9-e47d9068a95f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The buyer, Jordan, is looking to buy a family-friendly single-family home in the Greater Seattle area with a budget of $800,000 to $1,000,000. They are interested in neighborhoods like Bellevue, Kirkland, and Queen Anne and have two kids. Jordan has not been pre-approved for a mortgage yet but has a meeting scheduled for next week. They are also looking for a home with a potential home office space and have been researching school districts in the area. The agent, Alex, will send Jordan a list of current listings that match their criteria and schedule viewings. They plan to work together to find the perfect home within the next six months.\n"
     ]
    }
   ],
   "source": [
    "# Test case 1: summary of the transcript that states the budget as the $800,000 to $1,000,000 range\n",
    "test_case = LLMTestCase(\n",
    "    input=call_transcript,\n",
    "    actual_output='The buyer, Jordan, is looking to buy a family-friendly single-family home in the Greater Seattle area with a budget of $800,000 to $1,000,000. They are interested in neighborhoods like Bellevue, Kirkland, and Queen Anne and have two kids. Jordan has not been pre-approved for a mortgage yet but has a meeting scheduled for next week. They are also looking for a home with a potential home office space and have been researching school districts in the area. The agent, Alex, will send Jordan a list of current listings that match their criteria and schedule viewings. They plan to work together to find the perfect home within the next six months.'\n",
    ")\n",
    "print(test_case.actual_output)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "f1ca680d-d199-4f6b-a226-c44008d40cc5",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The buyer, Jordan, is looking to buy a family-friendly single-family home in the Greater Seattle area with a budget of $900,000. They are interested in neighborhoods like Bellevue, Kirkland, and Queen Anne and have two kids. Jordan has not been pre-approved for a mortgage yet but has a meeting scheduled for next week. They are also looking for a home with a potential home office space and have been researching school districts in the area. The agent, Alex, will send Jordan a list of current listings that match their criteria and schedule viewings. They plan to work together to find the perfect home within the next six months.\n"
     ]
    }
   ],
   "source": [
    "# Test case 2: same summary as test_case, except the budget is stated as $900,000\n",
    "# instead of the $800,000 to $1,000,000 range (the only change in the output)\n",
    "test_case_2 = LLMTestCase(\n",
    "    input=call_transcript,\n",
    "    actual_output='The buyer, Jordan, is looking to buy a family-friendly single-family home in the Greater Seattle area with a budget of $900,000. They are interested in neighborhoods like Bellevue, Kirkland, and Queen Anne and have two kids. Jordan has not been pre-approved for a mortgage yet but has a meeting scheduled for next week. They are also looking for a home with a potential home office space and have been researching school districts in the area. The agent, Alex, will send Jordan a list of current listings that match their criteria and schedule viewings. They plan to work together to find the perfect home within the next six months.'\n",
    ")\n",
    "print(test_case_2.actual_output)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "ae664c40-85cc-413c-90ae-18415a23558a",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "from typing import Dict\n",
    "from langchain_core.messages import HumanMessage\n",
    "from langchain_openai import ChatOpenAI\n",
    "\n",
    "def get_metric_prompt(metric, test_case):\n",
    "    \"\"\"Build the G-Eval scoring prompt for ``test_case`` under ``metric``.\n",
    "\n",
    "    Every parameter in ``metric.evaluation_params`` contributes one\n",
    "    ``<name>: <value>`` section (read from ``test_case`` by attribute name).\n",
    "    The joined sections and the metric's numbered evaluation steps are then\n",
    "    handed to ``GEvalTemplate.generate_evaluation_results``.\n",
    "\n",
    "    Args:\n",
    "        metric: exposes ``evaluation_params`` (items whose ``.value`` names a\n",
    "            test-case attribute) and ``number_evaluation_steps()``.\n",
    "        test_case: object carrying the attributes named by those params.\n",
    "\n",
    "    Returns:\n",
    "        The rendered prompt, which is also stored on ``metric.prompt``.\n",
    "    \"\"\"\n",
    "    # Collect all sections first and render the template exactly once, so the\n",
    "    # template is not re-rendered per parameter and ``prompt`` is defined even\n",
    "    # when there are no evaluation params.\n",
    "    text = \"\".join(\n",
    "        f\"{param.value}: {getattr(test_case, param.value)} \\n\\n\"\n",
    "        for param in metric.evaluation_params\n",
    "    )\n",
    "    prompt = GEvalTemplate.generate_evaluation_results(\n",
    "        evaluation_steps=metric.number_evaluation_steps(),\n",
    "        text=text,\n",
    "    )\n",
    "    metric.prompt = prompt\n",
    "    return prompt\n",
    "\n",
    "\n",
    "def generate_logprob_based_score(score_logprob_dict, min_logprob=-4.605):\n",
    "    \"\"\"\n",
    "    Convert the logprobs of the score token into a probability-weighted score.\n",
    "\n",
    "    Candidates in ``top_logprobs`` are kept when their logprob is at least\n",
    "    ``min_logprob`` and their token is a decimal string. The result is\n",
    "    ``sum(score * p) / sum(p)`` over the kept candidates, where\n",
    "    ``p = exp(logprob)``, so the kept probabilities are rescaled to sum to 1.\n",
    "\n",
    "    example input:\n",
    "    {\n",
    "        'token': '9',\n",
    "        'bytes': [57],\n",
    "        'logprob': -0.18066935,\n",
    "        'top_logprobs': [\n",
    "            {'token': '9', 'bytes': [57], 'logprob': -0.18066935},\n",
    "            {'token': '8', 'bytes': [56], 'logprob': -1.8056693},\n",
    "            {'token': '10', 'bytes': [49, 48], 'logprob': -7.1337943},\n",
    "            {'token': '7', 'bytes': [55], 'logprob': -8.977545},\n",
    "            {'token': ' ', 'bytes': [32], 'logprob': -15.477545},\n",
    "            {'token': '09', 'bytes': [48, 57], 'logprob': -21.83692},\n",
    "            {'token': ' nine', 'bytes': [32, 110, 105, 110, 101], 'logprob': -22.74317},\n",
    "            {'token': '<|end|>', 'bytes': None, 'logprob': -23.469732},\n",
    "            ...\n",
    "        ]\n",
    "    }\n",
    "\n",
    "    Args:\n",
    "        score_logprob_dict: logprob entry of the score token; only its\n",
    "            ``top_logprobs`` list (items with ``token`` and ``logprob``) is read.\n",
    "        min_logprob: candidates with a lower logprob are ignored. The default\n",
    "            -4.605 is ~np.log(0.01), i.e. below 1% linear probability.\n",
    "\n",
    "    Returns:\n",
    "        The probability-weighted score.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: if no candidate passes the filters, because the weights\n",
    "            cannot be normalised.\n",
    "    \"\"\"\n",
    "    token_probability: Dict[int, float] = {}\n",
    "    for token_logprob in score_logprob_dict[\"top_logprobs\"]:\n",
    "        # skip unlikely candidates (default threshold: <1% linear probability)\n",
    "        if token_logprob[\"logprob\"] < min_logprob:\n",
    "            continue\n",
    "        # skip candidates that are not decimal strings (spaces, words, special tokens)\n",
    "        if not token_logprob[\"token\"].isdecimal():\n",
    "            continue\n",
    "\n",
    "        candidate_score = int(token_logprob[\"token\"])\n",
    "        # several tokens can spell the same number (e.g. '9' and '09'), so add\n",
    "        # their probabilities up instead of letting the last one overwrite\n",
    "        token_probability[candidate_score] = (\n",
    "            token_probability.get(candidate_score, 0.0) + np.exp(token_logprob[\"logprob\"])\n",
    "        )\n",
    "\n",
    "    print(token_probability)\n",
    "    if not token_probability:\n",
    "        raise ValueError(\n",
    "            f\"no decimal score token with logprob >= {min_logprob} in top_logprobs\"\n",
    "        )\n",
    "\n",
    "    sum_linear_prob = sum(token_probability.values())\n",
    "    weighted_summed_score = sum(\n",
    "        candidate_score * linear_prob\n",
    "        for candidate_score, linear_prob in token_probability.items()\n",
    "    )\n",
    "\n",
    "    # scale sum of probability to 1\n",
    "    return weighted_summed_score / sum_linear_prob\n",
    "    \n",
    "\n",
    "def score(metric, test_case, model_name=\"gpt-4-0125-preview\"):\n",
    "    \"\"\"Score ``test_case`` with ``metric`` and return a logprob-weighted score.\n",
    "\n",
    "    Sends the prompt from ``get_metric_prompt`` to a chat model with logprobs\n",
    "    enabled, reads ``score`` from the JSON reply, finds the first generated\n",
    "    token equal to ``str(score)`` and passes its logprob entry to\n",
    "    ``generate_logprob_based_score``. The parsed reply and the matched\n",
    "    logprob entry are printed along the way.\n",
    "\n",
    "    Args:\n",
    "        metric: metric accepted by ``get_metric_prompt``.\n",
    "        test_case: test case accepted by ``get_metric_prompt``.\n",
    "        model_name: chat model passed to ``ChatOpenAI``.\n",
    "\n",
    "    Returns:\n",
    "        The value returned by ``generate_logprob_based_score``.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: if no generated token equals ``str(score)``, so there is\n",
    "            no logprob entry to weight.\n",
    "    \"\"\"\n",
    "    llm = ChatOpenAI(\n",
    "        temperature=0,\n",
    "        model_name=model_name,\n",
    "        model_kwargs={\"logprobs\": True, \"top_logprobs\": 20},\n",
    "    )\n",
    "\n",
    "    prompt = get_metric_prompt(metric, test_case)\n",
    "    generation = llm.generate([[HumanMessage(content=prompt)]]).generations[0][0]\n",
    "\n",
    "    # parse the JSON reply once and reuse it\n",
    "    parsed_reply = trimAndLoadJson(generation.text)\n",
    "    print(parsed_reply)\n",
    "    score_token = str(parsed_reply[\"score\"])\n",
    "\n",
    "    # first generated token whose text equals the reported score\n",
    "    score_logprob_dict = next(\n",
    "        (\n",
    "            logprob_dict\n",
    "            for logprob_dict in generation.generation_info[\"logprobs\"][\"content\"]\n",
    "            if logprob_dict[\"token\"] == score_token\n",
    "        ),\n",
    "        None,\n",
    "    )\n",
    "    if score_logprob_dict is None:\n",
    "        raise ValueError(f\"no generated token matches score {score_token!r}\")\n",
    "\n",
    "    print(score_logprob_dict)\n",
    "    return generate_logprob_based_score(score_logprob_dict)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "9af1d5f7-2da1-4208-a859-434385879101",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'score': 9, 'reason': \"The actual_output effectively summarizes the key points of the conversation, capturing the buyer's requirements, budget, and timeframe, as well as the agent's response and next steps. It omits minor details but retains all crucial information for understanding the topic, ensuring a concise and relevant summary without unnecessary repetition or excess information.\"}\n",
      "{'token': '9', 'bytes': [57], 'logprob': -0.09826567, 'top_logprobs': [{'token': '9', 'bytes': [57], 'logprob': -0.09826567}, {'token': '8', 'bytes': [56], 'logprob': -2.3795156}, {'token': '10', 'bytes': [49, 48], 'logprob': -6.9263906}, {'token': '7', 'bytes': [55], 'logprob': -11.129516}, {'token': ' ', 'bytes': [32], 'logprob': -15.317016}, {'token': '6', 'bytes': [54], 'logprob': -19.817017}, {'token': '09', 'bytes': [48, 57], 'logprob': -22.067017}, {'token': ' nine', 'bytes': [32, 110, 105, 110, 101], 'logprob': -22.223267}, {'token': '0', 'bytes': [48], 'logprob': -22.379517}, {'token': '5', 'bytes': [53], 'logprob': -22.723267}, {'token': '<|end|>', 'bytes': None, 'logprob': -23.35608}, {'token': '９', 'bytes': [239, 188, 153], 'logprob': -23.692017}, {'token': '08', 'bytes': [48, 56], 'logprob': -23.715454}, {'token': '\\xa0', 'bytes': [194, 160], 'logprob': -23.754517}, {'token': ' eight', 'bytes': [32, 101, 105, 103, 104, 116], 'logprob': -24.559204}, {'token': '4', 'bytes': [52], 'logprob': -24.848267}, {'token': '90', 'bytes': [57, 48], 'logprob': -24.879517}, {'token': '\\n\\n', 'bytes': [10, 10], 'logprob': -25.035767}, {'token': '3', 'bytes': [51], 'logprob': -25.48108}, {'token': '1', 'bytes': [49], 'logprob': -25.527954}]}\n",
      "{9: 0.906408066332716, 8: 0.09259541987003198}\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "8.907312215473851"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Logprob-weighted relevance score for the summary that states the full budget range\n",
    "score(relevance_metric, test_case)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "9badd50e-52d9-4729-99a7-cb4221d269a5",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'score': 9, 'reason': \"The summary effectively captures the most important content from the conversation, including Jordan's requirements for a new home, budget, and timeframe, without including unnecessary details. However, it slightly overestimates the budget range by stating it as $900,000 instead of the specified range of $800,000 to $1,000,000, which could be seen as omitting crucial budget flexibility information.\"}\n",
      "{'token': '9', 'bytes': [57], 'logprob': -0.46307245, 'top_logprobs': [{'token': '9', 'bytes': [57], 'logprob': -0.46307245}, {'token': '8', 'bytes': [56], 'logprob': -0.9943224}, {'token': '7', 'bytes': [55], 'logprob': -7.8380723}, {'token': '10', 'bytes': [49, 48], 'logprob': -8.166198}, {'token': ' ', 'bytes': [32], 'logprob': -14.947448}, {'token': '6', 'bytes': [54], 'logprob': -16.181822}, {'token': '5', 'bytes': [53], 'logprob': -19.900572}, {'token': '09', 'bytes': [48, 57], 'logprob': -21.494322}, {'token': '08', 'bytes': [48, 56], 'logprob': -21.869322}, {'token': '0', 'bytes': [48], 'logprob': -21.970884}, {'token': '4', 'bytes': [52], 'logprob': -22.220884}, {'token': ' nine', 'bytes': [32, 110, 105, 110, 101], 'logprob': -22.48651}, {'token': ' eight', 'bytes': [32, 101, 105, 103, 104, 116], 'logprob': -22.92401}, {'token': '\\xa0', 'bytes': [194, 160], 'logprob': -23.088072}, {'token': '<|end|>', 'bytes': None, 'logprob': -23.158384}, {'token': '９', 'bytes': [239, 188, 153], 'logprob': -23.42401}, {'token': '3', 'bytes': [51], 'logprob': -23.48651}, {'token': '８', 'bytes': [239, 188, 152], 'logprob': -23.58026}, {'token': '90', 'bytes': [57, 48], 'logprob': -23.838072}, {'token': '\\n\\n', 'bytes': [10, 10], 'logprob': -24.33026}]}\n",
      "{9: 0.6293470346637394, 8: 0.36997405404699285}\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "8.629774595746486"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Logprob-weighted relevance score for the variant summary whose budget was changed to $900,000\n",
    "score(relevance_metric, test_case_2)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}