Skip to content

Instantly share code, notes, and snippets.

@passaglia
Created June 16, 2023 05:56
Show Gist options
  • Save passaglia/bd6d30e491c8af7361dcb97243e0cd22 to your computer and use it in GitHub Desktop.
Save passaglia/bd6d30e491c8af7361dcb97243e0cd22 to your computer and use it in GitHub Desktop.
Evaluate GPT on JCQA benchmark
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"id": "4bc45018-ee85-4eaa-bdf6-c627a40058da",
"metadata": {},
"source": [
"# Evaluate GPT performance on JCQA task\n",
"## Using OpenAI's evals framework"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "ada8dde9-5388-4b3f-8cc7-2d5c384a1887",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# Install openai evals framework\n",
"!git clone git@github.com:openai/evals.git\n",
"%pip install -qe ./evals/\n",
"\n",
"# Download JGLUE if you haven't already\n",
"\n",
"!git clone git@github.com:yahoojapan/JGLUE.git"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "c8852397-92f7-4a2a-8eb5-939727d094e7",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"import openai\n",
"import pandas as pd\n",
"import tiktoken\n",
"import yaml\n",
"\n",
"# Set your API key\n",
"openai.api_key = os.getenv(\"OPENAI_API_KEY\")\n",
"\n",
"# Point to the dataset and the evals registry\n",
"data_pth = \"JGLUE/datasets/jcommonsenseqa-v1.1\"\n",
"registry_pth = os.path.join(os.getcwd(), \"evals/evals/registry\")\n",
"\n",
"# Choose your model\n",
"model = \"gpt-3.5-turbo\"\n",
"enc = tiktoken.encoding_for_model(model)\n",
"\n",
"# Choose N-shot\n",
"num_few_shot = 1\n",
"\n",
"# Maximum number of test questions to run for testing purposes\n",
"# 0 means unlimited\n",
"limit = 0 "
]
},
{
"cell_type": "code",
"execution_count": 59,
"id": "c28b1346-2e98-4f92-b95f-7c4bcbfc18cf",
"metadata": {},
"outputs": [],
"source": [
"# Build the prompts using Chat format. Supports converting Chat conversations to text for non-Chat models\n",
"\n",
"choices = [\"0\", \"1\", \"2\", \"3\", \"4\"]\n",
"sys_msg = \"The following are multiple choice questions with answers.\"\n",
"\n",
"\n",
"def create_chat_prompt(sys_msg, question, answers):\n",
" user_prompt = (\n",
" f\"{question}\\n\"\n",
" + \"\\n\".join([f\"{choice}. {answer}\" for choice, answer in zip(choices, answers)])\n",
" + \"\\nAnswer:\"\n",
" )\n",
" return [\n",
" {\"role\": \"system\", \"content\": sys_msg},\n",
" {\"role\": \"user\", \"content\": user_prompt},\n",
" ]\n",
"\n",
"\n",
"def create_chat_example(question, answers, correct_answer):\n",
" \"\"\"\n",
" Form few-shot prompts in the recommended format: https://github.com/openai/openai-python/blob/main/chatml.md#few-shot-prompting\n",
" \"\"\"\n",
" user_prompt = (\n",
" f\"{question}\\n\"\n",
" + \"\\n\".join([f\"{choice}. {answer}\" for choice, answer in zip(choices, answers)])\n",
" + \"\\nAnswer:\"\n",
" )\n",
" return [\n",
" {\"role\": \"system\", \"content\": user_prompt, \"name\": \"example_user\"},\n",
" {\"role\": \"system\", \"content\": correct_answer, \"name\": \"example_assistant\"},\n",
" ]\n",
"\n",
"\n",
"# Create few-shot prompts\n",
"dev_df = pd.read_json(os.path.join(data_pth, \"train-v1.1.json\"), lines=True)\n",
"dev_df[\"label\"] = dev_df[\"label\"].astype(\"str\")\n",
"dev_df[\"sample\"] = dev_df.apply(\n",
" lambda x: create_chat_example(\n",
" x[\"question\"],\n",
" x[[\"choice0\", \"choice1\", \"choice2\", \"choice3\", \"choice4\"]],\n",
" x[\"label\"],\n",
" ),\n",
" axis=1,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 60,
"id": "6271fe83-383e-45f2-9f61-b0aa2c095d67",
"metadata": {},
"outputs": [],
"source": [
"# Create the test question prompts and the ideal completions\n",
"test_df = pd.read_json(os.path.join(data_pth, \"valid-v1.1.json\"), lines=True)\n",
"test_df[\"label\"] = test_df[\"label\"].astype(\"str\")\n",
"test_df[\"input\"] = test_df.apply(\n",
" lambda x: create_chat_prompt(\n",
" sys_msg,\n",
" x[\"question\"],\n",
" x[[\"choice0\", \"choice1\", \"choice2\", \"choice3\", \"choice4\"]],\n",
" ),\n",
" axis=1,\n",
")\n",
"test_df[\"ideal\"] = test_df[\"label\"]\n",
"\n",
"# For testing purposes shrink the test set\n",
"if limit:\n",
" test_df = test_df.sample(limit).reset_index(drop=True)"
]
},
{
"cell_type": "code",
"execution_count": 61,
"id": "86b8af6c-b182-4d2d-aebf-e13bf1dabbe6",
"metadata": {},
"outputs": [],
"source": [
"# Generate a task folder inside oaievals\n",
"task_pth = os.path.join(registry_pth, \"data\", \"jglue\", \"jcqa\")\n",
"os.makedirs(task_pth, exist_ok=True)\n",
"\n",
"# Save the few shot prompts file\n",
"few_shot_pth = os.path.join(task_pth, \"few_shot.jsonl\")\n",
"dev_df[[\"sample\"]].to_json(\n",
" few_shot_pth, lines=True, force_ascii=False, orient=\"records\"\n",
")\n",
"\n",
"# Save the test questions file\n",
"samples_pth = os.path.join(task_pth, \"samples.jsonl\")\n",
"test_df[[\"input\", \"ideal\"]].to_json(\n",
" samples_pth, force_ascii=False, lines=True, orient=\"records\"\n",
")\n",
"\n",
"# Register the task with oaievals\n",
"eval_id = f\"match_jcqa\"\n",
"registry_yaml = {}\n",
"registry_yaml[eval_id] = {\"id\": f\"{eval_id}.test.v1\", \"metrics\": [\"accuracy\"]}\n",
"registry_yaml[f\"{eval_id}.test.v1\"] = {\n",
" \"class\": \"evals.elsuite.basic.match:Match\",\n",
" \"args\": {\n",
" \"samples_jsonl\": samples_pth,\n",
" \"few_shot_jsonl\": few_shot_pth,\n",
" \"num_few_shot\": num_few_shot,\n",
" },\n",
"}\n",
"\n",
"with open(os.path.join(registry_pth, \"evals\", \"jcqa.yaml\"), \"w\") as f:\n",
" yaml.dump(registry_yaml, f)"
]
},
{
"cell_type": "code",
"execution_count": 62,
"id": "b12e65b7-cfff-47db-a2ef-9a0adcf9b0e4",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Doing this query would take at least 76036 tokens\n",
"Estimated cost 0.152072$\n"
]
}
],
"source": [
"# Estimate cost of running the eval (cost estimate for gpt-3.5-turbo)\n",
"\n",
"ntokens = sum(\n",
" [\n",
" len(\n",
" enc.encode(\n",
" test_df[\"input\"][i][0][\"content\"] + test_df[\"input\"][i][1][\"content\"]\n",
" )\n",
" )\n",
" for i in range(len(test_df[\"input\"]))\n",
" ]\n",
")\n",
"print(f\"Doing this query would take at least {ntokens} tokens\")\n",
"print(f\"Estimated cost {ntokens*0.002/1000}$\")"
]
},
{
"cell_type": "code",
"execution_count": 63,
"id": "d5102adf-e7c6-408a-89bb-931218a72e19",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2023-06-16 14:18:23,491] [registry.py:262] Loading registry from /home/passaglia/projects/jbenchmark/evals/evals/registry/evals\n",
"[2023-06-16 14:18:24,154] [registry.py:262] Loading registry from /home/passaglia/.evals/evals\n",
"[2023-06-16 14:18:24,155] [oaieval.py:138] \u001b[1;35mRun started: 230616051824HXZMD6LB\u001b[0m\n",
"[2023-06-16 14:18:24,157] [data.py:83] Fetching /home/passaglia/projects/jbenchmark/evals/evals/registry/data/jglue/jcqa/few_shot.jsonl\n",
"[2023-06-16 14:18:24,301] [data.py:83] Fetching /home/passaglia/projects/jbenchmark/evals/evals/registry/data/jglue/jcqa/samples.jsonl\n",
"[2023-06-16 14:18:24,314] [eval.py:33] Evaluating 1119 samples\n",
"[2023-06-16 14:18:24,320] [eval.py:139] Running in threaded mode with 10 threads!\n",
" 16%|██████▏ | 174/1119 [00:09<00:47, 20.07it/s][2023-06-16 14:18:34,276] [record.py:330] Logged 353 rows of events to ./jcqa_gpt_output/gpt-3.5-turbo_match_jcqa-1shot.jsonl: insert_time=81.470ms\n",
" 30%|████████████ | 338/1119 [00:19<00:55, 14.15it/s][2023-06-16 14:18:44,366] [record.py:330] Logged 324 rows of events to ./jcqa_gpt_output/gpt-3.5-turbo_match_jcqa-1shot.jsonl: insert_time=82.643ms\n",
" 44%|█████████████████▋ | 494/1119 [00:29<00:46, 13.57it/s][2023-06-16 14:18:54,453] [record.py:330] Logged 318 rows of events to ./jcqa_gpt_output/gpt-3.5-turbo_match_jcqa-1shot.jsonl: insert_time=70.104ms\n",
" 49%|███████████████████▊ | 553/1119 [00:33<00:36, 15.31it/s][2023-06-16 14:18:58,291] [_common.py:105] Backing off openai_chat_completion_create_retrying(...) for 1.3s (openai.error.RateLimitError: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID 73936f8b8012e9c9a50393008fde911f in your message.))\n",
" 58%|███████████████████████▏ | 650/1119 [00:39<00:27, 16.76it/s][2023-06-16 14:19:04,505] [record.py:330] Logged 308 rows of events to ./jcqa_gpt_output/gpt-3.5-turbo_match_jcqa-1shot.jsonl: insert_time=47.418ms\n",
" 72%|████████████████████████████▋ | 802/1119 [00:49<00:23, 13.36it/s][2023-06-16 14:19:13,470] [_common.py:105] Backing off openai_chat_completion_create_retrying(...) for 1.2s (openai.error.RateLimitError: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID 78dc5034c285b9bab25cd35d61fa2309 in your message.))\n",
" 73%|█████████████████████████████▏ | 818/1119 [00:50<00:19, 15.57it/s][2023-06-16 14:19:14,636] [record.py:330] Logged 336 rows of events to ./jcqa_gpt_output/gpt-3.5-turbo_match_jcqa-1shot.jsonl: insert_time=100.527ms\n",
" 84%|█████████████████████████████████▌ | 940/1119 [00:56<00:09, 19.48it/s][2023-06-16 14:19:20,831] [_common.py:105] Backing off openai_chat_completion_create_retrying(...) for 1.4s (openai.error.RateLimitError: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID b5650bb1dd95b0b8f89bb5a32508ab95 in your message.))\n",
" 91%|███████████████████████████████████▍ | 1015/1119 [01:00<00:05, 18.95it/s][2023-06-16 14:19:24,733] [record.py:330] Logged 396 rows of events to ./jcqa_gpt_output/gpt-3.5-turbo_match_jcqa-1shot.jsonl: insert_time=93.378ms\n",
"100%|███████████████████████████████████████| 1119/1119 [01:04<00:00, 17.27it/s]\n",
"[2023-06-16 14:19:29,671] [record.py:341] Final report: {'accuracy': 0.9159964253798034, 'boostrap_std': 0.008270062342631649}. Logged to ./jcqa_gpt_output/gpt-3.5-turbo_match_jcqa-1shot.jsonl\n",
"[2023-06-16 14:19:29,672] [oaieval.py:177] Final report:\n",
"[2023-06-16 14:19:29,672] [oaieval.py:179] accuracy: 0.9159964253798034\n",
"[2023-06-16 14:19:29,672] [oaieval.py:179] boostrap_std: 0.008270062342631649\n",
"[2023-06-16 14:19:29,699] [record.py:330] Logged 203 rows of events to ./jcqa_gpt_output/gpt-3.5-turbo_match_jcqa-1shot.jsonl: insert_time=26.828ms\n"
]
}
],
"source": [
"# Run the eval\n",
"import sys\n",
"\n",
"! EVALS_THREADS=10 $sys.executable ./evals/evals/cli/oaieval.py gpt-3.5-turbo match_jcqa --extra_eval_params max_tokens=1 --record_path ./jcqa_gpt_output/gpt-3.5-turbo_match_jcqa-1shot.jsonl"
]
},
{
"cell_type": "markdown",
"id": "f2aae124-d0f1-4c85-9728-d51d66f0cb84",
"metadata": {},
"source": [
"# Analyze Results"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "ca188c6d-a434-4f2e-8549-649371606b69",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<AxesSubplot: title={'center': 'Correctness of generated answers'}, xlabel='Correctness', ylabel='Count'>"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAkQAAAHcCAYAAAA3PbXpAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjYuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8o6BhiAAAACXBIWXMAAA9hAAAPYQGoP6dpAABBi0lEQVR4nO3deVyU5f7/8feAbCIDboDkijuuJWm4pqK4dfSkmWWJ5fY1sNwyrTS1UrOTmUqa1VHr5Kljtlq5hJqm5JrmvuWWClgEuKQoXL8/+jGPRlDRkEHu1/PxmMejua5r7vtz3zMj7657GZsxxggAAMDC3FxdAAAAgKsRiAAAgOURiAAAgOURiAAAgOURiAAAgOURiAAAgOURiAAAgOURiAAAgOURiAAAgOURiAAUWklJSerRo4dKly4tm82m6dOnu7qkIq1y5crq27evq8sAXIJAhCLr0KFDGjRokEJDQ+Xt7S273a5mzZrpjTfe0B9//OHq8m7Y7t27NX78eB05csTVpRSYYcOGadmyZRozZozef/99dejQwdUludzXX3+t8ePHu7oMoMgp5uoCgFvhq6++0gMPPCAvLy/16dNHdevWVUZGhr7//ns9/fTT2rVrl+bOnevqMm/I7t27NWHCBN17772qXLmyq8spECtXrlTXrl01cuRIV5dSaHz99deKi4sjFAH5jECEIufw4cPq1auXKlWqpJUrV6pcuXKOvpiYGB08eFBfffXV316PMUYXLlyQj49Pjr4LFy7I09NTbm5Mwv4dycnJCggIcHUZt9S5c+fk6+vr6jJwhWt9v1E08a81ipypU6fq7Nmzevfdd53CULZq1arpqaeecjy/fPmyXnzxRVWtWlVeXl6qXLmynn32WV28eNHpdZUrV1aXLl20bNkyhYeHy8fHR2+99ZZWr14tm82mDz/8UM8//7zuuOMOFS9eXOnp6ZKkDRs2qEOHDvL391fx4sXVqlUrrVu3LkddJ06cUL9+/RQSEiIvLy9VqVJFgwcPVkZGhubPn68HHnhAktS6dWvZbDbZbDatXr3aqbbvv/9ejRs3lre3t0JDQ/Xee+/lWE9qaqqGDh2qChUqyMvLS9WqVdMrr7yirKwsp3EffvihGjVqJD8/P9ntdtWrV09vvPGGo//SpUuaMGGCqlevLm9vb5UuXVrNmzfXihUrrvse/fzzz3rggQdUqlQpFS9eXPfcc49TSJ0/f75sNpuMMYqLi3Ns77X89ttvevTRR2W32xUQEKDo6Ght375dNptN8+fPdxq7d+9e9ejRQ6VKlZK3t7fCw8P1xRdfOI3JrmHdunUaPny4ypYtK19fX/3zn//U6dOnc6z/m2++UYsWLeTr6ys/Pz917txZu3btchrTt29flShRQocOHVKnTp3k5+en3r17S5LWrl2rBx54QBUrVpSXl5cqVKigYcOGOR3e7du3r+Li4iTJsU/+ul+ysrI0ffp01alTR97e3goKCtKgQYP0+++/O9VhjNFLL72k8uXLq3jx4mrdunWOWq/lX//6l5o2barSpUvLx8dHjRo10scff5xjnM1mU2xsrD777DPVrVtXXl5eqlOnjpYuXeo07syZMxo6dKgqV64sLy8vBQYGql27dtq6daskacaMGXJ3d1dqaqrjNa+99ppsNpuGDx/uaMvMzJSfn5+eeeaZG94nV/t+S9KKFSvUvHlzBQQEqESJEqpZs6aeffbZPO8v3CYMUMTccccdJjQ0NM/jo6OjjSTTo0cPExcXZ/r06WMkmW7dujmNq1SpkqlWrZopWbKkGT16tJkzZ45ZtWqVWbVqlZFkwsLCTMOGDc20adPM5MmTzblz50x8fLzx9PQ0ERER5rXXXjOvv/66qV+/vvH09DQbNmxwLPvEiRMmJCTEFC9e3AwdOtTMmTPHjB071tSuXdv8/vvv5tChQ+bJJ580ksyzzz5r3n//ffP++++bxMRER201a9Y0QUFB5tlnnzWzZs
0yd911l7HZbGbnzp2O9Zw7d87Ur1/flC5d2jz77LNmzpw5pk+fPsZms5mnnnrKMW758uVGkmnbtq2Ji4szcXFxJjY21jzwwAOOMc8++6yx2WxmwIAB5u233zavvfaaeeihh8yUKVOuub8TExNNUFCQ8fPzM88995yZNm2aadCggXFzczOffPKJMcaYQ4cOmffff99IMu3atXNs79VkZmaaiIgI4+7ubmJjY82sWbNMu3btTIMGDYwkM2/ePMfYnTt3Gn9/fxMWFmZeeeUVM2vWLNOyZUtjs9kc6zfGmHnz5hlJ5s477zRt2rQxM2fONCNGjDDu7u6mZ8+eTut/7733jM1mMx06dDAzZ840r7zyiqlcubIJCAgwhw8fdoyLjo42Xl5epmrVqiY6OtrMmTPHvPfee8YYY4YMGWI6depkJk2aZN566y3Tr18/4+7ubnr06OF4/fr16027du2MJMc++et+6d+/vylWrJgZMGCAmTNnjnnmmWeMr6+vufvuu01GRoZj3PPPP28kmU6dOplZs2aZxx9/3ISEhJgyZcqY6Ojoa75/xhhTvnx588QTT5hZs2aZadOmmcaNGxtJZsmSJU7jJJkGDRqYcuXKmRdffNFMnz7dhIaGmuLFi5tff/3VMe7hhx82np6eZvjw4eadd94xr7zyirnvvvvMf/7zH2OMMVu3bjWSzJdfful4TdeuXY2bm5sJDw93tG3atClHHXndJ1f7fu/cudN4enqa8PBw88Ybb5g5c+aYkSNHmpYtW153P+H2QiBCkZKWlmYkma5du+Zp/LZt24wk079/f6f2kSNHGklm5cqVjrZKlSoZSWbp0qVOY7MDUWhoqDl//ryjPSsry1SvXt1ERUWZrKwsR/v58+dNlSpVTLt27Rxtffr0MW5ubmbTpk05asx+7aJFi4wks2rVqhxjsmtbs2aNoy05Odl4eXmZESNGONpefPFF4+vra/bv3+/0+tGjRxt3d3dz7NgxY4wxTz31lLHb7eby5cs5d9r/16BBA9O5c+er9l/N0KFDjSSzdu1aR9uZM2dMlSpVTOXKlU1mZqajXZKJiYm57jIXL15sJJnp06c72jIzM02bNm1yBKK2bduaevXqmQsXLjjasrKyTNOmTU316tUdbdmBKDIy0un9GzZsmHF3dzepqamO2gMCAsyAAQOcakpMTDT+/v5O7dnhe/To0Tm24a+fnWyTJ082NpvNHD161NEWExNjcvt/2bVr1xpJ5oMPPnBqX7p0qVN7cnKy8fT0NJ07d3barmeffdZIylMgurLWjIwMU7duXdOmTRundknG09PTHDx40NG2fft2I8nMnDnT0ebv73/N9zkzM9PY7XYzatQoY8yf71fp0qXNAw88YNzd3c2ZM2eMMcZMmzbNuLm5md9///2G9okxV/9+v/7660aSOX369PV2C25zHDJDkZJ9mMrPzy9P47/++mtJcpp2l6QRI0ZIUo5zjapUqaKoqKhclxUdHe10vsG2bdt04MABPfzww/rtt9/066+/6tdff9W5c+fUtm1brVmzRllZWcrKytJnn32m++67T+Hh4TmWe71DRdnCwsLUokULx/OyZcuqZs2a+vnnnx1tixYtUosWLVSyZElHPb/++qsiIyOVmZmpNWvWSJICAgJ07ty5ax7+CggI0K5du3TgwIE81Zft66+/VuPGjdW8eXNHW4kSJTRw4EAdOXJEu3fvvqHlSdLSpUvl4eGhAQMGONrc3NwUExPjNC4lJUUrV65Uz549debMGcf2//bbb4qKitKBAwd04sQJp9cMHDjQ6T1o0aKFMjMzdfToUUl/Hk5JTU3VQw895LRP3d3d1aRJE61atSpHvYMHD87R9tfPzrlz5/Trr7+qadOmMsboxx9/vO4+WLRokfz9/dWuXTunOho1aqQSJUo46vj222+VkZGhIUOGOG3X0KFDr7uO3Gr9/ffflZaWphYtWjgOcf1VZGSkqlat6nhev3592e12p89lQE
CANmzYoJMnT+a6Pjc3NzVt2tTx+dyzZ49+++03jR49WsYYJSQkSPrzsGPdunUd553ldZ9ky+37nb2szz//PMdhZRQtnFSNIsVut0v685yEvDh69Kjc3NxUrVo1p/bg4GAFBAQ4/uhlq1KlylWXdWVfdlCIjo6+6mvS0tKUkZGh9PR01a1bN081X03FihVztJUsWdLpXIkDBw7op59+UtmyZXNdRnJysiTpiSee0P/+9z917NhRd9xxh9q3b6+ePXs6XfY+ceJEde3aVTVq1FDdunXVoUMHPfroo6pfv/416zx69KiaNGmSo7127dqO/hvdF0ePHlW5cuVUvHhxp/Yr39eDBw/KGKOxY8dq7NixuS4rOTlZd9xxh+P5lfu1ZMmSkuTYr9nvc5s2bXJdXvZnMluxYsVUvnz5HOOOHTumcePG6YsvvshxfktaWlquy/6rAwcOKC0tTYGBgbn2Z7+32Z/p6tWrO/WXLVvWsW3Xs2TJEr300kvatm2b07l2uYX3vHwup06dqujoaFWoUEGNGjVSp06d1KdPH4WGhjrGtGjRQuPHj9cff/yhtWvXqly5crrrrrvUoEEDrV27Vu3atdP333+vnj17Ol6T132SLbfv94MPPqh33nlH/fv31+jRo9W2bVvdf//96tGjBxdNFDEEIhQpdrtdISEh2rlz5w29Lq+zMNe64uTKvuz/m3z11VfVsGHDXF9TokQJpaSk5K3I63B3d8+13RjjVFO7du00atSoXMfWqFFDkhQYGKht27Zp2bJl+uabb/TNN99o3rx56tOnjxYsWCBJatmypQ4dOqTPP/9cy5cv1zvvvKPXX39dc+bMUf/+/fNlm/Jb9nsycuTIq870XRmirrdfs5f5/vvvKzg4OMe4YsWc/5n18vLK8Yc0MzNT7dq1U0pKip555hnVqlVLvr6+OnHihPr27ZunmYmsrCwFBgbqgw8+yLX/aiH4Rq1du1b/+Mc/1LJlS7355psqV66cPDw8NG/ePC1cuDDH+Lx8Lnv27KkWLVro008/1fLly/Xqq6/qlVde0SeffKKOHTtKkpo3b65Lly4pISFBa9eudcyGtmjRQmvXrtXevXt1+vRpp1nSG90nuX2/fXx8tGbNGq1atUpfffWVli5dqo8++kht2rTR8uXLr7p9uP0QiFDkdOnSRXPnzlVCQoIiIiKuObZSpUrKysrSgQMHHDMU0p93SE5NTVWlSpVuuo7swwR2u12RkZFXHVe2bFnZ7fbrhri8hrbr1XT27Nlr1pPN09NT9913n+677z5lZWXpiSee0FtvvaWxY8c6QkOpUqX02GOP6bHHHtPZs2fVsmVLjR8//pqBqFKlStq3b1+O9r179zr6b1SlSpW0atUqnT9/3mmW6ODBg07jsmccPDw88rQP8iL7fQ4MDLzpZe7YsUP79+/XggUL1KdPH0d7bocsr/Y5qFq1qr799ls1a9bsmsE9e/8eOHDAaQbm9OnTOWamcrN48WJ5e3tr2bJl8vLycrTPmzfvuq+9lnLlyumJJ57QE088oeTkZN111116+eWXHYGocePG8vT01Nq1a7V27Vo9/fTTkv4M5m+//bbi4+Mdz7PldZ9cj5ubm9q2bau2bdtq2rRpmjRpkp577jmtWrUq3z5HcD3m+1DkjBo1Sr6+vurfv7+SkpJy9B86dMhx+XinTp0kKcdPQkybNk2S1Llz55uuo1GjRqpatar+9a9/6ezZszn6sy/ddnNzU7du3fTll19q8+bNOcZl/5909r1q/nrp8Y3q2bOnEhIStGzZshx9qampunz5sqQ/L2H/Kzc3N8ehsOxDJFeOKVGihKpVq5bjdgVX6tSpkzZu3Og470P685yZuXPnqnLlygoLC7vh7YqKitKlS5f09ttvO9qysrIcl6hnCwwM1L333qu33npLp06dyrGc3C6nz8u67Xa7Jk2apEuXLt3UMrNnGf46a2KMcbrNQbarfQ569uypzMxMvfjiizlec/nyZc
f4yMhIeXh4aObMmU7ry+vPori7u8tmsykzM9PRduTIEX322Wd5ev2VMjMzcxwSDAwMVEhIiNNnydvbW3fffbf++9//6tixY04zRH/88YdmzJihqlWrOt1qI6/75Fpym8HNnvG93mcdtxdmiFDkVK1aVQsXLtSDDz6o2rVrO92pev369Vq0aJHj95oaNGig6OhozZ07V6mpqWrVqpU2btyoBQsWqFu3bmrduvVN1+Hm5qZ33nlHHTt2VJ06dfTYY4/pjjvu0IkTJ7Rq1SrZ7XZ9+eWXkqRJkyZp+fLlatWqlQYOHKjatWvr1KlTWrRokb7//nsFBASoYcOGcnd31yuvvKK0tDR5eXmpTZs2Vz0/IjdPP/20vvjiC3Xp0kV9+/ZVo0aNdO7cOe3YsUMff/yxjhw5ojJlyqh///5KSUlRmzZtVL58eR09elQzZ85Uw4YNHTNpYWFhuvfee9WoUSOVKlVKmzdv1scff6zY2Nhr1jB69Gj997//VceOHfXkk0+qVKlSWrBggQ4fPqzFixff1HkZ3bp1U+PGjTVixAgdPHhQtWrV0hdffOH4Y/bXWZW4uDg1b95c9erV04ABAxQaGqqkpCQlJCTol19+0fbt229o3Xa7XbNnz9ajjz6qu+66S7169VLZsmV17NgxffXVV2rWrJlmzZp1zWXUqlVLVatW1ciRI3XixAnZ7XYtXrw41xmbRo0aSZKefPJJRUVFyd3dXb169VKrVq00aNAgTZ48Wdu2bVP79u3l4eGhAwcOaNGiRXrjjTfUo0cPlS1bViNHjtTkyZPVpUsXderUST/++KO++eYblSlT5rrb27lzZ02bNk0dOnTQww8/rOTkZMXFxalatWr66aefbmjfSX+e71e+fHn16NFDDRo0UIkSJfTtt99q06ZNeu2115zGtmjRQlOmTJG/v7/q1asn6c/wVLNmTe3bty/H77DldZ9cy8SJE7VmzRp17txZlSpVUnJyst58802VL1/e6cIAFAGuurwNuNX2799vBgwYYCpXrmw8PT2Nn5+fadasmZk5c6bTJdeXLl0yEyZMMFWqVDEeHh6mQoUKZsyYMU5jjPnzstzcLjPPvux+0aJFudbx448/mvvvv9+ULl3aeHl5mUqVKpmePXua+Ph4p3FHjx41ffr0MWXLljVeXl4mNDTUxMTEmIsXLzrGvP322yY0NNS4u7s7XYJ/tdpatWplWrVq5dR25swZM2bMGFOtWjXj6elpypQpY5o2bWr+9a9/Oe7L8vHHH5v27dubwMBA4+npaSpWrGgGDRpkTp065VjOSy+9ZBo3bmwCAgKMj4+PqVWrlnn55Zed7u1yNYcOHTI9evQwAQEBxtvb2zRu3DjHPWyMyftl98YYc/r0afPwww8bPz8/4+/vb/r27WvWrVtnJJkPP/wwx/r79OljgoODjYeHh7njjjtMly5dzMcff+wYk33Z/ZW3Qsh+v6+8/cGqVatMVFSU8ff3N97e3qZq1aqmb9++ZvPmzY4x0dHRxtfXN9f6d+/ebSIjI02JEiVMmTJlzIABAxyXqP/1tgGXL182Q4YMMWXLljU2my3HJfhz5841jRo1Mj4+PsbPz8/Uq1fPjBo1ypw8edIxJjMz00yYMMGUK1fO+Pj4mHvvvdfs3LnTVKpUKU+X3b/77rumevXqxsvLy9SqVcvMmzfPvPDCCzlqudr799f1XLx40Tz99NOmQYMGxs/Pz/j6+poGDRqYN998M8frvvrqKyPJdOzY0am9f//+RpJ59913c603L/vkat+h+Ph407VrVxMSEmI8PT1NSEiIeeihh3LcugK3P5sxf5kzBYAi5LPPPtM///lPff/992rWrJmrywFQiBGIABQJf/zxh9OJs5mZmWrfvr02b96sxMREfpMKwDVxDhGAImHIkCH6448/FBERoYsXL+qTTz7R+vXrNWnSJMIQgOtihghAkbBw4UK99tprOnjwoC5cuKBq1app8ODB1z3JGwAkAhEAAA
D3IQIAACAQAQAAy+Ok6jzIysrSyZMn5efnly8/nwAAAG49Y4zOnDmjkJCQ6970lUCUBydPnlSFChVcXQYAALgJx48fV/ny5a85hkCUB35+fpL+3KF2u93F1QAAgLxIT09XhQoVHH/Hr4VAlAfZh8nsdjuBCACA20xeTnfhpGoAAGB5BCIAAGB5BCIAAGB5BCIAAGB5BCIAAGB5BCIAAGB5BCIAAGB5BCIAAGB5BCIAAGB5BCIAAGB5BCIAAGB5BCIAAGB5BCIAAGB5BCIAAGB5BCIAAGB5xVxdAAq3yqO/cnUJKEBHpnR2dQkA4BLMEAEAAMsjEAEAAMsjEAEAAMtzaSBas2aN7rvvPoWEhMhms+mzzz5z6jfGaNy4cSpXrpx8fHwUGRmpAwcOOI1JSUlR7969ZbfbFRAQoH79+uns2bNOY3766Se1aNFC3t7eqlChgqZOnXqrNw0AANxGXBqIzp07pwYNGiguLi7X/qlTp2rGjBmaM2eONmzYIF9fX0VFRenChQuOMb1799auXbu0YsUKLVmyRGvWrNHAgQMd/enp6Wrfvr0qVaqkLVu26NVXX9X48eM1d+7cW759AADg9mAzxhhXFyFJNptNn376qbp16ybpz9mhkJAQjRgxQiNHjpQkpaWlKSgoSPPnz1evXr20Z88ehYWFadOmTQoPD5ckLV26VJ06ddIvv/yikJAQzZ49W88995wSExPl6ekpSRo9erQ+++wz7d27N0+1paeny9/fX2lpabLb7fm/8YUYV5lZC1eZAShKbuTvd6E9h+jw4cNKTExUZGSko83f319NmjRRQkKCJCkhIUEBAQGOMCRJkZGRcnNz04YNGxxjWrZs6QhDkhQVFaV9+/bp999/z3XdFy9eVHp6utMDAAAUXYU2ECUmJkqSgoKCnNqDgoIcfYmJiQoMDHTqL1asmEqVKuU0Jrdl/HUdV5o8ebL8/f0djwoVKvz9DQIAAIVWoQ1ErjRmzBilpaU5HsePH3d1SQAA4BYqtIEoODhYkpSUlOTUnpSU5OgLDg5WcnKyU//ly5eVkpLiNCa3Zfx1HVfy8vKS3W53egAAgKKr0AaiKlWqKDg4WPHx8Y629PR0bdiwQREREZKkiIgIpaamasuWLY4xK1euVFZWlpo0aeIYs2bNGl26dMkxZsWKFapZs6ZKlixZQFsDAAAKM5cGorNnz2rbtm3atm2bpD9PpN62bZuOHTsmm82moUOH6qWXXtIXX3yhHTt2qE+fPgoJCXFciVa7dm116NBBAwYM0MaNG7Vu3TrFxsaqV69eCgkJkSQ9/PDD8vT0VL9+/bRr1y599NFHeuONNzR8+HAXbTUAAChsXPrjrps3b1br1q0dz7NDSnR0tObPn69Ro0bp3LlzGjhwoFJTU9W8eXMtXbpU3t7ejtd88MEHio2NVdu2beXm5qbu3btrxowZjn5/f38tX75cMTExatSokcqUKaNx48Y53asIAABYW6G5D1Fhxn2IYBXchwhAUVIk7kMEAABQUAhEAADA8ghEAADA8ghEAADA8ghEAADA8ghEAADA8ghEAADA8ghEAADA8ghEAADA8ghEAADA8ghEAADA8ghEAADA8ghEAADA8ghEAADA8ghEAADA8ghEAADA8ghEAADA8ghEAADA8ghEAADA8ghEAADA8ghEAADA8ghEAADA8ghEAADA8ghEAADA8ghEAADA8ghEAADA8ghEAADA8ghEAADA8ghEAADA8ghEAADA8ghEAADA8ghEAADA8ghEAADA8ghEAADA8ghEAADA8ghEAADA8ghEAADA8ghEAADA8ghEAADA8ghEAADA8ghEAADA8ghEAADA8ghEAADA8ghEAADA8ghEAADA8ghEAADA8ghEAADA8ghEAADA8ghEAADA8ghEAADA8ghEAADA8ghEAADA8ghEAADA8ghEAADA8ghEAADA8ghEAADA8ghEAADA8ghEAADA8gp1IMrMzNTYsWNVpUoV+fj4qG
rVqnrxxRdljHGMMcZo3LhxKleunHx8fBQZGakDBw44LSclJUW9e/eW3W5XQECA+vXrp7Nnzxb05gAAgEKqUAeiV155RbNnz9asWbO0Z88evfLKK5o6dapmzpzpGDN16lTNmDFDc+bM0YYNG+Tr66uoqChduHDBMaZ3797atWuXVqxYoSVLlmjNmjUaOHCgKzYJAAAUQjbz1+mWQqZLly4KCgrSu+++62jr3r27fHx89J///EfGGIWEhGjEiBEaOXKkJCktLU1BQUGaP3++evXqpT179igsLEybNm1SeHi4JGnp0qXq1KmTfvnlF4WEhFy3jvT0dPn7+ystLU12u/3WbGwhVXn0V64uAQXoyJTOri4BAPLNjfz9LtQzRE2bNlV8fLz2798vSdq+fbu+//57dezYUZJ0+PBhJSYmKjIy0vEaf39/NWnSRAkJCZKkhIQEBQQEOMKQJEVGRsrNzU0bNmzIdb0XL15Uenq60wMAABRdxVxdwLWMHj1a6enpqlWrltzd3ZWZmamXX35ZvXv3liQlJiZKkoKCgpxeFxQU5OhLTExUYGCgU3+xYsVUqlQpx5grTZ48WRMmTMjvzQEAAIVUoZ4h+t///qcPPvhACxcu1NatW7VgwQL961//0oIFC27peseMGaO0tDTH4/jx47d0fQAAwLUK9QzR008/rdGjR6tXr16SpHr16uno0aOaPHmyoqOjFRwcLElKSkpSuXLlHK9LSkpSw4YNJUnBwcFKTk52Wu7ly5eVkpLieP2VvLy85OXldQu2CAAAFEaFeobo/PnzcnNzLtHd3V1ZWVmSpCpVqig4OFjx8fGO/vT0dG3YsEERERGSpIiICKWmpmrLli2OMStXrlRWVpaaNGlSAFsBAAAKu0I9Q3Tffffp5ZdfVsWKFVWnTh39+OOPmjZtmh5//HFJks1m09ChQ/XSSy+pevXqqlKlisaOHauQkBB169ZNklS7dm116NBBAwYM0Jw5c3Tp0iXFxsaqV69eebrCDAAAFH2FOhDNnDlTY8eO1RNPPKHk5GSFhIRo0KBBGjdunGPMqFGjdO7cOQ0cOFCpqalq3ry5li5dKm9vb8eYDz74QLGxsWrbtq3c3NzUvXt3zZgxwxWbBAAACqFCfR+iwoL7EMEquA8RgKKkyNyHCAAAoCAQiAAAgOURiAAAgOURiAAAgOURiAAAgOURiAAAgOURiAAAgOURiAAAgOURiAAAgOURiAAAgOURiAAAgOURiAAAgOURiAAAgOURiAAAgOURiAAAgOURiAAAgOURiAAAgOURiAAAgOURiAAAgOURiAAAgOURiAAAgOURiAAAgOURiAAAgOURiAAAgOURiAAAgOURiAAAgOURiAAAgOURiAAAgOURiAAAgOURiAAAgOURiAAAgOURiAAAgOURiAAAgOURiAAAgOURiAAAgOURiAAAgOURiAAAgOURiAAAgOURiAAAgOURiAAAgOURiAAAgOURiAAAgOURiAAAgOURiAAAgOURiAAAgOURiAAAgOURiAAAgOURiAAAgOURiAAAgOURiAAAgOURiAAAgOURiAAAgOURiAAAgOURiAAAgOURiAAAgOURiAAAgOURiAAAgOUV+kB04sQJPfLIIypdurR8fHxUr149bd682dFvjNG4ceNUrlw5+fj4KDIyUgcOHHBaRkpKinr37i273a6AgAD169dPZ8+eLehNAQAAhVShDkS///67mjVrJg8PD33zzTfavXu3XnvtNZUsWdIxZurUqZoxY4bmzJmjDRs2yNfXV1FRUbpw4YJjTO/evbVr1y6tWLFCS5Ys0Zo1azRw4EBXbBIAACiEbMYY4+oirmb06NFat26d1q5dm2u/MUYhISEaMWKERo4cKUlKS0tTUFCQ5s+fr169emnPnj0KCwvTpk2bFB4eLklaunSpOnXqpF9++UUhISHXrSM9PV3+/v5KS0uT3W7Pvw28DVQe/ZWrS0ABOjKls6tLAIB8cyN/vwv1DN
EXX3yh8PBwPfDAAwoMDNSdd96pt99+29F/+PBhJSYmKjIy0tHm7++vJk2aKCEhQZKUkJCggIAARxiSpMjISLm5uWnDhg0FtzEAAKDQKtSB6Oeff9bs2bNVvXp1LVu2TIMHD9aTTz6pBQsWSJISExMlSUFBQU6vCwoKcvQlJiYqMDDQqb9YsWIqVaqUY8yVLl68qPT0dKcHAAAouoq5uoBrycrKUnh4uCZNmiRJuvPOO7Vz507NmTNH0dHRt2y9kydP1oQJE27Z8gEAQOFSqGeIypUrp7CwMKe22rVr69ixY5Kk4OBgSVJSUpLTmKSkJEdfcHCwkpOTnfovX76slJQUx5grjRkzRmlpaY7H8ePH82V7AABA4VSoA1GzZs20b98+p7b9+/erUqVKkqQqVaooODhY8fHxjv709HRt2LBBERERkqSIiAilpqZqy5YtjjErV65UVlaWmjRpkut6vby8ZLfbnR4AAKDouqlAFBoaqt9++y1He2pqqkJDQ/92UdmGDRumH374QZMmTdLBgwe1cOFCzZ07VzExMZIkm82moUOH6qWXXtIXX3yhHTt2qE+fPgoJCVG3bt0k/Tmj1KFDBw0YMEAbN27UunXrFBsbq169euXpCjMAAFD03dQ5REeOHFFmZmaO9osXL+rEiRN/u6hsd999tz799FONGTNGEydOVJUqVTR9+nT17t3bMWbUqFE6d+6cBg4cqNTUVDVv3lxLly6Vt7e3Y8wHH3yg2NhYtW3bVm5uburevbtmzJiRb3UCAIDb2w3dh+iLL76QJHXr1k0LFiyQv7+/oy8zM1Px8fFasWJFjsNctzvuQwSr4D5EAIqSG/n7fUMzRNmHoWw2W46rvDw8PFS5cmW99tprN1YtAACAi91QIMrKypL058nMmzZtUpkyZW5JUQAAAAXpps4hOnz4cH7XAQAA4DI3fWPG+Ph4xcfHKzk52TFzlO3f//733y4MAACgoNxUIJowYYImTpyo8PBwlStXTjabLb/rAgAAKDA3FYjmzJmj+fPn69FHH83vegAAAArcTd2YMSMjQ02bNs3vWgAAAFzipgJR//79tXDhwvyuBQAAwCVu6pDZhQsXNHfuXH377beqX7++PDw8nPqnTZuWL8UBAAAUhJsKRD/99JMaNmwoSdq5c6dTHydYAwCA281NBaJVq1bldx0AAAAuc1PnEAEAABQlNzVD1Lp162seGlu5cuVNFwQAAFDQbioQZZ8/lO3SpUvatm2bdu7cmeNHXwEAAAq7mwpEr7/+eq7t48eP19mzZ/9WQQAAAAUtX88heuSRR/gdMwAAcNvJ10CUkJAgb2/v/FwkAADALXdTh8zuv/9+p+fGGJ06dUqbN2/W2LFj86UwAACAgnJTgcjf39/puZubm2rWrKmJEyeqffv2+VIYAABAQbmpQDRv3rz8rgMAAMBlbioQZduyZYv27NkjSapTp47uvPPOfCkKAACgIN1UIEpOTlavXr20evVqBQQESJJSU1PVunVrffjhhypbtmx+1ggAAHBL3dRVZkOGDNGZM2e0a9cupaSkKCUlRTt37lR6erqefPLJ/K4RAADglrqpGaKlS5fq22+/Ve3atR1tYWFhiouL46RqAABw27mpGaKsrCx5eHjkaPfw8FBWVtbfLgoAAKAg3VQgatOmjZ566imdPHnS0XbixAkNGzZMbdu2zbfiAAAACsJNBaJZs2YpPT1dlStXVtWqVVW1alVVqVJF6enpmjlzZn7XCAAAcEvd1DlEFSpU0NatW/Xtt99q7969kqTatWsrMjIyX4sDAAAoCDc0Q7Ry5UqFhYUpPT1dNptN7dq105AhQzRkyBDdfffdqlOnjtauXXuragUAALglbigQTZ8+XQMGDJDdbs/R5+/vr0GDBmnatGn5VhwAAEBBuKFAtH37dnXo0OGq/e3bt9eWLVv+dlEAAAAF6YYCUVJSUq6X22crVqyYTp8+/beLAgAAKEg3FIjuuOMO7dy586r9P/
30k8qVK/e3iwIAAChINxSIOnXqpLFjx+rChQs5+v744w+98MIL6tKlS74VBwAAUBBu6LL7559/Xp988olq1Kih2NhY1axZU5K0d+9excXFKTMzU88999wtKRQAAOBWuaFAFBQUpPXr12vw4MEaM2aMjDGSJJvNpqioKMXFxSkoKOiWFAoAAHCr3PCNGStVqqSvv/5av//+uw4ePChjjKpXr66SJUveivoAAABuuZu6U7UklSxZUnfffXd+1gIAAOASN/VbZgAAAEUJgQgAAFgegQgAAFgegQgAAFgegQgAAFgegQgAAFgegQgAAFgegQgAAFgegQgAAFgegQgAAFgegQgAAFgegQgAAFgegQgAAFgegQgAAFgegQgAAFgegQgAAFgegQgAAFgegQgAAFgegQgAAFgegQgAAFjebRWIpkyZIpvNpqFDhzraLly4oJiYGJUuXVolSpRQ9+7dlZSU5PS6Y8eOqXPnzipevLgCAwP19NNP6/LlywVcPQAAKKxum0C0adMmvfXWW6pfv75T+7Bhw/Tll19q0aJF+u6773Ty5Endf//9jv7MzEx17txZGRkZWr9+vRYsWKD58+dr3LhxBb0JAACgkLotAtHZs2fVu3dvvf322ypZsqSjPS0tTe+++66mTZumNm3aqFGjRpo3b57Wr1+vH374QZK0fPly7d69W//5z3/UsGFDdezYUS+++KLi4uKUkZHhqk0CAACFyG0RiGJiYtS5c2dFRkY6tW/ZskWXLl1yaq9Vq5YqVqyohIQESVJCQoLq1aunoKAgx5ioqCilp6dr165dua7v4sWLSk9Pd3oAAICiq5irC7ieDz/8UFu3btWmTZty9CUmJsrT01MBAQFO7UFBQUpMTHSM+WsYyu7P7svN5MmTNWHChHyoHgAA3A4K9QzR8ePH9dRTT+mDDz6Qt7d3ga13zJgxSktLczyOHz9eYOsGAAAFr1AHoi1btig5OVl33XWXihUrpmLFium7777TjBkzVKxYMQUFBSkjI0OpqalOr0tKSlJwcLAkKTg4OMdVZ9nPs8dcycvLS3a73ekBAACKrkIdiNq2basdO3Zo27Ztjkd4eLh69+7t+G8PDw/Fx8c7XrNv3z4dO3ZMERERkqSIiAjt2LFDycnJjjErVqyQ3W5XWFhYgW8TAAAofAr1OUR+fn6qW7euU5uvr69Kly7taO/Xr5+GDx+uUqVKyW63a8iQIYqIiNA999wjSWrfvr3CwsL06KOPaurUqUpMTNTzzz+vmJgYeXl5Ffg2AQCAwqdQB6K8eP311+Xm5qbu3bvr4sWLioqK0ptvvunod3d315IlSzR48GBFRETI19dX0dHRmjhxogurBgAAhYnNGGNcXURhl56eLn9/f6WlpVnufKLKo79ydQkoQEemdHZ1CQCQb27k73ehPocIAACgIBCIAACA5RGIAACA5RGIAACA5RGIAACA5RGIAACA5RGIAACA5RGIAACA5RGIAACA5RGIAACA5RGIAACA5RGIAACA5RGIAACA5RGIAACA5RGIAACA5RGIAACA5RGIAACA5RGIAACA5RGIAACA5RGIAACA5RGIAACA5RGIAACA5RGIAACA5RGIAACA5RGIAACA5RGIAACA5RGIAACA5RGIAACA5RGIAACA5RGIAACA5RGIAACA5RGIAACA5RGIAACA5RGIAACA5RGIAACA5RGIAACA5RGIAACA5RGIAACA5RGIAACA5RGIAACA5RGIAACA5RGIAACA5RGIAACA5RGIAACA5RGIAACA5RGIAACA5RGIAACA5RGIAACA5RGIAACA5RGIAACA5RGIAACA5RGIAACA5RGIAACA5RGIAACA5RGIAACA5RGIAACA5RGIAACA5RGIAACA5RXqQDR58mTdfffd8vPzU2BgoLp166Z9+/Y5jblw4YJiYmJUunRplShRQt27d1dSUpLTmGPHjqlz584qXry4AgMD9fTTT+vy5csFuSkAAKAQK9SB6L
vvvlNMTIx++OEHrVixQpcuXVL79u117tw5x5hhw4bpyy+/1KJFi/Tdd9/p5MmTuv/++x39mZmZ6ty5szIyMrR+/XotWLBA8+fP17hx41yxSQAAoBCyGWOMq4vIq9OnTyswMFDfffedWrZsqbS0NJUtW1YLFy5Ujx49JEl79+5V7dq1lZCQoHvuuUfffPONunTpopMnTyooKEiSNGfOHD3zzDM6ffq0PD09r7ve9PR0+fv7Ky0tTXa7/ZZuY2FTefRXri4BBejIlM6uLgEA8s2N/P0u1DNEV0pLS5MklSpVSpK0ZcsWXbp0SZGRkY4xtWrVUsWKFZWQkCBJSkhIUL169RxhSJKioqKUnp6uXbt25bqeixcvKj093ekBAACKrtsmEGVlZWno0KFq1qyZ6tatK0lKTEyUp6enAgICnMYGBQUpMTHRMeavYSi7P7svN5MnT5a/v7/jUaFChXzeGgAAUJjcNoEoJiZGO3fu1IcffnjL1zVmzBilpaU5HsePH7/l6wQAAK5TzNUF5EVsbKyWLFmiNWvWqHz58o724OBgZWRkKDU11WmWKCkpScHBwY4xGzdudFpe9lVo2WOu5OXlJS8vr3zeCgAAUFgV6hkiY4xiY2P16aefauXKlapSpYpTf6NGjeTh4aH4+HhH2759+3Ts2DFFRERIkiIiIrRjxw4lJyc7xqxYsUJ2u11hYWEFsyEAAKBQK9QzRDExMVq4cKE+//xz+fn5Oc758ff3l4+Pj/z9/dWvXz8NHz5cpUqVkt1u15AhQxQREaF77rlHktS+fXuFhYXp0Ucf1dSpU5WYmKjnn39eMTExzAIBAABJhTwQzZ49W5J07733OrXPmzdPffv2lSS9/vrrcnNzU/fu3XXx4kVFRUXpzTffdIx1d3fXkiVLNHjwYEVERMjX11fR0dGaOHFiQW0GAAAo5G6r+xC5CvchglVwHyIARUmRvQ8RAADArUAgAgAAlkcgAgAAlkcgAgAAlkcgAgAAlkcgAgAAlkcgAgAAlkcgAgAAlkcgAgAAlkcgAgAAlkcgAgAAlkcgAgAAlkcgAgAAlkcgAgAAlkcgAgAAlkcgAgAAlkcgAgAAlkcgAgAAlkcgAgAAlkcgAgAAlkcgAgAAlkcgAgAAlkcgAgAAlkcgAgAAlkcgAgAAlkcgAgAAlkcgAgAAlkcgAgAAlkcgAgAAlkcgAgAAllfM1QUAAFyj8uivXF0CCtCRKZ1dXUKhxgwRAACwPAIRAACwPAIRAACwPAIRAACwPAIRAACwPAIRAACwPAIRAACwPAIRAACwPAIRAACwPAIRAACwPAIRAACwPAIRAACwPAIRAACwPAIRAACwPAIRAACwPAIRAACwPAIRAACwPAIRAACwPAIRAACwPAIRAACwPAIRAACwPAIRAACwPAIRAACwPAIRAACwPAIRAACwPAIRAACwPEsFori4OFWuXFne3t5q0qSJNm7c6OqSAABAIWCZQPTRRx9p+PDheuGFF7R161Y1aNBAUVFRSk5OdnVpAADAxSwTiKZNm6YBAwboscceU1hYmObMmaPixYvr3//+t6tLAwAALmaJQJSRkaEtW7YoMjLS0ebm5qbIyEglJCS4sDIAAFAYFHN1AQXh119/VWZmpoKCgpzag4KCtHfv3hzjL168qIsXLzqep6WlSZLS09NvbaGFUNbF864uAQXIip9xK+P7bS1W/H5nb7Mx5rpjLRGIbtTkyZM1YcKEHO0VKlRwQTVAwfGf7uoKANwqVv5+nzlzRv7+/tccY4lAVKZMGbm7uyspKcmpPSkpScHBwTnGjxkzRsOHD3c8z8rKUkpKikqXLi2bzXbL64Vrpaenq0KFCjp+/LjsdrurywGQj/h+W4sxRmfOnFFISMh1x1oiEHl6eqpRo0aKj49Xt27dJP0ZcuLj4xUbG5tjvJeXl7y8vJzaAgICCqBSFCZ2u51/MIEiiu+3dVxvZiibJQKRJA0fPlzR0dEKDw9X48aNNX36dJ
07d06PPfaYq0sDAAAuZplA9OCDD+r06dMaN26cEhMT1bBhQy1dujTHidYAAMB6LBOIJCk2NjbXQ2TAX3l5eemFF17IcdgUwO2P7zeuxmbyci0aAABAEWaJGzMCAABcC4EIAABYHoEIAABYHoEIAABYHoEIkLR27Vo98sgjioiI0IkTJyRJ77//vr7//nsXVwYAKAgEIlje4sWLFRUVJR8fH/3444+OH/ZNS0vTpEmTXFwdAKAgEIhgeS+99JLmzJmjt99+Wx4eHo72Zs2aaevWrS6sDEB+y8jI0L59+3T58mVXl4JChkAEy9u3b59atmyZo93f31+pqakFXxCAfHf+/Hn169dPxYsXV506dXTs2DFJ0pAhQzRlyhQXV4fCgEAEywsODtbBgwdztH///fcKDQ11QUUA8tuYMWO0fft2rV69Wt7e3o72yMhIffTRRy6sDIUFgQiWN2DAAD311FPasGGDbDabTp48qQ8++EAjR47U4MGDXV0egHzw2WefadasWWrevLlsNpujvU6dOjp06JALK0NhYanfMgNyM3r0aGVlZalt27Y6f/68WrZsKS8vL40cOVJDhgxxdXkA8sHp06cVGBiYo/3cuXNOAQnWxQwRLM9ms+m5555TSkqKdu7cqR9++EGnT5/Wiy++6OrSAOST8PBwffXVV47n2SHonXfeUUREhKvKQiHCDBHw/3l6eiosLMzVZQC4BSZNmqSOHTtq9+7dunz5st544w3t3r1b69ev13fffefq8lAI8Gv3sLzWrVtfc8p85cqVBVgNgFvl0KFDmjJlirZv366zZ8/qrrvu0jPPPKN69eq5ujQUAswQwfIaNmzo9PzSpUvatm2bdu7cqejoaNcUBSDfVa1aVW+//bary0AhRSCC5b3++uu5to8fP15nz54t4GoA3Apbt26Vh4eHYzbo888/17x58xQWFqbx48fL09PTxRXC1TipGriKRx55RP/+979dXQaAfDBo0CDt379fkvTzzz/rwQcfVPHixbVo0SKNGjXKxdWhMCAQAVeRkJDgdAM3ALev/fv3Ow6PL1q0SK1atdLChQs1f/58LV682LXFoVDgkBks7/7773d6bozRqVOntHnzZo0dO9ZFVQHIT8YYZWVlSZK+/fZbdenSRZJUoUIF/frrr64sDYUEgQiW5+/v7/Tczc1NNWvW1MSJE9W+fXsXVQUgP4WHh+ull15SZGSkvvvuO82ePVuSdPjwYQUFBbm4OhQGBCJYWmZmph577DHVq1dPJUuWdHU5AG6R6dOnq3fv3vrss8/03HPPqVq1apKkjz/+WE2bNnVxdSgMuA8RLM/b21t79uxRlSpVXF0KgAJ24cIFubu7y8PDw9WlwMU4qRqWV7duXf3888+uLgOAC3h7exOGIIkZIkBLly7VmDFj9OKLL6pRo0by9fV16rfb7S6qDMDfUbJkyTz/cGtKSsotrgaFHYEIljVx4kSNGDFCfn5+jra//uNpjJHNZlNmZqYrygPwNy1YsCDPY7krPQhEsCx3d3edOnVKe/bsuea4Vq1aFVBFAABXIRDBstzc3JSYmKjAwEBXlwKgAF24cEEZGRlObRwaBydVw9Lyen4BgNvbuXPnFBsbq8DAQPn6+qpkyZJOD4D7EMHSatSocd1QxMmWwO1v1KhRWrVqlWbPnq1HH31UcXFxOnHihN566y1NmTLF1eWhEOCQGSzLzc1N06dPz3Gn6itxsiVw+6tYsaLee+893XvvvbLb7dq6dauqVaum999/X//973/19ddfu7pEuBgzRLC0Xr16cQ4RYAEpKSkKDQ2V9Of5Qtkzv82bN9fgwYNdWRoKCc4hgmVx/hBgHaGhoTp8+LAkqVatWvrf//4nSfryyy8VEBDgwspQWBCIYFkcLQaKvp9//llZWVl67LHHtH37dknS6NGjFRcXJ29vbw0bNkxPP/20i6tEYcA5RACAIiv7fmPZh8YffPBBzZgxQxcuXNCWLVtUrVo11a9f38
VVojAgEAEAiqwr7zfm5+en7du3O84nArJxyAwAAFgegQgAUGTZbLYcF1BwQQVyw2X3AIAiyxijvn37ysvLS9KfP9vxf//3f/L19XUa98knn7iiPBQiBCIAQJF15Y1VH3nkERdVgsKOk6oBAIDlcQ4RAACwPAIRAACwPAIRAACwPAIRAACwPAIRgAKTmJioIUOGKDQ0VF5eXqpQoYLuu+8+xcfHu7q0HObPn8+PfgIWwmX3AArEkSNH1KxZMwUEBOjVV19VvXr1dOnSJS1btkwxMTHau3fvDS8zIyNDnp6eOdovXbokDw+P/CgbgEUwQwSgQDzxxBOy2WzauHGjunfvrho1aqhOnToaPny4fvjhB0nSsWPH1LVrV5UoUUJ2u109e/ZUUlKSYxnjx49Xw4YN9c4776hKlSry9vaW9Oedh2fPnq1//OMf8vX11csvvyxJ+vzzz3XXXXfJ29tboaGhmjBhgi5fvuxYXmpqqgYNGqSgoCB5e3urbt26WrJkiVavXq3HHntMaWlpjjsdjx8/XpJUuXJlTZo0SY8//rj8/PxUsWJFzZ0712lbjx8/rp49eyogIEClSpVS165ddeTIEUf/6tWr1bhxY/n6+iogIEDNmjXT0aNHJUnbt29X69at5efnJ7vdrkaNGmnz5s35/n4AcEYgAnDLpaSkaOnSpYqJiclxh2BJCggIUFZWlrp27aqUlBR99913WrFihX7++Wc9+OCDTmMPHjyoxYsX65NPPtG2bdsc7ePHj9c///lP7dixQ48//rjWrl2rPn366KmnntLu3bv11ltvaf78+Y6wlJWVpY4dO2rdunX6z3/+o927d2vKlClyd3dX06ZNNX36dNntdp06dUqnTp3SyJEjHet67bXXFB4erh9//FFPPPGEBg8erH379kn6c3YqKipKfn5+Wrt2rdatW6cSJUqoQ4cOysjI0OXLl9WtWze1atVKP/30kxISEjRw4EDHz0n07t1b5cuX16ZNm7RlyxaNHj2a2S6gIBgAuMU2bNhgJJlPPvnkqmOWL19u3N3dzbFjxxxtu3btMpLMxo0bjTHGvPDCC8bDw8MkJyc7vVaSGTp0qFNb27ZtzaRJk5za3n//fVOuXDljjDHLli0zbm5uZt++fbnWM2/ePOPv75+jvVKlSuaRRx5xPM/KyjKBgYFm9uzZjnXUrFnTZGVlOcZcvHjR+Pj4mGXLlpnffvvNSDKrV6/Odb1+fn5m/vz5ufYBuHWYIQJwy5k83BB/z549qlChgipUqOBoCwsLU0BAgPbs2eNoq1SpksqWLZvj9eHh4U7Pt2/frokTJ6pEiRKOx4ABA3Tq1CmdP39e27ZtU/ny5VWjRo0b3p769es7/ttmsyk4OFjJycmO9R48eFB+fn6O9ZYqVUoXLlzQoUOHVKpUKfXt21dRUVG677779MYbb+jUqVOO5Q0fPlz9+/dXZGSkpkyZokOHDt1wfQBuHIEIwC1XvXp12Wy2mzpx+kq5HXLLrf3s2bOaMGGCtm3b5njs2LFDBw4ckLe3t3x8fG66hisPYdlsNmVlZTnW26hRI6f1btu2Tfv379fDDz8sSZo3b54SEhLUtGlTffTRR6pRo4bjPKrx48dr165d6ty5s1auXKmwsDB9+umnN10rgLwhEAG45UqVKqWoqCjFxcXp3LlzOfpTU1NVu3ZtHT9+XMePH3e07969W6mpqQoLC7vhdd51113at2+fqlWrluPh5uam+vXr65dfftH+/ftzfb2np6cyMzNvar0HDhxQYGBgjvX6+/s7xt15550aM2aM1q9fr7p162rhwoWOvho1amjYsGFavny57r//fs2bN++G6wBwYwhEAApEXFycMjMz1bhxYy1evFgHDhzQnj17NGPGDEVERCgyMlL16tVT7969tXXrVm3cuFF9+vRRq1atchwOy4tx48bpvffe04QJE7Rr1y7t2bNHH374oZ5//nlJUqtWrdSyZUt1795dK1as0OHDh/XNN9
9o6dKlkv68muzs2bOKj4/Xr7/+qvPnz+dpvb1791aZMmXUtWtXrV27VocPH9bq1av15JNP6pdfftHhw4c1ZswYJSQk6OjRo1q+fLkOHDig2rVr648//lBsbKxWr16to0ePat26ddq0aZNq1659w9sP4MYQiAAUiNDQUG3dulWtW7fWiBEjVLduXbVr107x8fGaPXu2bDabPv/8c5UsWVItW7ZUZGSkQkND9dFHH93U+qKiorRkyRItX75cd999t+655x69/vrrqlSpkmPM4sWLdffdd+uhhx5SWFiYRo0a5ZgVatq0qf7v//5PDz74oMqWLaupU6fmab3FixfXmjVrVLFiRd1///2qXbu2+vXrpwsXLshut6t48eLau3ev49YDAwcOVExMjAYNGiR3d3f99ttv6tOnj2rUqKGePXuqY8eOmjBhwk3tAwB5ZzN5OdsRAACgCGOGCAAAWB6BCAAAWB6BCAAAWB6BCAAAWB6BCAAAWB6BCAAAWB6BCAAAWB6BCAAAWB6BCAAAWB6BCAAAWB6BCAAAWB6BCAAAWN7/A6EcqXbvVlAVAAAAAElFTkSuQmCC\n",
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Process the log events generated by oaieval\n",
"events = \"./jcqa_gpt_output/gpt-3.5-turbo_match_jcqa-1shot.jsonl\"\n",
"\n",
"# The log may contain non-ASCII (Japanese) text, so decode it as UTF-8 explicitly\n",
"# instead of relying on the platform's default text encoding.\n",
"with open(events, \"r\", encoding=\"utf-8\") as f:\n",
"    events_df = pd.read_json(f, lines=True)\n",
"\n",
"# Bookkeeping columns that are no longer needed once an event's payload is flattened\n",
"_BOOKKEEPING_COLS = [\n",
"    \"spec\",\n",
"    \"run_id\",\n",
"    \"event_id\",\n",
"    \"type\",\n",
"    \"created_by\",\n",
"    \"created_at\",\n",
"    \"data\",\n",
"    \"final_report\",\n",
"]\n",
"\n",
"\n",
"def _events_of_type(events_df, event_type, extra_drop=()):\n",
"    \"\"\"Return the rows of one event type with their `data` payload flattened.\n",
"\n",
"    Args:\n",
"        events_df: Frame with one row per line of the log file.\n",
"        event_type: Value of the `type` column to keep.\n",
"        extra_drop: Additional column names to drop after flattening.\n",
"\n",
"    Returns:\n",
"        A new frame, re-indexed from 0, with one column per key of `data`.\n",
"    \"\"\"\n",
"    selected = events_df[events_df.type == event_type].reset_index(drop=True)\n",
"    # The index was reset above so the rows line up with json_normalize's 0..n-1 index\n",
"    flattened = selected.join(pd.json_normalize(selected.data))\n",
"    return flattened.drop(columns=[*_BOOKKEEPING_COLS, *extra_drop])\n",
"\n",
"\n",
"# There are two types of events in the events df:\n",
"\n",
"# sampling events; their `sampled` column is dropped so that the merged frame's\n",
"# `sampled` column comes from the match events only\n",
"sampling_df = _events_of_type(events_df, \"sampling\", extra_drop=[\"sampled\"])\n",
"\n",
"# match events\n",
"match_df = _events_of_type(events_df, \"match\")\n",
"\n",
"# Merge them for convenience\n",
"df = sampling_df.merge(match_df, on=\"sample_id\")\n",
"\n",
"df[\"correct\"].value_counts().plot.bar(\n",
"    title=\"Correctness of generated answers\", xlabel=\"Correctness\", ylabel=\"Count\"\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "1b3fd6f4-e300-48f0-8cb6-b4a95bb3a671",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Accuracy: 0.916\n"
]
}
],
"source": [
"# Share of merged samples whose answer was marked correct\n",
"print(\"Accuracy: {:.3f}\".format(len(df.loc[df[\"correct\"]]) / len(df)))"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "43d934e2-0e0a-428c-b2f0-787e0e125d78",
"metadata": {},
"outputs": [],
"source": [
"# To estimate GPT4 results without API access we check by hand the questions that\n",
"# the gpt-3.5-turbo run above got wrong.\n",
"# So generate a GPT4 results csv file here which we will fill in later.\n",
"gpt4_todo_csv = \"./jcqa_gpt_output/gpt-4-test.csv\"\n",
"\n",
"failed_df = df[~df[\"correct\"]]\n",
"gpt4_df = pd.DataFrame(\n",
"    {\n",
"        \"sample_id\": failed_df[\"sample_id\"],\n",
"        # Flatten the chat messages into one text block that can be pasted by hand\n",
"        \"prompt\": failed_df[\"prompt\"].map(\n",
"            lambda messages: \"\\n\".join(m[\"content\"] for m in messages)\n",
"        ),\n",
"        # Only the first entry of `expected` is kept\n",
"        \"expected\": failed_df[\"expected\"].map(lambda answers: answers[0]),\n",
"        \"gpt-3\": failed_df[\"sampled\"],\n",
"        # -1 marks a question that has not been answered by hand yet\n",
"        \"gpt-4\": -1,\n",
"    }\n",
")\n",
"\n",
"# Re-running this cell must not wipe answers that were already entered by hand,\n",
"# so the file is only (re)written when it is missing or lists different questions.\n",
"keep_existing = False\n",
"if os.path.exists(gpt4_todo_csv):\n",
"    existing_ids = pd.read_csv(gpt4_todo_csv)[\"sample_id\"].tolist()\n",
"    keep_existing = existing_ids == gpt4_df[\"sample_id\"].tolist()\n",
"\n",
"if keep_existing:\n",
"    print(f\"Keeping existing {gpt4_todo_csv}; delete it to start over\")\n",
"else:\n",
"    gpt4_df.to_csv(gpt4_todo_csv, index=False)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "bb9aa100-f9b2-4c63-8da3-17674ff3a5ab",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# Now loop through this df, copy+paste by hand the prompt into chat-gpt-4, and input answer\n",
"gpt4_todo_csv = \"./jcqa_gpt_output/gpt-4-test.csv\"\n",
"\n",
"# Read the answer column as text so typed answers can be stored without a dtype clash\n",
"input_df = pd.read_csv(gpt4_todo_csv, dtype={\"gpt-4\": str})\n",
"while True:\n",
"    # Rows still holding the -1 placeholder have not been answered yet\n",
"    unfilled = input_df[input_df[\"gpt-4\"] == \"-1\"]\n",
"    if len(unfilled) == 0:\n",
"        break\n",
"    print(unfilled[\"prompt\"].iloc[0])\n",
"    result = input().strip()\n",
"    if not result:\n",
"        # An empty answer would be saved as a missing value; ask again instead\n",
"        continue\n",
"    # Address the cell by row label and column name rather than by position\n",
"    input_df.loc[unfilled.index[0], \"gpt-4\"] = result\n",
"    # Save after every answer so progress survives an interrupted session\n",
"    input_df.to_csv(gpt4_todo_csv, index=False)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "23e6e683-da68-4e7c-b11a-ae07107ef2d9",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Estimated GPT4 accuracy 0.982\n"
]
}
],
"source": [
"## This code now computes the estimated accuracy of GPT-4\n",
"# Questions absent from the csv (those already answered correctly by the first\n",
"# model) are counted as correct for GPT-4 as well, hence \"estimated\".\n",
"n_tot = len(df)\n",
"\n",
"# Use the finalized answers file when present, otherwise the worksheet written\n",
"# by the earlier cells of this notebook\n",
"gpt4_csv = \"./jcqa_gpt_output/gpt-4.csv\"\n",
"if not os.path.exists(gpt4_csv):\n",
"    gpt4_csv = \"./jcqa_gpt_output/gpt-4-test.csv\"\n",
"\n",
"# Read both answer columns as text: comparing a text column against a numeric\n",
"# one would flag every row as wrong\n",
"gpt4df = pd.read_csv(gpt4_csv, dtype={\"expected\": str, \"gpt-4\": str})\n",
"is_wrong = gpt4df[\"expected\"].str.strip() != gpt4df[\"gpt-4\"].str.strip()\n",
"n_wrong = int(is_wrong.sum())\n",
"print(f\"Estimated GPT4 accuracy: {(n_tot-n_wrong)/n_tot:.3f}\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "python3.10.8",
"language": "python",
"name": "python3.10.8"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.8"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment