Skip to content

Instantly share code, notes, and snippets.

@gauss5930
Last active April 25, 2025 01:47
Show Gist options
  • Save gauss5930/2e7b1940a356cc696170b44c38fdd82c to your computer and use it in GitHub Desktop.
Save gauss5930/2e7b1940a356cc696170b44c38fdd82c to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## HyperCLOVAX-SEED-Text-Instruct-1.5B 평가"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"# Ensure V1 is enabled by default; to disable set VLLM_USE_V1=0 in your env\n",
"os.environ.setdefault(\"VLLM_USE_V1\", \"1\")\n",
"\n",
"print(\"\"\"\n",
"V1 is now enabled by default for all supported use cases, and we will gradually enable it\n",
"for every use case we plan to support. Please share any feedback on GitHub or in the vLLM Slack.\n",
"\n",
"To disable V1, please set the environment variable as: VLLM_USE_V1=0, and send us a GitHub issue sharing the reason!\n",
"\"\"\".strip())\n",
"\n",
"from vllm import LLM, SamplingParams\n",
"import re\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"from matplotlib.colors import ListedColormap\n",
"\n",
"# Initialize model\n",
"llm = LLM(model=\"naver-hyperclovax/HyperCLOVAX-SEED-Text-Instruct-1.5B\")\n",
"\n",
"# Deterministic sampling\n",
"sampling_params = SamplingParams(temperature=0.0, top_p=1.0, max_tokens=10) # max_tokens 조정: [10, 20, 40, 80]\n",
"\n",
"# Use numbers 1–100\n",
"digits = list(range(1, 101))\n",
"pairs = [(a, b) for a in digits for b in digits]\n",
"\n",
"# Build prompts\n",
"prompts_en = [f\"What is {a} × {b}? Please answer in \\\\boxed{{}} format.\" for a, b in pairs]\n",
"prompts_ko = [f\"{a} 곱하기 {b} 는 무엇입니까? \\\\boxed{{}} 형식으로 대답해주세요.\" for a, b in pairs]\n",
"\n",
"def build_inputs(prompts):\n",
" tok = llm.get_tokenizer()\n",
" system_msg = {\n",
" \"role\": \"system\",\n",
" \"content\": \"- AI 언어모델의 이름은 \\\"CLOVA X\\\" 이며 네이버에서 만들었다.\\n\"\n",
" \"- 오늘은 2025년 04월 24일(목)이다.\"\n",
" }\n",
" inputs = []\n",
" for p in prompts:\n",
" chat = [\n",
" {\"role\": \"tool_list\", \"content\": \"\"},\n",
" system_msg,\n",
" {\"role\": \"user\", \"content\": p},\n",
" ]\n",
" inputs.append(tok.apply_chat_template(chat, add_generation_prompt=True, tokenize=False))\n",
" return inputs\n",
"\n",
"# Generate\n",
"outputs_en = llm.generate(build_inputs(prompts_en), sampling_params)\n",
"outputs_ko = llm.generate(build_inputs(prompts_ko), sampling_params)\n",
"\n",
"# Compute accuracy matrices\n",
"def compute_acc(outputs):\n",
" acc = np.zeros((100, 100), dtype=int)\n",
" for idx, out in enumerate(outputs):\n",
" a, b = pairs[idx]\n",
" m = re.search(r'\\\\boxed\\{(\\d+)\\}', out.outputs[0].text)\n",
" if m and int(m.group(1)) == a * b:\n",
" acc[digits.index(a), digits.index(b)] = 1\n",
" return acc\n",
"\n",
"acc_en = compute_acc(outputs_en)\n",
"acc_ko = compute_acc(outputs_ko)\n",
"\n",
"# Print overall results\n",
"total = acc_en.size\n",
"correct_en = acc_en.sum()\n",
"correct_ko = acc_ko.sum()\n",
"print(f\"English prompts: {correct_en}/{total} correct ({correct_en/total:.2%})\")\n",
"print(f\"Korean prompts: {correct_ko}/{total} correct ({correct_ko/total:.2%})\")\n",
"\n",
"# Plot heatmaps without per-cell numbers, just green=1/red=0\n",
"cmap = ListedColormap(['red', 'green'])\n",
"fig, axes = plt.subplots(1, 2, figsize=(14, 6))\n",
"\n",
"# Show ticks only every 10th to reduce clutter\n",
"ticks = list(range(0, 100, 10))\n",
"labels = [digits[i] for i in ticks]\n",
"\n",
"for ax, acc, title in zip(axes, [acc_en, acc_ko], [\"English Prompt\", \"Korean Prompt\"]):\n",
" ax.imshow(acc, aspect='equal', cmap=cmap, vmin=0, vmax=1)\n",
" ax.set_xticks(ticks)\n",
" ax.set_xticklabels(labels)\n",
" ax.set_yticks(ticks)\n",
" ax.set_yticklabels(labels)\n",
" ax.set_xlabel('Multiplier (b)')\n",
" ax.set_ylabel('Multiplicand (a)')\n",
" ax.set_title(title)\n",
"\n",
"plt.tight_layout()\n",
"plt.show()\n",
"\n",
"del llm"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Qwen2.5-1.5B-Instruct 평가"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"# Ensure V1 is enabled by default; to disable set VLLM_USE_V1=0 in your env\n",
"os.environ.setdefault(\"VLLM_USE_V1\", \"1\")\n",
"\n",
"print(\"\"\"\n",
"V1 is now enabled by default for all supported use cases, and we will gradually enable it\n",
"for every use case we plan to support. Please share any feedback on GitHub or in the vLLM Slack.\n",
"\n",
"To disable V1, please set the environment variable as: VLLM_USE_V1=0, and send us a GitHub issue sharing the reason!\n",
"\"\"\".strip())\n",
"\n",
"from vllm import LLM, SamplingParams\n",
"import re\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"from matplotlib.colors import ListedColormap\n",
"\n",
"# Initialize model\n",
"llm = LLM(model=\"Qwen/Qwen2.5-1.5B-Instruct\")\n",
"\n",
"# Deterministic sampling\n",
"sampling_params = SamplingParams(temperature=0.0, top_p=1.0, max_tokens=20) # max_tokens 조정: [10, 20, 40, 80]\n",
"\n",
"# Use numbers 1–100\n",
"digits = list(range(1, 101))\n",
"pairs = [(a, b) for a in digits for b in digits]\n",
"\n",
"# Build prompts\n",
"prompts_en = [f\"What is {a} × {b}? Please answer in \\\\boxed{{}} format.\" for a, b in pairs]\n",
"prompts_ko = [f\"{a} 곱하기 {b} 는 무엇입니까? \\\\boxed{{}} 형식으로 대답해주세요.\" for a, b in pairs]\n",
"\n",
"def build_inputs(prompts):\n",
" tok = llm.get_tokenizer()\n",
" system_msg = {\n",
" \"role\": \"system\",\n",
" \"content\": \"You are Qwen, created by Alibaba Cloud. You are a helpful assistant.\"\n",
" }\n",
" inputs = []\n",
" for p in prompts:\n",
" chat = [\n",
" {\"role\": \"tool_list\", \"content\": \"\"},\n",
" system_msg,\n",
" {\"role\": \"user\", \"content\": p},\n",
" ]\n",
" inputs.append(tok.apply_chat_template(chat, add_generation_prompt=True, tokenize=False))\n",
" return inputs\n",
"\n",
"# Generate\n",
"outputs_en = llm.generate(build_inputs(prompts_en), sampling_params)\n",
"outputs_ko = llm.generate(build_inputs(prompts_ko), sampling_params)\n",
"\n",
"# Compute accuracy matrices\n",
"def compute_acc(outputs):\n",
" acc = np.zeros((100, 100), dtype=int)\n",
" for idx, out in enumerate(outputs):\n",
" a, b = pairs[idx]\n",
" m = re.search(r'\\\\boxed\\{(\\d+)\\}', out.outputs[0].text)\n",
" if m and int(m.group(1)) == a * b:\n",
" acc[digits.index(a), digits.index(b)] = 1\n",
" return acc\n",
"\n",
"acc_en = compute_acc(outputs_en)\n",
"acc_ko = compute_acc(outputs_ko)\n",
"\n",
"# Print overall results\n",
"total = acc_en.size\n",
"correct_en = acc_en.sum()\n",
"correct_ko = acc_ko.sum()\n",
"print(f\"English prompts: {correct_en}/{total} correct ({correct_en/total:.2%})\")\n",
"print(f\"Korean prompts: {correct_ko}/{total} correct ({correct_ko/total:.2%})\")\n",
"\n",
"# Plot heatmaps without per-cell numbers, just green=1/red=0\n",
"cmap = ListedColormap(['red', 'green'])\n",
"fig, axes = plt.subplots(1, 2, figsize=(14, 6))\n",
"\n",
"# Show ticks only every 10th to reduce clutter\n",
"ticks = list(range(0, 100, 10))\n",
"labels = [digits[i] for i in ticks]\n",
"\n",
"for ax, acc, title in zip(axes, [acc_en, acc_ko], [\"English Prompt\", \"Korean Prompt\"]):\n",
" ax.imshow(acc, aspect='equal', cmap=cmap, vmin=0, vmax=1)\n",
" ax.set_xticks(ticks)\n",
" ax.set_xticklabels(labels)\n",
" ax.set_yticks(ticks)\n",
" ax.set_yticklabels(labels)\n",
" ax.set_xlabel('Multiplier (b)')\n",
" ax.set_ylabel('Multiplicand (a)')\n",
" ax.set_title(title)\n",
"\n",
"plt.tight_layout()\n",
"plt.show()\n",
"\n",
"del llm"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## EXAONE-3.5-2.4B-Instruct 평가"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
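"# Disable the vLLM V1 engine by default for this model; setdefault leaves any VLLM_USE_V1 value already in the environment untouched (including one set by an earlier cell in the same kernel)\n",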
"os.environ.setdefault(\"VLLM_USE_V1\", \"0\")\n",
"\n",
"print(\"\"\"\n",
"V1 is now enabled by default for all supported use cases, and we will gradually enable it\n",
"for every use case we plan to support. Please share any feedback on GitHub or in the vLLM Slack.\n",
"\n",
"To disable V1, please set the environment variable as: VLLM_USE_V1=0, and send us a GitHub issue sharing the reason!\n",
"\"\"\".strip())\n",
"\n",
"from vllm import LLM, SamplingParams\n",
"import re\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"from matplotlib.colors import ListedColormap\n",
"\n",
"# Initialize model\n",
"llm = LLM(model=\"LGAI-EXAONE/EXAONE-3.5-2.4B-Instruct\")\n",
"\n",
"# Deterministic sampling\n",
"sampling_params = SamplingParams(temperature=0.0, top_p=1.0, max_tokens=20) # max_tokens 조정: [10, 20, 40, 80]\n",
"\n",
"# Use numbers 1–100\n",
"digits = list(range(1, 101))\n",
"pairs = [(a, b) for a in digits for b in digits]\n",
"\n",
"# Build prompts\n",
"prompts_en = [f\"What is {a} × {b}? Please answer in \\\\boxed{{}} format.\" for a, b in pairs]\n",
"prompts_ko = [f\"{a} 곱하기 {b} 는 무엇입니까? \\\\boxed{{}} 형식으로 대답해주세요.\" for a, b in pairs]\n",
"\n",
"def build_inputs(prompts):\n",
" tok = llm.get_tokenizer()\n",
" system_msg = {\n",
" \"role\": \"system\",\n",
" \"content\": \"You are EXAONE model from LG AI Research, a helpful assistant.\"\n",
" }\n",
" inputs = []\n",
" for p in prompts:\n",
" chat = [\n",
" {\"role\": \"tool_list\", \"content\": \"\"},\n",
" system_msg,\n",
" {\"role\": \"user\", \"content\": p},\n",
" ]\n",
" inputs.append(tok.apply_chat_template(chat, add_generation_prompt=True, tokenize=False))\n",
" return inputs\n",
"\n",
"# Generate\n",
"outputs_en = llm.generate(build_inputs(prompts_en), sampling_params)\n",
"outputs_ko = llm.generate(build_inputs(prompts_ko), sampling_params)\n",
"\n",
"# Compute accuracy matrices\n",
"def compute_acc(outputs):\n",
" acc = np.zeros((100, 100), dtype=int)\n",
" for idx, out in enumerate(outputs):\n",
" a, b = pairs[idx]\n",
" m = re.search(r'\\\\boxed\\{(\\d+)\\}', out.outputs[0].text)\n",
" if m and int(m.group(1)) == a * b:\n",
" acc[digits.index(a), digits.index(b)] = 1\n",
" return acc\n",
"\n",
"acc_en = compute_acc(outputs_en)\n",
"acc_ko = compute_acc(outputs_ko)\n",
"\n",
"# Print overall results\n",
"total = acc_en.size\n",
"correct_en = acc_en.sum()\n",
"correct_ko = acc_ko.sum()\n",
"print(f\"English prompts: {correct_en}/{total} correct ({correct_en/total:.2%})\")\n",
"print(f\"Korean prompts: {correct_ko}/{total} correct ({correct_ko/total:.2%})\")\n",
"\n",
"# Plot heatmaps without per-cell numbers, just green=1/red=0\n",
"cmap = ListedColormap(['red', 'green'])\n",
"fig, axes = plt.subplots(1, 2, figsize=(14, 6))\n",
"\n",
"# Show ticks only every 10th to reduce clutter\n",
"ticks = list(range(0, 100, 10))\n",
"labels = [digits[i] for i in ticks]\n",
"\n",
"for ax, acc, title in zip(axes, [acc_en, acc_ko], [\"English Prompt\", \"Korean Prompt\"]):\n",
" ax.imshow(acc, aspect='equal', cmap=cmap, vmin=0, vmax=1)\n",
" ax.set_xticks(ticks)\n",
" ax.set_xticklabels(labels)\n",
" ax.set_yticks(ticks)\n",
" ax.set_yticklabels(labels)\n",
" ax.set_xlabel('Multiplier (b)')\n",
" ax.set_ylabel('Multiplicand (a)')\n",
" ax.set_title(title)\n",
"\n",
"plt.tight_layout()\n",
"plt.show()\n",
"\n",
"del llm"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Kanana-nano-2.1b-instruct 평가"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"# Ensure V1 is enabled by default; to disable set VLLM_USE_V1=0 in your env\n",
"os.environ.setdefault(\"VLLM_USE_V1\", \"1\")\n",
"\n",
"print(\"\"\"\n",
"V1 is now enabled by default for all supported use cases, and we will gradually enable it\n",
"for every use case we plan to support. Please share any feedback on GitHub or in the vLLM Slack.\n",
"\n",
"To disable V1, please set the environment variable as: VLLM_USE_V1=0, and send us a GitHub issue sharing the reason!\n",
"\"\"\".strip())\n",
"\n",
"from vllm import LLM, SamplingParams\n",
"import re\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"from matplotlib.colors import ListedColormap\n",
"\n",
"# Initialize model\n",
"llm = LLM(model=\"kakaocorp/kanana-nano-2.1b-instruct\")\n",
"\n",
"# Deterministic sampling\n",
"sampling_params = SamplingParams(temperature=0.0, top_p=1.0, max_tokens=20) # max_tokens 조정: [10, 20, 40, 80]\n",
"\n",
"# Use numbers 1–100\n",
"digits = list(range(1, 101))\n",
"pairs = [(a, b) for a in digits for b in digits]\n",
"\n",
"# Build prompts\n",
"prompts_en = [f\"What is {a} × {b}? Please answer in \\\\boxed{{}} format.\" for a, b in pairs]\n",
"prompts_ko = [f\"{a} 곱하기 {b} 는 무엇입니까? \\\\boxed{{}} 형식으로 대답해주세요.\" for a, b in pairs]\n",
"\n",
"def build_inputs(prompts):\n",
" tok = llm.get_tokenizer()\n",
" system_msg = {\n",
" \"role\": \"system\",\n",
" \"content\": \"You are a helpful AI assistant developed by Kakao.\"\n",
" }\n",
" inputs = []\n",
" for p in prompts:\n",
" chat = [\n",
" {\"role\": \"tool_list\", \"content\": \"\"},\n",
" system_msg,\n",
" {\"role\": \"user\", \"content\": p},\n",
" ]\n",
" inputs.append(tok.apply_chat_template(chat, add_generation_prompt=True, tokenize=False))\n",
" return inputs\n",
"\n",
"# Generate\n",
"outputs_en = llm.generate(build_inputs(prompts_en), sampling_params)\n",
"outputs_ko = llm.generate(build_inputs(prompts_ko), sampling_params)\n",
"\n",
"# Compute accuracy matrices\n",
"def compute_acc(outputs):\n",
" acc = np.zeros((100, 100), dtype=int)\n",
" for idx, out in enumerate(outputs):\n",
" a, b = pairs[idx]\n",
" m = re.search(r'\\\\boxed\\{(\\d+)\\}', out.outputs[0].text)\n",
" if m and int(m.group(1)) == a * b:\n",
" acc[digits.index(a), digits.index(b)] = 1\n",
" return acc\n",
"\n",
"acc_en = compute_acc(outputs_en)\n",
"acc_ko = compute_acc(outputs_ko)\n",
"\n",
"# Print overall results\n",
"total = acc_en.size\n",
"correct_en = acc_en.sum()\n",
"correct_ko = acc_ko.sum()\n",
"print(f\"English prompts: {correct_en}/{total} correct ({correct_en/total:.2%})\")\n",
"print(f\"Korean prompts: {correct_ko}/{total} correct ({correct_ko/total:.2%})\")\n",
"\n",
"# Plot heatmaps without per-cell numbers, just green=1/red=0\n",
"cmap = ListedColormap(['red', 'green'])\n",
"fig, axes = plt.subplots(1, 2, figsize=(14, 6))\n",
"\n",
"# Show ticks only every 10th to reduce clutter\n",
"ticks = list(range(0, 100, 10))\n",
"labels = [digits[i] for i in ticks]\n",
"\n",
"for ax, acc, title in zip(axes, [acc_en, acc_ko], [\"English Prompt\", \"Korean Prompt\"]):\n",
" ax.imshow(acc, aspect='equal', cmap=cmap, vmin=0, vmax=1)\n",
" ax.set_xticks(ticks)\n",
" ax.set_xticklabels(labels)\n",
" ax.set_yticks(ticks)\n",
" ax.set_yticklabels(labels)\n",
" ax.set_xlabel('Multiplier (b)')\n",
" ax.set_ylabel('Multiplicand (a)')\n",
" ax.set_title(title)\n",
"\n",
"plt.tight_layout()\n",
"plt.show()\n",
"\n",
"del llm"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment