@Katsumata420
Last active November 22, 2023 08:40
Run open-llm-leaderboard locally
"""追加で実施した lm-evaluation-harness の結果を wandb に Upload する
注意事項:
- batch_size, commit_id は、lm-evaluation-harness の実行時のものを指定すること
- is_write_out もできれば lm-evaluation-harness の実行時のものを指定すること
- average は追加したタスクを反映させた結果が上書きされる
- artifact は追加で実施した lm-evaluation-harness の結果のみ Upload される(ただし、以前に実行した結果がローカルに残っている場合は、それも Upload される)
- 古い結果は wandb の UI 上で version を選択して確認する
"""
import argparse
import json
import os
import math
import warnings
from typing import List, Dict
import wandb
from save_wandb import WANDB_TABLE_NAME, TASK_METRIC_MAPPING, SingleTaskResult
def get_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser()
    parser.add_argument("--result_dir", type=str, required=True, help="Output directory of the lm-evaluation-harness results")
    parser.add_argument("--is_write_out", action="store_true", help="Whether to save the lm-evaluation-harness write_out files")
    parser.add_argument("--target_model", type=str, required=True, help="Model to evaluate")
    parser.add_argument("--elapsed_time", type=int, required=True, help="Elapsed time of the additional lm-evaluation-harness run [sec]")
    parser.add_argument("--wandb_entity_name", required=True, help="WandB entity name")
    parser.add_argument("--wandb_project_name", required=True, help="WandB project name")
    parser.add_argument("--tasks", nargs="+", required=True, help="Names of the additionally evaluated tasks")
    return parser.parse_args()
def load_results(result_dir: str, is_write_out: bool, target_model: str, tasks: List[str]) -> Dict[str, SingleTaskResult]:
    """Load the results of the tasks listed in `tasks`."""
    result_files = [f"{result_dir}/{task}.json" for task in tasks]
    results: Dict[str, SingleTaskResult] = {}
    for result_file in result_files:
        with open(result_file) as f:
            result = json.load(f)
        task_name = os.path.basename(result_file).replace(".json", "")
        task_scores = result["results"]
        config = result["config"]
        write_out_dir = os.path.join("write_out", target_model, task_name) if is_write_out else None
        results[task_name] = SingleTaskResult(
            results=task_scores,
            result_json_file=result_file,
            write_out_dir=write_out_dir,
            config=config,
        )
    return results
def upload_wandb(
    data: Dict[str, SingleTaskResult],
    entity_name: str,
    project_name: str,
    target_model: str,
    elapsed_time: int,
) -> None:
    """Try to upload the results to wandb."""
    def get_run(entity_name: str, project_name: str, target_model: str) -> wandb.apis.public.Run:
        api = wandb.Api()
        runs = api.runs(f"{entity_name}/{project_name}", filters={"config.target_model": target_model})
        assert len(runs) == 1, f"len(runs) must be 1, but {len(runs)}"
        run = list(runs)[0]
        return run

    def post_process_results(lm_evaluation_results: Dict[str, Dict[str, float]], task_name: str) -> float:
        """Reduce the lm-evaluation-harness output for one task to a single value by averaging."""
        scores = []
        for _, metric_score in lm_evaluation_results.items():
            target_metric = TASK_METRIC_MAPPING[task_name]
            score: float = metric_score[target_metric]
            if math.isnan(score):
                warnings.warn(f"Task: {task_name}: metric {target_metric} was NaN.")
            scores.append(score)
        return sum(scores) / len(scores)

    def extract_dir_info(data: Dict[str, SingleTaskResult], data_type: str) -> str:
        """Extract the lm-evaluation-harness output directory."""
        if data_type == "result":
            return os.path.dirname(data[list(data.keys())[0]].result_json_file)
        elif data_type == "output":
            return os.path.dirname(data[list(data.keys())[0]].write_out_dir)
        else:
            raise ValueError(f"Invalid data_type: {data_type}")

    def get_new_average(updated_table: wandb.Table) -> float:
        """Recompute the Average.
        Excluded columns: model_name, Average, Elapsed Time, plus any column whose value is NaN or None
        (wandb stores NaN values as None, so None columns are excluded as well).
        """
        exclude_columns = ["model_name", "Average", "Elapsed Time"]
        target_scores = updated_table.data[0]
        target_scores = [
            score
            for column, score in zip(updated_table.columns, target_scores)
            if column not in exclude_columns and score is not None and not math.isnan(score)
        ]
        return sum(target_scores) / len(target_scores)
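    # get_new_average, worked example with hypothetical values: for columns
    # ["model_name", "Average", "arc-challenge", "Elapsed Time", "gsm8k"] and row
    # ["my-model", 0.5, 0.5, 1200, 0.25], the new average is (0.5 + 0.25) / 2 = 0.375.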
    run = get_run(entity_name, project_name, target_model)
    run_id = run.id
    run_config = json.loads(run.json_config)  # {key: {"value": ..., "desc": ...}, ...}
    wandb_config = {key: value["value"] for key, value in run_config.items()}
    with wandb.init(id=run_id, project=project_name, entity=entity_name, config=wandb_config) as run:
        # get table
        table_name = WANDB_TABLE_NAME.replace("-", "")  # "-" is dropped when the table is logged, so drop it here as well when resolving the artifact name
        artifact_name = f"{entity_name}/{project_name}/run-{run.id}-{table_name}:latest"
        api_artifact = wandb.Api().artifact(artifact_name)
        table = run.use_artifact(api_artifact).get(WANDB_TABLE_NAME)
        # update average, elapsed time and task columns
        updated_table = wandb.Table(columns=table.columns, data=table.data)
        column2index = {column: index for index, column in enumerate(updated_table.columns)}
        updated_table.data[0][column2index["Elapsed Time"]] += elapsed_time
        for task_name, task_result in data.items():
            updated_table.add_column(task_name, [post_process_results(task_result.results, task_name)])
        # Recompute the average, excluding model_name, the old Average, and Elapsed Time
        updated_table.data[0][column2index["Average"]] = get_new_average(updated_table)
        # update table
        run.log({WANDB_TABLE_NAME: updated_table})
        # save artifact
        result_json_dir = extract_dir_info(data, data_type="result")
        result_artifact_name = target_model.replace("/", ".") + ".result"
        result_artifact = wandb.Artifact(
            result_artifact_name, type="lm-evaluation-harness-result"
        )
        result_artifact.add_dir(result_json_dir)
        wandb.log_artifact(result_artifact)
        # task_result is the last item of the loop above; write_out_dir is set either for every task or for none
        if task_result.write_out_dir is not None:
            result_output_dir = extract_dir_info(data, data_type="output")
            output_artifact_name = target_model.replace("/", ".") + ".output"
            output_artifact = wandb.Artifact(
                output_artifact_name, type="lm-evaluation-harness-output"
            )
            output_artifact.add_dir(result_output_dir)
            wandb.log_artifact(output_artifact)
    print("Finish Upload.")
def main():
    args = get_args()
    # Load the results of the additional lm-evaluation-harness run
    data = load_results(args.result_dir, args.is_write_out, args.target_model, args.tasks)
    # Upload to wandb
    upload_wandb(
        data, args.wandb_entity_name, args.wandb_project_name, args.target_model, args.elapsed_time
    )


if __name__ == "__main__":
    main()
#!/bin/bash
set -eu
export HF_HOME= # Need setting
WANDB_ENTITY= # Need setting
WANDB_PROJECT= # Need setting
WRITE_OUT_HARNESS=""
WRITE_OUT_PATH=""
WRITE_OUT_WANDB=""
PEFT_BASE_MODEL=""
while getopts "wl:" OPT
do
  case $OPT in
    w)
      WRITE_OUT_HARNESS="--write_out --output_base_path"
      WRITE_OUT_PATH="./write_out"
      WRITE_OUT_WANDB="--is_write_out"
      ;;
    l)
      PEFT_BASE_MODEL="${OPTARG}";;
  esac
done
shift $((OPTIND - 1))
target_model=$1
batch_size=$2
result_dir=$3
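# Each entry: "task-name number-of-shot task-list"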
n_shot_task=(
"arc-challenge 25 arc_challenge"
"hellaswag 10 hellaswag"
"truthfulqa-mc 0 truthfulqa_mc"
"mmlu 5 hendrycksTest-abstract_algebra,hendrycksTest-anatomy,hendrycksTest-astronomy,hendrycksTest-business_ethics,hendrycksTest-clinical_knowledge,hendrycksTest-college_biology,hendrycksTest-college_chemistry,hendrycksTest-college_computer_science,hendrycksTest-college_mathematics,hendrycksTest-college_medicine,hendrycksTest-college_physics,hendrycksTest-computer_security,hendrycksTest-conceptual_physics,hendrycksTest-econometrics,hendrycksTest-electrical_engineering,hendrycksTest-elementary_mathematics,hendrycksTest-formal_logic,hendrycksTest-global_facts,hendrycksTest-high_school_biology,hendrycksTest-high_school_chemistry,hendrycksTest-high_school_computer_science,hendrycksTest-high_school_european_history,hendrycksTest-high_school_geography,hendrycksTest-high_school_government_and_politics,hendrycksTest-high_school_macroeconomics,hendrycksTest-high_school_mathematics,hendrycksTest-high_school_microeconomics,hendrycksTest-high_school_physics,hendrycksTest-high_school_psychology,hendrycksTest-high_school_statistics,hendrycksTest-high_school_us_history,hendrycksTest-high_school_world_history,hendrycksTest-human_aging,hendrycksTest-human_sexuality,hendrycksTest-international_law,hendrycksTest-jurisprudence,hendrycksTest-logical_fallacies,hendrycksTest-machine_learning,hendrycksTest-management,hendrycksTest-marketing,hendrycksTest-medical_genetics,hendrycksTest-miscellaneous,hendrycksTest-moral_disputes,hendrycksTest-moral_scenarios,hendrycksTest-nutrition,hendrycksTest-philosophy,hendrycksTest-prehistory,hendrycksTest-professional_accounting,hendrycksTest-professional_law,hendrycksTest-professional_medicine,hendrycksTest-professional_psychology,hendrycksTest-public_relations,hendrycksTest-security_studies,hendrycksTest-sociology,hendrycksTest-us_foreign_policy,hendrycksTest-virology,hendrycksTest-world_religions"
"winogrande 5 winogrande"
"gsm8k 5 gsm8k"
"drop 3 drop"
)
echo "Target model: ${target_model}"
echo "Batch size: ${batch_size}"
echo "Result dir: ${result_dir}"
if [ "${PEFT_BASE_MODEL}" != "" ]; then
echo "PEFT base model: ${PEFT_BASE_MODEL}"
HARNESS_MODEL_TYPE="hf-causal-experimental"
HARNESS_MODEL_ARGS="pretrained=${PEFT_BASE_MODEL},revision=main,peft=${target_model},trust_remote_code=True"
else
# default
HARNESS_MODEL_TYPE="hf-causal"
HARNESS_MODEL_ARGS="pretrained=${target_model},revision=main,trust_remote_code=True"
fi
# Only create the write_out directory when -w was given (WRITE_OUT_PATH is empty otherwise)
if [ "${WRITE_OUT_HARNESS}" != "" ]; then
  WRITE_OUT_PATH=${WRITE_OUT_PATH}/${target_model}
  mkdir -p ${WRITE_OUT_PATH}
fi
start_time=`date +%s`
for current_n_shot_task in "${n_shot_task[@]}"
do
  # task-name number-of-shot task-list
  current_task=(${current_n_shot_task})
  echo "---"
  echo "task name: ${current_task[0]}"
  echo "n-shot: ${current_task[1]}"
  echo "task list: ${current_task[2]}"
  if [ "${WRITE_OUT_HARNESS}" != "" ]; then
    WRITE_OUT_PATH=${WRITE_OUT_PATH}/${current_task[0]}
  fi
  output_path="${result_dir}/${current_task[0]}".json
  python main.py \
    --model ${HARNESS_MODEL_TYPE} \
    --model_args ${HARNESS_MODEL_ARGS} \
    --num_fewshot ${current_task[1]} \
    --tasks ${current_task[2]} \
    --batch_size ${batch_size} \
    --output_path ${output_path} \
    ${WRITE_OUT_HARNESS} ${WRITE_OUT_PATH}
  if [ "${WRITE_OUT_HARNESS}" != "" ]; then
    WRITE_OUT_PATH=`dirname ${WRITE_OUT_PATH}`
  fi
done
end_time=`date +%s`
elapsed_time=$((end_time - start_time))
commit_id=`git show --format='%h' --no-patch`
python save_wandb.py \
  --result_dir ${result_dir} \
  --target_model ${target_model} \
  --batch_size ${batch_size} \
  --commit_id ${commit_id} \
  --elapsed_time ${elapsed_time} \
  --wandb_entity_name ${WANDB_ENTITY} \
  --wandb_project_name ${WANDB_PROJECT} \
  ${WRITE_OUT_WANDB}
exit
#!/bin/bash
set -eu
export HF_HOME= # Need setting
WANDB_ENTITY= # Need setting
WANDB_PROJECT= # Need setting
WRITE_OUT_HARNESS=""
WRITE_OUT_PATH=""
WRITE_OUT_WANDB=""
PEFT_BASE_MODEL=""
while getopts "wl:" OPT
do
  case $OPT in
    w)
      WRITE_OUT_HARNESS="--write_out --output_base_path"
      WRITE_OUT_PATH="./write_out"
      WRITE_OUT_WANDB="--is_write_out"
      ;;
    l)
      PEFT_BASE_MODEL="${OPTARG}";;
  esac
done
shift $((OPTIND - 1))
target_model=$1
batch_size=$2
result_dir=$3
# task-name number-of-shot task-list
n_shot_task=(
"arc-challenge 25 arc_challenge"
"hellaswag 10 hellaswag"
"truthfulqa-mc 0 truthfulqa_mc"
"mmlu 5 hendrycksTest-abstract_algebra,hendrycksTest-anatomy,hendrycksTest-astronomy,hendrycksTest-business_ethics,hendrycksTest-clinical_knowledge,hendrycksTest-college_biology,hendrycksTest-college_chemistry,hendrycksTest-college_computer_science,hendrycksTest-college_mathematics,hendrycksTest-college_medicine,hendrycksTest-college_physics,hendrycksTest-computer_security,hendrycksTest-conceptual_physics,hendrycksTest-econometrics,hendrycksTest-electrical_engineering,hendrycksTest-elementary_mathematics,hendrycksTest-formal_logic,hendrycksTest-global_facts,hendrycksTest-high_school_biology,hendrycksTest-high_school_chemistry,hendrycksTest-high_school_computer_science,hendrycksTest-high_school_european_history,hendrycksTest-high_school_geography,hendrycksTest-high_school_government_and_politics,hendrycksTest-high_school_macroeconomics,hendrycksTest-high_school_mathematics,hendrycksTest-high_school_microeconomics,hendrycksTest-high_school_physics,hendrycksTest-high_school_psychology,hendrycksTest-high_school_statistics,hendrycksTest-high_school_us_history,hendrycksTest-high_school_world_history,hendrycksTest-human_aging,hendrycksTest-human_sexuality,hendrycksTest-international_law,hendrycksTest-jurisprudence,hendrycksTest-logical_fallacies,hendrycksTest-machine_learning,hendrycksTest-management,hendrycksTest-marketing,hendrycksTest-medical_genetics,hendrycksTest-miscellaneous,hendrycksTest-moral_disputes,hendrycksTest-moral_scenarios,hendrycksTest-nutrition,hendrycksTest-philosophy,hendrycksTest-prehistory,hendrycksTest-professional_accounting,hendrycksTest-professional_law,hendrycksTest-professional_medicine,hendrycksTest-professional_psychology,hendrycksTest-public_relations,hendrycksTest-security_studies,hendrycksTest-sociology,hendrycksTest-us_foreign_policy,hendrycksTest-virology,hendrycksTest-world_religions"
"winogrande 5 winogrande"
"gsm8k 5 gsm8k"
"drop 3 drop"
)
echo "Target model: ${target_model}"
echo "Batch size: ${batch_size}"
echo "Result dir: ${result_dir}"
if [ "${PEFT_BASE_MODEL}" != "" ]; then
echo "PEFT base model: ${PEFT_BASE_MODEL}"
HARNESS_MODEL_TYPE="hf-causal-experimental"
HARNESS_MODEL_ARGS="pretrained=${PEFT_BASE_MODEL},revision=main,peft=${target_model},trust_remote_code=True"
else
# default
HARNESS_MODEL_TYPE="hf-causal"
HARNESS_MODEL_ARGS="pretrained=${target_model},revision=main,trust_remote_code=True"
fi
# Only create the write_out directory when -w was given (WRITE_OUT_PATH is empty otherwise)
if [ "${WRITE_OUT_HARNESS}" != "" ]; then
  WRITE_OUT_PATH=${WRITE_OUT_PATH}/${target_model}
  mkdir -p ${WRITE_OUT_PATH}
fi
start_time=`date +%s`
done_task=""
for current_n_shot_task in "${n_shot_task[@]}"
do
  # task-name number-of-shot task-list
  current_task=(${current_n_shot_task})
  echo "---"
  echo "task name: ${current_task[0]}"
  echo "n-shot: ${current_task[1]}"
  echo "task list: ${current_task[2]}"
  if [ "${WRITE_OUT_HARNESS}" != "" ]; then
    WRITE_OUT_PATH=${WRITE_OUT_PATH}/${current_task[0]}
  fi
  output_path="${result_dir}/${current_task[0]}".json
  python main.py \
    --model ${HARNESS_MODEL_TYPE} \
    --model_args ${HARNESS_MODEL_ARGS} \
    --num_fewshot ${current_task[1]} \
    --tasks ${current_task[2]} \
    --batch_size ${batch_size} \
    --output_path ${output_path} \
    ${WRITE_OUT_HARNESS} ${WRITE_OUT_PATH}
  if [ "${WRITE_OUT_HARNESS}" != "" ]; then
    WRITE_OUT_PATH=`dirname ${WRITE_OUT_PATH}`
  fi
  done_task="${done_task} ${current_task[0]}"
done
end_time=`date +%s`
elapsed_time=$((end_time - start_time))
echo "Tasks: ${done_task}"
python additional_save_wandb.py \
  --result_dir ${result_dir} \
  --target_model ${target_model} \
  --elapsed_time ${elapsed_time} \
  --wandb_entity_name ${WANDB_ENTITY} \
  --wandb_project_name ${WANDB_PROJECT} \
  --tasks ${done_task} \
  ${WRITE_OUT_WANDB}
exit
"""lm-evaluation-harness の結果を wandb にアップロードするスニペット
アップロードする項目は次の通り:
- テストデータに対する各指標
- lm-evaluation-harness の json ファイル一覧
- lm-evaluation-harness の commit-id
- lm-evaluation-harness の実行時間
- lm-evaluation-harness の引数(評価対象のモデル、バッチサイズ)
- lm-evaluation-hanress の write_out 出力ファイル
"""
import argparse
import json
import glob
import os
import math
import warnings
from dataclasses import dataclass
from typing import Dict, Optional
import wandb
WANDB_TABLE_NAME = "open-llm-leaderboard"
TASK_METRIC_MAPPING = {  # https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard
    "arc-challenge": "acc_norm",
    "hellaswag": "acc_norm",
    "truthfulqa-mc": "mc2",
    "mmlu": "acc",
    "winogrande": "acc",
    "gsm8k": "acc",
    "drop": "f1",
}
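# Each harness result JSON is expected to look roughly like (values illustrative):
#   {"results": {"arc_challenge": {"acc": 0.50, "acc_norm": 0.53, ...}, ...},
#    "config": {...}}
# post_process_results below picks the metric configured above for each task and averages
# it over the entries in the file (e.g. over all hendrycksTest-* sub-tasks for mmlu).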
@dataclass
class SingleTaskResult:
    results: Dict[str, Dict[str, float]]
    result_json_file: str
    write_out_dir: Optional[str]
    config: Dict
def get_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser()
    parser.add_argument("--result_dir", type=str, required=True, help="Output directory of the lm-evaluation-harness results")
    parser.add_argument("--is_write_out", action="store_true", help="Whether to save the lm-evaluation-harness write_out files")
    parser.add_argument("--target_model", type=str, required=True, help="Model to evaluate")
    parser.add_argument("--batch_size", type=int, required=True, help="Batch size used for evaluation")
    parser.add_argument("--commit_id", type=str, required=True, help="Commit-id of lm-evaluation-harness")
    parser.add_argument("--elapsed_time", type=int, required=True, help="Elapsed time of the lm-evaluation-harness run [sec]")
    parser.add_argument("--wandb_entity_name", required=True, help="WandB entity name")
    parser.add_argument("--wandb_project_name", required=True, help="WandB project name")
    return parser.parse_args()
def load_results(result_dir: str, is_write_out: bool, target_model: str) -> Dict[str, SingleTaskResult]:
    """Load the lm-evaluation-harness results.
    Returns:
        Dict[str, SingleTaskResult]: results keyed by task name
    """
    result_files = glob.glob(f"{result_dir}/*.json")
    results: Dict[str, SingleTaskResult] = {}
    for result_file in result_files:
        with open(result_file) as f:
            result = json.load(f)
        task_name = os.path.basename(result_file).replace(".json", "")
        task_scores = result["results"]
        config = result["config"]
        write_out_dir = os.path.join("write_out", target_model, task_name) if is_write_out else None
        results[task_name] = SingleTaskResult(
            results=task_scores,
            result_json_file=result_file,
            write_out_dir=write_out_dir,
            config=config,
        )
    return results
def upload_wandb(
    data: Dict[str, SingleTaskResult],
    entity_name: str,
    project_name: str,
    target_model: str,
    commit_id: str,
    elapsed_time: int,
    batch_size: int,
) -> None:
    """Try to upload the results to wandb.
    The following items are stored in wandb:
    - each metric on the test data (including the average): stored as a Table
    - the list of lm-evaluation-harness json files: stored as an artifact
    - the lm-evaluation-harness commit-id: config
    - the lm-evaluation-harness elapsed time: Table
    - the lm-evaluation-harness arguments (model to evaluate, batch size): config (the model name is also stored in the Table)
    - the lm-evaluation-harness write_out output files: artifact
    """
    def post_process_results(lm_evaluation_results: Dict[str, Dict[str, float]], task_name: str) -> float:
        """Reduce the lm-evaluation-harness output for one task to a single float by averaging."""
        scores = []  # holds the scores of the configured metric
        for _, metric_score in lm_evaluation_results.items():
            target_metric = TASK_METRIC_MAPPING[task_name]
            score: float = metric_score[target_metric]
            if math.isnan(score):
                warnings.warn(f"Task: {task_name}: metric {target_metric} was NaN.")
            scores.append(score)
        return sum(scores) / len(scores)
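    # Example with illustrative numbers: an "mmlu" result file contains one entry per
    # hendrycksTest-* sub-task, so the returned value is the mean of their "acc" scores;
    # for a single-task file such as "gsm8k" it is simply that task's "acc".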
    def extract_dir_info(data: Dict[str, SingleTaskResult], data_type: str) -> str:
        """Extract the lm-evaluation-harness output directory."""
        if data_type == "result":
            return os.path.dirname(data[list(data.keys())[0]].result_json_file)
        elif data_type == "output":
            return os.path.dirname(data[list(data.keys())[0]].write_out_dir)
        else:
            raise ValueError(f"Invalid data_type: {data_type}")
    wandb_config = {
        "lm_evaluation_harness_commit_id": commit_id,
        "target_model": target_model,
        "batch_size": batch_size,
    }
    with wandb.init(
        entity=entity_name, project=project_name, name=target_model, config=wandb_config
    ) as run:
        columns = ["model_name", "Average"]  # model_name + avg_score + task_name + elapsed_time
        items = [target_model]  # model_name + avg_score (inserted below) + task_name + elapsed_time
        for task_name, result in data.items():
            # make table
            columns.append(task_name)
            result_for_table = post_process_results(result.results, task_name)
            items.append(result_for_table)
        scores_without_nan = [score for score in items[1:] if not math.isnan(score)]
        avg_score = sum(scores_without_nan) / len(scores_without_nan)
        items.insert(1, avg_score)
        # Add time column
        columns.append("Elapsed Time")
        items.append(elapsed_time)
        # Add the single row and save the table
        lm_evaluation_harness_table = wandb.Table(columns=columns, data=[items])
        run.log({WANDB_TABLE_NAME: lm_evaluation_harness_table})
        # save artifact
        result_json_dir = extract_dir_info(data, data_type="result")
        result_artifact_name = target_model.replace("/", ".") + ".result"
        result_artifact = wandb.Artifact(
            result_artifact_name, type="lm-evaluation-harness-result"
        )
        result_artifact.add_dir(result_json_dir)
        wandb.log_artifact(result_artifact)
        # result is the last item of the loop above; write_out_dir is set either for every task or for none
        if result.write_out_dir is not None:
            result_output_dir = extract_dir_info(data, data_type="output")
            output_artifact_name = target_model.replace("/", ".") + ".output"
            output_artifact = wandb.Artifact(
                output_artifact_name, type="lm-evaluation-harness-output"
            )
            output_artifact.add_dir(result_output_dir)
            wandb.log_artifact(output_artifact)
    print("Finish Upload.")
def main():
    args = get_args()
    # Load the lm-evaluation-harness results
    data = load_results(args.result_dir, args.is_write_out, args.target_model)
    # Upload to wandb
    upload_wandb(
        data, args.wandb_entity_name, args.wandb_project_name, args.target_model, args.commit_id, args.elapsed_time, args.batch_size
    )


if __name__ == "__main__":
    main()
Katsumata420 commented Nov 6, 2023

Usage

  1. git clone https://github.com/EleutherAI/lm-evaluation-harness.git
  2. cd lm-evaluation-harness
  3. git checkout b281b09
  4. pip install -e .
  5. pip install wandb
  6. prepare wandb scripts (run_open_llm_leaderboard.sh, save_wandb.py)
  7. vim run_open_llm_leaderboard.sh (edit HF_HOME, WANDB_ENTITY and WANDB_PROJECT)
  8. wandb login
  9. bash run_open_llm_leaderboard.sh -w ${model_name_or_path} ${batch_size} ${result_dir}
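For example, with placeholder model name, batch size, and result directory:

bash run_open_llm_leaderboard.sh -w meta-llama/Llama-2-7b-hf 8 results/llama-2-7b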

For peft model, use the following command:

bash run_open_llm_leaderboard.sh -w -l ${base_model_name_or_path} ${peft_model_name_or_path} ${batch_size} ${result_dir}
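Here -w enables the harness --write_out dumps (and their upload to wandb), and -l supplies the PEFT base model; the first positional argument is then treated as the peft adapter path (passed as peft=... in --model_args).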
