Run open-llm-leaderboard locally
"""追加で実施した lm-evaluation-harness の結果を wandb に Upload する | |
注意事項: | |
- batch_size, commit_id は、lm-evaluation-harness の実行時のものを指定すること | |
- is_write_out もできれば lm-evaluation-harness の実行時のものを指定すること | |
- average は追加したタスクを反映させた結果が上書きされる | |
- artifact は追加で実施した lm-evaluation-harness の結果のみ Upload される(ただし、以前に実行した結果がローカルに残っている場合は、それも Upload される) | |
- 古い結果は wandb の UI 上で version を選択して確認する | |
""" | |
import argparse
import json
import os
import math
import warnings
from typing import List, Dict

import wandb

from save_wandb import WANDB_TABLE_NAME, TASK_METRIC_MAPPING, SingleTaskResult
def get_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser()
    parser.add_argument("--result_dir", type=str, required=True, help="Output directory of the lm-evaluation-harness results")
    parser.add_argument("--is_write_out", action="store_true", help="Whether the lm-evaluation-harness write_out files should be saved")
    parser.add_argument("--target_model", type=str, required=True, help="Model under evaluation")
    parser.add_argument("--elapsed_time", type=int, required=True, help="Elapsed time of the additional lm-evaluation-harness run [sec]")
    parser.add_argument("--wandb_entity_name", required=True, help="WandB entity name")
    parser.add_argument("--wandb_project_name", required=True, help="WandB project name")
    parser.add_argument("--tasks", nargs="+", required=True, help="Names of the additionally evaluated tasks")
    return parser.parse_args()
def load_results(result_dir: str, is_write_out: bool, target_model: str, tasks: List[str]) -> Dict[str, SingleTaskResult]:
    """Load the results of the tasks listed in `tasks`."""
    result_files = [f"{result_dir}/{task}.json" for task in tasks]
    results: Dict[str, SingleTaskResult] = {}
    for result_file in result_files:
        with open(result_file) as f:
            result = json.load(f)
        task_name = os.path.basename(result_file).replace(".json", "")
        task_scores = result["results"]
        config = result["config"]
        write_out_dir = os.path.join("write_out", target_model, task_name) if is_write_out else None
        results[task_name] = SingleTaskResult(
            results=task_scores,
            result_json_file=result_file,
            write_out_dir=write_out_dir,
            config=config,
        )
    return results
def upload_wandb(
    data: Dict[str, SingleTaskResult],
    entity_name: str,
    project_name: str,
    target_model: str,
    elapsed_time: int,
) -> None:
    """Upload the additional results to wandb."""

    def get_run(entity_name: str, project_name: str, target_model: str) -> wandb.apis.public.Run:
        api = wandb.Api()
        runs = api.runs(f"{entity_name}/{project_name}", filters={"config.target_model": target_model})
        assert len(runs) == 1, f"len(runs) must be 1, but {len(runs)}"
        run = list(runs)[0]
        return run
    def post_process_results(lm_evaluation_results: Dict[str, Dict[str, float]], task_name: str) -> float:
        """Average the lm-evaluation-harness output into a single value."""
        scores = []
        for _, metric_score in lm_evaluation_results.items():
            target_metric = TASK_METRIC_MAPPING[task_name]
            score: float = metric_score[target_metric]
            if math.isnan(score):
                warnings.warn(f"Metric {target_metric} was NaN for task {task_name}.")
            scores.append(score)
        return sum(scores) / len(scores)
    def extract_dir_info(data: Dict[str, SingleTaskResult], data_type: str) -> str:
        """Extract the lm-evaluation-harness output directory."""
        if data_type == "result":
            return os.path.dirname(data[list(data.keys())[0]].result_json_file)
        elif data_type == "output":
            return os.path.dirname(data[list(data.keys())[0]].write_out_dir)
        else:
            raise ValueError(f"Invalid data_type: {data_type}")
    def get_new_average(updated_table: wandb.Table) -> float:
        """Recompute the Average column.

        Excluded columns: model_name, Average, Elapsed Time, and any column
        containing nan or None. wandb treats nan values as None, so columns
        containing None are excluded as well.
        """
        exclude_columns = ["model_name", "Average", "Elapsed Time"]
        target_scores = updated_table.data[0]
        target_scores = [
            score
            for column, score in zip(updated_table.columns, target_scores)
            if column not in exclude_columns and score is not None and not math.isnan(score)
        ]
        return sum(target_scores) / len(target_scores)
    run = get_run(entity_name, project_name, target_model)
    run_id = run.id
    run_config = json.loads(run.json_config)  # {key: {"value": value, "desc": desc}, ...}
    wandb_config = {key: value["value"] for key, value in run_config.items()}
    with wandb.init(id=run_id, project=project_name, entity=entity_name, config=wandb_config) as run:
        # get the existing table; "-" is dropped from the table key when it is turned
        # into the artifact name, so drop it here as well
        table_name = WANDB_TABLE_NAME.replace("-", "")
        artifact_name = f"{entity_name}/{project_name}/run-{run.id}-{table_name}:latest"
        api_artifact = wandb.Api().artifact(artifact_name)
        table = run.use_artifact(api_artifact).get(WANDB_TABLE_NAME)
        # update average, elapsed time, and task columns
        updated_table = wandb.Table(columns=table.columns, data=table.data)
        column2index = {column: index for index, column in enumerate(updated_table.columns)}
        updated_table.data[0][column2index["Elapsed Time"]] += elapsed_time
        for task_name, task_result in data.items():
            updated_table.add_column(task_name, [post_process_results(task_result.results, task_name)])
        # recompute the average, excluding model_name, the old Average, and Elapsed Time
        updated_table.data[0][column2index["Average"]] = get_new_average(updated_table)
        # update table
        run.log({WANDB_TABLE_NAME: updated_table})
        # save artifacts
        result_json_dir = extract_dir_info(data, data_type="result")
        result_artifact_name = target_model.replace("/", ".") + ".result"
        result_artifact = wandb.Artifact(
            result_artifact_name, type="lm-evaluation-harness-result"
        )
        result_artifact.add_dir(result_json_dir)
        wandb.log_artifact(result_artifact)
        # write_out_dir is either set for every task or for none, so checking any task suffices
        if any(task_result.write_out_dir is not None for task_result in data.values()):
            result_output_dir = extract_dir_info(data, data_type="output")
            output_artifact_name = target_model.replace("/", ".") + ".output"
            output_artifact = wandb.Artifact(
                output_artifact_name, type="lm-evaluation-harness-output"
            )
            output_artifact.add_dir(result_output_dir)
            wandb.log_artifact(output_artifact)
    print("Finish Upload.")
def main():
    args = get_args()
    # load the results of the additional lm-evaluation-harness run
    data = load_results(args.result_dir, args.is_write_out, args.target_model, args.tasks)
    # upload them to wandb
    upload_wandb(
        data, args.wandb_entity_name, args.wandb_project_name, args.target_model, args.elapsed_time
    )


if __name__ == "__main__":
    main()
#!/bin/bash
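# Usage sketch for this runner (inferred from the option and positional-argument
# handling below; the filename is taken from the PEFT example at the end of this gist):
#   bash run_open_llm_leaderboard.sh [-w] [-l <peft_base_model>] <target_model> <batch_size> <result_dir>
#     -w  also produce the harness write_out files and upload them to wandb
#     -l  treat <target_model> as a PEFT adapter on top of <peft_base_model>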
set -eu

export HF_HOME=  # Need setting
WANDB_ENTITY=  # Need setting
WANDB_PROJECT=  # Need setting

WRITE_OUT_HARNESS=""
WRITE_OUT_PATH=""
WRITE_OUT_WANDB=""
PEFT_BASE_MODEL=""
while getopts "wl:" OPT
do
    case $OPT in
        w)
            WRITE_OUT_HARNESS="--write_out --output_base_path"
            WRITE_OUT_PATH="./write_out"
            WRITE_OUT_WANDB="--is_write_out"
            ;;
        l)
            PEFT_BASE_MODEL="${OPTARG}";;
    esac
done
shift $((OPTIND - 1))
target_model=$1
batch_size=$2
result_dir=$3

# task-name number-of-shot task-list
n_shot_task=(
    "arc-challenge 25 arc_challenge"
    "hellaswag 10 hellaswag"
    "truthfulqa-mc 0 truthfulqa_mc"
    "mmlu 5 hendrycksTest-abstract_algebra,hendrycksTest-anatomy,hendrycksTest-astronomy,hendrycksTest-business_ethics,hendrycksTest-clinical_knowledge,hendrycksTest-college_biology,hendrycksTest-college_chemistry,hendrycksTest-college_computer_science,hendrycksTest-college_mathematics,hendrycksTest-college_medicine,hendrycksTest-college_physics,hendrycksTest-computer_security,hendrycksTest-conceptual_physics,hendrycksTest-econometrics,hendrycksTest-electrical_engineering,hendrycksTest-elementary_mathematics,hendrycksTest-formal_logic,hendrycksTest-global_facts,hendrycksTest-high_school_biology,hendrycksTest-high_school_chemistry,hendrycksTest-high_school_computer_science,hendrycksTest-high_school_european_history,hendrycksTest-high_school_geography,hendrycksTest-high_school_government_and_politics,hendrycksTest-high_school_macroeconomics,hendrycksTest-high_school_mathematics,hendrycksTest-high_school_microeconomics,hendrycksTest-high_school_physics,hendrycksTest-high_school_psychology,hendrycksTest-high_school_statistics,hendrycksTest-high_school_us_history,hendrycksTest-high_school_world_history,hendrycksTest-human_aging,hendrycksTest-human_sexuality,hendrycksTest-international_law,hendrycksTest-jurisprudence,hendrycksTest-logical_fallacies,hendrycksTest-machine_learning,hendrycksTest-management,hendrycksTest-marketing,hendrycksTest-medical_genetics,hendrycksTest-miscellaneous,hendrycksTest-moral_disputes,hendrycksTest-moral_scenarios,hendrycksTest-nutrition,hendrycksTest-philosophy,hendrycksTest-prehistory,hendrycksTest-professional_accounting,hendrycksTest-professional_law,hendrycksTest-professional_medicine,hendrycksTest-professional_psychology,hendrycksTest-public_relations,hendrycksTest-security_studies,hendrycksTest-sociology,hendrycksTest-us_foreign_policy,hendrycksTest-virology,hendrycksTest-world_religions"
    "winogrande 5 winogrande"
    "gsm8k 5 gsm8k"
    "drop 3 drop"
)
echo "Target model: ${target_model}" | |
echo "Batch size: ${batch_size}" | |
echo "Result dir: ${result_dir}" | |
if [ "${PEFT_BASE_MODEL}" != "" ]; then | |
echo "PEFT base model: ${PEFT_BASE_MODEL}" | |
HARNESS_MODEL_TYPE="hf-causal-experimental" | |
HARNESS_MODEL_ARGS="pretrained=${PEFT_BASE_MODEL},revision=main,peft=${target_model},trust_remote_code=True" | |
else | |
# default | |
HARNESS_MODEL_TYPE="hf-causal" | |
HARNESS_MODEL_ARGS="pretrained=${target_model},revision=main,trust_remote_code=True" | |
fi | |
WRITE_OUT_PATH=${WRITE_OUT_PATH}/${target_model} | |
mkdir -p ${WRITE_OUT_PATH} | |
start_time=`date +%s`
for current_n_shot_task in "${n_shot_task[@]}"
do
    # task-name number-of-shot task-list
    current_task=(${current_n_shot_task})
    echo "---"
    echo "task name: ${current_task[0]}"
    echo "n-shot: ${current_task[1]}"
    echo "task list: ${current_task[2]}"
    if [ "${WRITE_OUT_HARNESS}" != "" ]; then
        WRITE_OUT_PATH=${WRITE_OUT_PATH}/${current_task[0]}
    fi
    output_path="${result_dir}/${current_task[0]}".json
    python main.py \
        --model ${HARNESS_MODEL_TYPE} \
        --model_args ${HARNESS_MODEL_ARGS} \
        --num_fewshot ${current_task[1]} \
        --tasks ${current_task[2]} \
        --batch_size ${batch_size} \
        --output_path ${output_path} \
        ${WRITE_OUT_HARNESS} ${WRITE_OUT_PATH}
    if [ "${WRITE_OUT_HARNESS}" != "" ]; then
        WRITE_OUT_PATH=`dirname ${WRITE_OUT_PATH}`
    fi
done
end_time=`date +%s`
elapsed_time=$((end_time - start_time))
commit_id=`git show --format='%h' --no-patch`

python save_wandb.py \
    --result_dir ${result_dir} \
    --target_model ${target_model} \
    --batch_size ${batch_size} \
    --commit_id ${commit_id} \
    --elapsed_time ${elapsed_time} \
    --wandb_entity_name ${WANDB_ENTITY} \
    --wandb_project_name ${WANDB_PROJECT} \
    ${WRITE_OUT_WANDB}

exit
#!/bin/bash
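# Variant of the runner above that evaluates extra tasks and merges them into an
# existing wandb run via additional_save_wandb.py. A usage sketch (same options and
# positional arguments as above; this script's filename is not shown in the gist,
# and n_shot_task below would be edited down to the tasks you want to add):
#   bash <this_script>.sh [-w] [-l <peft_base_model>] <target_model> <batch_size> <result_dir>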
set -eu

export HF_HOME=  # Need setting
WANDB_ENTITY=  # Need setting
WANDB_PROJECT=  # Need setting

WRITE_OUT_HARNESS=""
WRITE_OUT_PATH=""
WRITE_OUT_WANDB=""
PEFT_BASE_MODEL=""
while getopts "wl:" OPT
do
    case $OPT in
        w)
            WRITE_OUT_HARNESS="--write_out --output_base_path"
            WRITE_OUT_PATH="./write_out"
            WRITE_OUT_WANDB="--is_write_out"
            ;;
        l)
            PEFT_BASE_MODEL="${OPTARG}";;
    esac
done
shift $((OPTIND - 1))
target_model=$1
batch_size=$2
result_dir=$3

# task-name number-of-shot task-list
n_shot_task=(
    "arc-challenge 25 arc_challenge"
    "hellaswag 10 hellaswag"
    "truthfulqa-mc 0 truthfulqa_mc"
    "mmlu 5 hendrycksTest-abstract_algebra,hendrycksTest-anatomy,hendrycksTest-astronomy,hendrycksTest-business_ethics,hendrycksTest-clinical_knowledge,hendrycksTest-college_biology,hendrycksTest-college_chemistry,hendrycksTest-college_computer_science,hendrycksTest-college_mathematics,hendrycksTest-college_medicine,hendrycksTest-college_physics,hendrycksTest-computer_security,hendrycksTest-conceptual_physics,hendrycksTest-econometrics,hendrycksTest-electrical_engineering,hendrycksTest-elementary_mathematics,hendrycksTest-formal_logic,hendrycksTest-global_facts,hendrycksTest-high_school_biology,hendrycksTest-high_school_chemistry,hendrycksTest-high_school_computer_science,hendrycksTest-high_school_european_history,hendrycksTest-high_school_geography,hendrycksTest-high_school_government_and_politics,hendrycksTest-high_school_macroeconomics,hendrycksTest-high_school_mathematics,hendrycksTest-high_school_microeconomics,hendrycksTest-high_school_physics,hendrycksTest-high_school_psychology,hendrycksTest-high_school_statistics,hendrycksTest-high_school_us_history,hendrycksTest-high_school_world_history,hendrycksTest-human_aging,hendrycksTest-human_sexuality,hendrycksTest-international_law,hendrycksTest-jurisprudence,hendrycksTest-logical_fallacies,hendrycksTest-machine_learning,hendrycksTest-management,hendrycksTest-marketing,hendrycksTest-medical_genetics,hendrycksTest-miscellaneous,hendrycksTest-moral_disputes,hendrycksTest-moral_scenarios,hendrycksTest-nutrition,hendrycksTest-philosophy,hendrycksTest-prehistory,hendrycksTest-professional_accounting,hendrycksTest-professional_law,hendrycksTest-professional_medicine,hendrycksTest-professional_psychology,hendrycksTest-public_relations,hendrycksTest-security_studies,hendrycksTest-sociology,hendrycksTest-us_foreign_policy,hendrycksTest-virology,hendrycksTest-world_religions"
    "winogrande 5 winogrande"
    "gsm8k 5 gsm8k"
    "drop 3 drop"
)
echo "Target model: ${target_model}" | |
echo "Batch size: ${batch_size}" | |
echo "Result dir: ${result_dir}" | |
if [ "${PEFT_BASE_MODEL}" != "" ]; then | |
echo "PEFT base model: ${PEFT_BASE_MODEL}" | |
HARNESS_MODEL_TYPE="hf-causal-experimental" | |
HARNESS_MODEL_ARGS="pretrained=${PEFT_BASE_MODEL},revision=main,peft=${target_model},trust_remote_code=True" | |
else | |
# default | |
HARNESS_MODEL_TYPE="hf-causal" | |
HARNESS_MODEL_ARGS="pretrained=${target_model},revision=main,trust_remote_code=True" | |
fi | |
WRITE_OUT_PATH=${WRITE_OUT_PATH}/${target_model} | |
mkdir -p ${WRITE_OUT_PATH} | |
start_time=`date +%s`
done_task=""
for current_n_shot_task in "${n_shot_task[@]}"
do
    # task-name number-of-shot task-list
    current_task=(${current_n_shot_task})
    echo "---"
    echo "task name: ${current_task[0]}"
    echo "n-shot: ${current_task[1]}"
    echo "task list: ${current_task[2]}"
    if [ "${WRITE_OUT_HARNESS}" != "" ]; then
        WRITE_OUT_PATH=${WRITE_OUT_PATH}/${current_task[0]}
    fi
    output_path="${result_dir}/${current_task[0]}".json
    python main.py \
        --model ${HARNESS_MODEL_TYPE} \
        --model_args ${HARNESS_MODEL_ARGS} \
        --num_fewshot ${current_task[1]} \
        --tasks ${current_task[2]} \
        --batch_size ${batch_size} \
        --output_path ${output_path} \
        ${WRITE_OUT_HARNESS} ${WRITE_OUT_PATH}
    if [ "${WRITE_OUT_HARNESS}" != "" ]; then
        WRITE_OUT_PATH=`dirname ${WRITE_OUT_PATH}`
    fi
    done_task="${done_task} ${current_task[0]}"
done
end_time=`date +%s`
elapsed_time=$((end_time - start_time))
echo "Tasks: ${done_task}"

python additional_save_wandb.py \
    --result_dir ${result_dir} \
    --target_model ${target_model} \
    --elapsed_time ${elapsed_time} \
    --wandb_entity_name ${WANDB_ENTITY} \
    --wandb_project_name ${WANDB_PROJECT} \
    --tasks ${done_task} \
    ${WRITE_OUT_WANDB}

exit
"""lm-evaluation-harness の結果を wandb にアップロードするスニペット | |
アップロードする項目は次の通り: | |
- テストデータに対する各指標 | |
- lm-evaluation-harness の json ファイル一覧 | |
- lm-evaluation-harness の commit-id | |
- lm-evaluation-harness の実行時間 | |
- lm-evaluation-harness の引数(評価対象のモデル、バッチサイズ) | |
- lm-evaluation-hanress の write_out 出力ファイル | |
""" | |
import argparse
import json
import glob
import os
import math
import warnings
from dataclasses import dataclass
from typing import Dict, Optional

import wandb
WANDB_TABLE_NAME = "open-llm-leaderboard"
TASK_METRIC_MAPPING = {  # https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard
    "arc-challenge": "acc_norm",
    "hellaswag": "acc_norm",
    "truthfulqa-mc": "mc2",
    "mmlu": "acc",
    "winogrande": "acc",
    "gsm8k": "acc",
    "drop": "f1",
}
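# Sketch of the per-task json that load_results below expects from
# lm-evaluation-harness (illustrative values; the exact metric keys vary
# by task and harness version):
# {
#   "results": {"arc_challenge": {"acc": 0.51, "acc_norm": 0.55}, ...},
#   "config": {"model": "...", "num_fewshot": 25, ...}
# }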
@dataclass
class SingleTaskResult:
    results: Dict[str, Dict[str, float]]
    result_json_file: str
    write_out_dir: Optional[str]
    config: Dict
def get_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser()
    parser.add_argument("--result_dir", type=str, required=True, help="Output directory of the lm-evaluation-harness results")
    parser.add_argument("--is_write_out", action="store_true", help="Whether the lm-evaluation-harness write_out files should be saved")
    parser.add_argument("--target_model", type=str, required=True, help="Model under evaluation")
    parser.add_argument("--batch_size", type=int, required=True, help="Batch size used for evaluation")
    parser.add_argument("--commit_id", type=str, required=True, help="Commit-id of lm-evaluation-harness")
    parser.add_argument("--elapsed_time", type=int, required=True, help="Elapsed time of the lm-evaluation-harness run [sec]")
    parser.add_argument("--wandb_entity_name", required=True, help="WandB entity name")
    parser.add_argument("--wandb_project_name", required=True, help="WandB project name")
    return parser.parse_args()
def load_results(result_dir: str, is_write_out: bool, target_model: str) -> Dict[str, SingleTaskResult]:
    """Load the lm-evaluation-harness results.

    Returns:
        Dict[str, SingleTaskResult]: results keyed by task name
    """
    result_files = glob.glob(f"{result_dir}/*.json")
    results: Dict[str, SingleTaskResult] = {}
    for result_file in result_files:
        with open(result_file) as f:
            result = json.load(f)
        task_name = os.path.basename(result_file).replace(".json", "")
        task_scores = result["results"]
        config = result["config"]
        write_out_dir = os.path.join("write_out", target_model, task_name) if is_write_out else None
        results[task_name] = SingleTaskResult(
            results=task_scores,
            result_json_file=result_file,
            write_out_dir=write_out_dir,
            config=config,
        )
    return results
def upload_wandb(
    data: Dict[str, SingleTaskResult],
    entity_name: str,
    project_name: str,
    target_model: str,
    commit_id: str,
    elapsed_time: int,
    batch_size: int,
) -> None:
    """Upload the results to wandb.

    What gets stored in wandb:
    - each metric on the test data (including the average): stored as a Table
    - the list of lm-evaluation-harness json files: stored as an artifact
    - the lm-evaluation-harness commit-id: config
    - the lm-evaluation-harness elapsed time: Table
    - the lm-evaluation-harness arguments (model under evaluation, batch size): config (the model name is also stored in the Table)
    - the lm-evaluation-harness write_out output files: artifact
    """
    def post_process_results(lm_evaluation_results: Dict[str, Dict[str, float]], task_name: str) -> float:
        """Average the lm-evaluation-harness output into a single float value."""
        scores = []  # holds the scores of the configured metric
        for _, metric_score in lm_evaluation_results.items():
            target_metric = TASK_METRIC_MAPPING[task_name]
            score: float = metric_score[target_metric]
            if math.isnan(score):
                warnings.warn(f"Metric {target_metric} was NaN for task {task_name}.")
            scores.append(score)
        return sum(scores) / len(scores)

    def extract_dir_info(data: Dict[str, SingleTaskResult], data_type: str) -> str:
        """Extract the lm-evaluation-harness output directory."""
        if data_type == "result":
            return os.path.dirname(data[list(data.keys())[0]].result_json_file)
        elif data_type == "output":
            return os.path.dirname(data[list(data.keys())[0]].write_out_dir)
        else:
            raise ValueError(f"Invalid data_type: {data_type}")
    wandb_config = {
        "lm_evaluation_harness_commit_id": commit_id,
        "target_model": target_model,
        "batch_size": batch_size,
    }
    with wandb.init(
        entity=entity_name, project=project_name, name=target_model, config=wandb_config
    ) as run:
        columns = ["model_name", "Average"]  # model_name + avg_score + task_names + elapsed_time
        items = [target_model]  # model_name + avg_score (inserted below) + task_scores + elapsed_time
        for task_name, result in data.items():
            # make table
            columns.append(task_name)
            result_for_table = post_process_results(result.results, task_name)
            items.append(result_for_table)
        scores_without_nan = [score for score in items[1:] if not math.isnan(score)]
        avg_score = sum(scores_without_nan) / len(scores_without_nan)
        items.insert(1, avg_score)
        # Add time column
        columns.append("Elapsed Time")
        items.append(elapsed_time)
        # Add the 1D array and save the table
        lm_evaluation_harness_table = wandb.Table(columns=columns, data=[items])
        run.log({WANDB_TABLE_NAME: lm_evaluation_harness_table})
        # save artifacts
        result_json_dir = extract_dir_info(data, data_type="result")
        result_artifact_name = target_model.replace("/", ".") + ".result"
        result_artifact = wandb.Artifact(
            result_artifact_name, type="lm-evaluation-harness-result"
        )
        result_artifact.add_dir(result_json_dir)
        wandb.log_artifact(result_artifact)
        # write_out_dir is either set for every task or for none, so checking any task suffices
        if any(r.write_out_dir is not None for r in data.values()):
            result_output_dir = extract_dir_info(data, data_type="output")
            output_artifact_name = target_model.replace("/", ".") + ".output"
            output_artifact = wandb.Artifact(
                output_artifact_name, type="lm-evaluation-harness-output"
            )
            output_artifact.add_dir(result_output_dir)
            wandb.log_artifact(output_artifact)
    print("Finish Upload.")
def main():
    args = get_args()
    # load the lm-evaluation-harness results
    data = load_results(args.result_dir, args.is_write_out, args.target_model)
    # upload them to wandb
    upload_wandb(
        data, args.wandb_entity_name, args.wandb_project_name, args.target_model, args.commit_id, args.elapsed_time, args.batch_size
    )


if __name__ == "__main__":
    main()
Usage

For a PEFT model, use the following command:

bash run_open_llm_leaderboard.sh -w -l ${base_model_name_or_path} ${peft_model_name_or_path} ${batch_size} ${result_dir}
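For a plain (non-PEFT) model, the analogous invocation would be the following (a sketch inferred from the script's option handling, not a command shown in the original; omit -w if the write_out files are not needed):

bash run_open_llm_leaderboard.sh -w ${model_name_or_path} ${batch_size} ${result_dir}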