Run open-llm-leaderboard locally
"""追加で実施した lm-evaluation-harness の結果を wandb に Upload する | |
注意事項: | |
- batch_size, commit_id は、lm-evaluation-harness の実行時のものを指定すること | |
- is_write_out もできれば lm-evaluation-harness の実行時のものを指定すること | |
- average は追加したタスクを反映させた結果が上書きされる | |
- artifact は追加で実施した lm-evaluation-harness の結果のみ Upload される(ただし、以前に実行した結果がローカルに残っている場合は、それも Upload される) | |
- 古い結果は wandb の UI 上で version を選択して確認する | |
""" | |
import argparse
import json
import os
import math
import warnings
from typing import List, Dict

import wandb

from save_wandb import WANDB_TABLE_NAME, TASK_METRIC_MAPPING, SingleTaskResult
def get_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser()
    parser.add_argument("--result_dir", type=str, required=True, help="Output directory of the lm-evaluation-harness results")
    parser.add_argument("--is_write_out", action="store_true", help="Whether the lm-evaluation-harness write_out files should be saved")
    parser.add_argument("--target_model", type=str, required=True, help="Model under evaluation")
    parser.add_argument("--elapsed_time", type=int, required=True, help="Elapsed time of the additional lm-evaluation-harness run [sec]")
    parser.add_argument("--wandb_entity_name", required=True, help="WandB entity name")
    parser.add_argument("--wandb_project_name", required=True, help="WandB project name")
    parser.add_argument("--tasks", nargs="+", required=True, help="Names of the additionally evaluated tasks")
    return parser.parse_args()
def load_results(result_dir: str, is_write_out: bool, target_model: str, tasks: List[str]) -> Dict[str, SingleTaskResult]:
    """Load the results of the tasks listed in `tasks`."""
    result_files = [f"{result_dir}/{task}.json" for task in tasks]
    results: Dict[str, SingleTaskResult] = {}
    for result_file in result_files:
        with open(result_file) as f:
            result = json.load(f)
        task_name = os.path.basename(result_file).replace(".json", "")
        task_scores = result["results"]
        config = result["config"]
        write_out_dir = os.path.join("write_out", target_model, task_name) if is_write_out else None
        results[task_name] = SingleTaskResult(
            results=task_scores,
            result_json_file=result_file,
            write_out_dir=write_out_dir,
            config=config,
        )
    return results
def upload_wandb(
    data: Dict[str, SingleTaskResult],
    entity_name: str,
    project_name: str,
    target_model: str,
    elapsed_time: int,
) -> None:
    """Upload the additional results to wandb."""

    def get_run(entity_name: str, project_name: str, target_model: str) -> wandb.apis.public.Run:
        api = wandb.Api()
        runs = api.runs(f"{entity_name}/{project_name}", filters={"config.target_model": target_model})
        assert len(runs) == 1, f"len(runs) must be 1, but {len(runs)}"
        run = list(runs)[0]
        return run
    def post_process_results(lm_evaluation_results: Dict[str, Dict[str, float]], task_name: str) -> float:
        """Average the lm-evaluation-harness output into a single value."""
        scores = []
        for _, metric_score in lm_evaluation_results.items():
            target_metric = TASK_METRIC_MAPPING[task_name]
            score: float = metric_score[target_metric]
            if math.isnan(score):
                warnings.warn(f"Metric {target_metric} was NaN for task {task_name}.")
            scores.append(score)
        return sum(scores) / len(scores)
    def extract_dir_info(data: Dict[str, SingleTaskResult], data_type: str) -> str:
        """Extract the lm-evaluation-harness output directory."""
        if data_type == "result":
            return os.path.dirname(data[list(data.keys())[0]].result_json_file)
        elif data_type == "output":
            return os.path.dirname(data[list(data.keys())[0]].write_out_dir)
        else:
            raise ValueError(f"Invalid data_type: {data_type}")
    def get_new_average(updated_table: wandb.Table) -> float:
        """Recompute the Average column.

        Excluded columns: model_name, Average, Elapsed Time, and any column
        containing nan or None. wandb treats nan values as None, so columns
        containing None are excluded as well.
        """
        exclude_columns = ["model_name", "Average", "Elapsed Time"]
        target_scores = updated_table.data[0]
        target_scores = [
            score
            for column, score in zip(updated_table.columns, target_scores)
            if column not in exclude_columns and score is not None and not math.isnan(score)
        ]
        return sum(target_scores) / len(target_scores)
    run = get_run(entity_name, project_name, target_model)
    run_id = run.id
    run_config = json.loads(run.json_config)  # {key: {"value": value, "desc": desc}, ...}
    wandb_config = {key: value["value"] for key, value in run_config.items()}
    with wandb.init(id=run_id, project=project_name, entity=entity_name, config=wandb_config) as run:
        # get the existing table; "-" is dropped from the table key when it is turned
        # into the artifact name, so drop it here as well
        table_name = WANDB_TABLE_NAME.replace("-", "")
        artifact_name = f"{entity_name}/{project_name}/run-{run.id}-{table_name}:latest"
        api_artifact = wandb.Api().artifact(artifact_name)
        table = run.use_artifact(api_artifact).get(WANDB_TABLE_NAME)
        # update average, elapsed time, and task columns
        updated_table = wandb.Table(columns=table.columns, data=table.data)
        column2index = {column: index for index, column in enumerate(updated_table.columns)}
        updated_table.data[0][column2index["Elapsed Time"]] += elapsed_time
        for task_name, task_result in data.items():
            updated_table.add_column(task_name, [post_process_results(task_result.results, task_name)])
        # recompute the average, excluding model_name, the old Average, and Elapsed Time
        updated_table.data[0][column2index["Average"]] = get_new_average(updated_table)
        # update table
        run.log({WANDB_TABLE_NAME: updated_table})
        # save artifacts
        result_json_dir = extract_dir_info(data, data_type="result")
        result_artifact_name = target_model.replace("/", ".") + ".result"
        result_artifact = wandb.Artifact(
            result_artifact_name, type="lm-evaluation-harness-result"
        )
        result_artifact.add_dir(result_json_dir)
        wandb.log_artifact(result_artifact)
        # write_out_dir is either set for every task or for none, so checking any task suffices
        if any(task_result.write_out_dir is not None for task_result in data.values()):
            result_output_dir = extract_dir_info(data, data_type="output")
            output_artifact_name = target_model.replace("/", ".") + ".output"
            output_artifact = wandb.Artifact(
                output_artifact_name, type="lm-evaluation-harness-output"
            )
            output_artifact.add_dir(result_output_dir)
            wandb.log_artifact(output_artifact)
    print("Finish Upload.")
def main():
    args = get_args()
    # load the results of the additional lm-evaluation-harness run
    data = load_results(args.result_dir, args.is_write_out, args.target_model, args.tasks)
    # upload them to wandb
    upload_wandb(
        data, args.wandb_entity_name, args.wandb_project_name, args.target_model, args.elapsed_time
    )


if __name__ == "__main__":
    main()
#!/bin/bash
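# Usage sketch for this runner (inferred from the option and positional-argument
# handling below; the filename is taken from the PEFT example at the end of this gist):
#   bash run_open_llm_leaderboard.sh [-w] [-l <peft_base_model>] <target_model> <batch_size> <result_dir>
#     -w  also produce the harness write_out files and upload them to wandb
#     -l  treat <target_model> as a PEFT adapter on top of <peft_base_model>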
set -eu

export HF_HOME=  # Need setting
WANDB_ENTITY=  # Need setting
WANDB_PROJECT=  # Need setting

WRITE_OUT_HARNESS=""
WRITE_OUT_PATH=""
WRITE_OUT_WANDB=""
PEFT_BASE_MODEL=""
while getopts "wl:" OPT
do
    case $OPT in
        w)
            WRITE_OUT_HARNESS="--write_out --output_base_path"
            WRITE_OUT_PATH="./write_out"
            WRITE_OUT_WANDB="--is_write_out"
            ;;
        l)
            PEFT_BASE_MODEL="${OPTARG}";;
    esac
done
shift $((OPTIND - 1))
target_model=$1
batch_size=$2
result_dir=$3

# task-name number-of-shot task-list
n_shot_task=(
    "arc-challenge 25 arc_challenge"
    "hellaswag 10 hellaswag"
    "truthfulqa-mc 0 truthfulqa_mc"
    "mmlu 5 hendrycksTest-abstract_algebra,hendrycksTest-anatomy,hendrycksTest-astronomy,hendrycksTest-business_ethics,hendrycksTest-clinical_knowledge,hendrycksTest-college_biology,hendrycksTest-college_chemistry,hendrycksTest-college_computer_science,hendrycksTest-college_mathematics,hendrycksTest-college_medicine,hendrycksTest-college_physics,hendrycksTest-computer_security,hendrycksTest-conceptual_physics,hendrycksTest-econometrics,hendrycksTest-electrical_engineering,hendrycksTest-elementary_mathematics,hendrycksTest-formal_logic,hendrycksTest-global_facts,hendrycksTest-high_school_biology,hendrycksTest-high_school_chemistry,hendrycksTest-high_school_computer_science,hendrycksTest-high_school_european_history,hendrycksTest-high_school_geography,hendrycksTest-high_school_government_and_politics,hendrycksTest-high_school_macroeconomics,hendrycksTest-high_school_mathematics,hendrycksTest-high_school_microeconomics,hendrycksTest-high_school_physics,hendrycksTest-high_school_psychology,hendrycksTest-high_school_statistics,hendrycksTest-high_school_us_history,hendrycksTest-high_school_world_history,hendrycksTest-human_aging,hendrycksTest-human_sexuality,hendrycksTest-international_law,hendrycksTest-jurisprudence,hendrycksTest-logical_fallacies,hendrycksTest-machine_learning,hendrycksTest-management,hendrycksTest-marketing,hendrycksTest-medical_genetics,hendrycksTest-miscellaneous,hendrycksTest-moral_disputes,hendrycksTest-moral_scenarios,hendrycksTest-nutrition,hendrycksTest-philosophy,hendrycksTest-prehistory,hendrycksTest-professional_accounting,hendrycksTest-professional_law,hendrycksTest-professional_medicine,hendrycksTest-professional_psychology,hendrycksTest-public_relations,hendrycksTest-security_studies,hendrycksTest-sociology,hendrycksTest-us_foreign_policy,hendrycksTest-virology,hendrycksTest-world_religions"
    "winogrande 5 winogrande"
    "gsm8k 5 gsm8k"
    "drop 3 drop"
)
echo "Target model: ${target_model}" | |
echo "Batch size: ${batch_size}" | |
echo "Result dir: ${result_dir}" | |
if [ "${PEFT_BASE_MODEL}" != "" ]; then | |
echo "PEFT base model: ${PEFT_BASE_MODEL}" | |
HARNESS_MODEL_TYPE="hf-causal-experimental" | |
HARNESS_MODEL_ARGS="pretrained=${PEFT_BASE_MODEL},revision=main,peft=${target_model},trust_remote_code=True" | |
else | |
# default | |
HARNESS_MODEL_TYPE="hf-causal" | |
HARNESS_MODEL_ARGS="pretrained=${target_model},revision=main,trust_remote_code=True" | |
fi | |
WRITE_OUT_PATH=${WRITE_OUT_PATH}/${target_model} | |
mkdir -p ${WRITE_OUT_PATH} | |
start_time=`date +%s`
for current_n_shot_task in "${n_shot_task[@]}"
do
    # task-name number-of-shot task-list
    current_task=(${current_n_shot_task})
    echo "---"
    echo "task name: ${current_task[0]}"
    echo "n-shot: ${current_task[1]}"
    echo "task list: ${current_task[2]}"
    if [ "${WRITE_OUT_HARNESS}" != "" ]; then
        WRITE_OUT_PATH=${WRITE_OUT_PATH}/${current_task[0]}
    fi
    output_path="${result_dir}/${current_task[0]}".json
    python main.py \
        --model ${HARNESS_MODEL_TYPE} \
        --model_args ${HARNESS_MODEL_ARGS} \
        --num_fewshot ${current_task[1]} \
        --tasks ${current_task[2]} \
        --batch_size ${batch_size} \
        --output_path ${output_path} \
        ${WRITE_OUT_HARNESS} ${WRITE_OUT_PATH}
    if [ "${WRITE_OUT_HARNESS}" != "" ]; then
        WRITE_OUT_PATH=`dirname ${WRITE_OUT_PATH}`
    fi
done
end_time=`date +%s`
elapsed_time=$((end_time - start_time))
commit_id=`git show --format='%h' --no-patch`

python save_wandb.py \
    --result_dir ${result_dir} \
    --target_model ${target_model} \
    --batch_size ${batch_size} \
    --commit_id ${commit_id} \
    --elapsed_time ${elapsed_time} \
    --wandb_entity_name ${WANDB_ENTITY} \
    --wandb_project_name ${WANDB_PROJECT} \
    ${WRITE_OUT_WANDB}

exit
#!/bin/bash
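# Variant of the runner above that evaluates extra tasks and merges them into an
# existing wandb run via additional_save_wandb.py. A usage sketch (same options and
# positional arguments as above; this script's filename is not shown in the gist,
# and n_shot_task below would be edited down to the tasks you want to add):
#   bash <this_script>.sh [-w] [-l <peft_base_model>] <target_model> <batch_size> <result_dir>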
set -eu

export HF_HOME=  # Need setting
WANDB_ENTITY=  # Need setting
WANDB_PROJECT=  # Need setting

WRITE_OUT_HARNESS=""
WRITE_OUT_PATH=""
WRITE_OUT_WANDB=""
PEFT_BASE_MODEL=""
while getopts "wl:" OPT
do
    case $OPT in
        w)
            WRITE_OUT_HARNESS="--write_out --output_base_path"
            WRITE_OUT_PATH="./write_out"
            WRITE_OUT_WANDB="--is_write_out"
            ;;
        l)
            PEFT_BASE_MODEL="${OPTARG}";;
    esac
done
shift $((OPTIND - 1))
target_model=$1
batch_size=$2
result_dir=$3

# task-name number-of-shot task-list
n_shot_task=(
    "arc-challenge 25 arc_challenge"
    "hellaswag 10 hellaswag"
    "truthfulqa-mc 0 truthfulqa_mc"
    "mmlu 5 hendrycksTest-abstract_algebra,hendrycksTest-anatomy,hendrycksTest-astronomy,hendrycksTest-business_ethics,hendrycksTest-clinical_knowledge,hendrycksTest-college_biology,hendrycksTest-college_chemistry,hendrycksTest-college_computer_science,hendrycksTest-college_mathematics,hendrycksTest-college_medicine,hendrycksTest-college_physics,hendrycksTest-computer_security,hendrycksTest-conceptual_physics,hendrycksTest-econometrics,hendrycksTest-electrical_engineering,hendrycksTest-elementary_mathematics,hendrycksTest-formal_logic,hendrycksTest-global_facts,hendrycksTest-high_school_biology,hendrycksTest-high_school_chemistry,hendrycksTest-high_school_computer_science,hendrycksTest-high_school_european_history,hendrycksTest-high_school_geography,hendrycksTest-high_school_government_and_politics,hendrycksTest-high_school_macroeconomics,hendrycksTest-high_school_mathematics,hendrycksTest-high_school_microeconomics,hendrycksTest-high_school_physics,hendrycksTest-high_school_psychology,hendrycksTest-high_school_statistics,hendrycksTest-high_school_us_history,hendrycksTest-high_school_world_history,hendrycksTest-human_aging,hendrycksTest-human_sexuality,hendrycksTest-international_law,hendrycksTest-jurisprudence,hendrycksTest-logical_fallacies,hendrycksTest-machine_learning,hendrycksTest-management,hendrycksTest-marketing,hendrycksTest-medical_genetics,hendrycksTest-miscellaneous,hendrycksTest-moral_disputes,hendrycksTest-moral_scenarios,hendrycksTest-nutrition,hendrycksTest-philosophy,hendrycksTest-prehistory,hendrycksTest-professional_accounting,hendrycksTest-professional_law,hendrycksTest-professional_medicine,hendrycksTest-professional_psychology,hendrycksTest-public_relations,hendrycksTest-security_studies,hendrycksTest-sociology,hendrycksTest-us_foreign_policy,hendrycksTest-virology,hendrycksTest-world_religions"
    "winogrande 5 winogrande"
    "gsm8k 5 gsm8k"
    "drop 3 drop"
)
echo "Target model: ${target_model}" | |
echo "Batch size: ${batch_size}" | |
echo "Result dir: ${result_dir}" | |
if [ "${PEFT_BASE_MODEL}" != "" ]; then | |
echo "PEFT base model: ${PEFT_BASE_MODEL}" | |
HARNESS_MODEL_TYPE="hf-causal-experimental" | |
HARNESS_MODEL_ARGS="pretrained=${PEFT_BASE_MODEL},revision=main,peft=${target_model},trust_remote_code=True" | |
else | |
# default | |
HARNESS_MODEL_TYPE="hf-causal" | |
HARNESS_MODEL_ARGS="pretrained=${target_model},revision=main,trust_remote_code=True" | |
fi | |
WRITE_OUT_PATH=${WRITE_OUT_PATH}/${target_model} | |
mkdir -p ${WRITE_OUT_PATH} | |
start_time=`date +%s`
done_task=""
for current_n_shot_task in "${n_shot_task[@]}"
do
    # task-name number-of-shot task-list
    current_task=(${current_n_shot_task})
    echo "---"
    echo "task name: ${current_task[0]}"
    echo "n-shot: ${current_task[1]}"
    echo "task list: ${current_task[2]}"
    if [ "${WRITE_OUT_HARNESS}" != "" ]; then
        WRITE_OUT_PATH=${WRITE_OUT_PATH}/${current_task[0]}
    fi
    output_path="${result_dir}/${current_task[0]}".json
    python main.py \
        --model ${HARNESS_MODEL_TYPE} \
        --model_args ${HARNESS_MODEL_ARGS} \
        --num_fewshot ${current_task[1]} \
        --tasks ${current_task[2]} \
        --batch_size ${batch_size} \
        --output_path ${output_path} \
        ${WRITE_OUT_HARNESS} ${WRITE_OUT_PATH}
    if [ "${WRITE_OUT_HARNESS}" != "" ]; then
        WRITE_OUT_PATH=`dirname ${WRITE_OUT_PATH}`
    fi
    done_task="${done_task} ${current_task[0]}"
done
end_time=`date +%s`
elapsed_time=$((end_time - start_time))
echo "Tasks: ${done_task}"

python additional_save_wandb.py \
    --result_dir ${result_dir} \
    --target_model ${target_model} \
    --elapsed_time ${elapsed_time} \
    --wandb_entity_name ${WANDB_ENTITY} \
    --wandb_project_name ${WANDB_PROJECT} \
    --tasks ${done_task} \
    ${WRITE_OUT_WANDB}

exit
"""lm-evaluation-harness の結果を wandb にアップロードするスニペット | |
アップロードする項目は次の通り: | |
- テストデータに対する各指標 | |
- lm-evaluation-harness の json ファイル一覧 | |
- lm-evaluation-harness の commit-id | |
- lm-evaluation-harness の実行時間 | |
- lm-evaluation-harness の引数(評価対象のモデル、バッチサイズ) | |
- lm-evaluation-hanress の write_out 出力ファイル | |
""" | |
import argparse
import json
import glob
import os
import math
import warnings
from dataclasses import dataclass
from typing import Dict, Optional

import wandb
WANDB_TABLE_NAME = "open-llm-leaderboard"
TASK_METRIC_MAPPING = {  # https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard
    "arc-challenge": "acc_norm",
    "hellaswag": "acc_norm",
    "truthfulqa-mc": "mc2",
    "mmlu": "acc",
    "winogrande": "acc",
    "gsm8k": "acc",
    "drop": "f1",
}
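# Sketch of the per-task json that load_results below expects from
# lm-evaluation-harness (illustrative values; the exact metric keys vary
# by task and harness version):
# {
#   "results": {"arc_challenge": {"acc": 0.51, "acc_norm": 0.55}, ...},
#   "config": {"model": "...", "num_fewshot": 25, ...}
# }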
@dataclass
class SingleTaskResult:
    results: Dict[str, Dict[str, float]]
    result_json_file: str
    write_out_dir: Optional[str]
    config: Dict
def get_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser()
    parser.add_argument("--result_dir", type=str, required=True, help="Output directory of the lm-evaluation-harness results")
    parser.add_argument("--is_write_out", action="store_true", help="Whether the lm-evaluation-harness write_out files should be saved")
    parser.add_argument("--target_model", type=str, required=True, help="Model under evaluation")
    parser.add_argument("--batch_size", type=int, required=True, help="Batch size used for evaluation")
    parser.add_argument("--commit_id", type=str, required=True, help="Commit-id of lm-evaluation-harness")
    parser.add_argument("--elapsed_time", type=int, required=True, help="Elapsed time of the lm-evaluation-harness run [sec]")
    parser.add_argument("--wandb_entity_name", required=True, help="WandB entity name")
    parser.add_argument("--wandb_project_name", required=True, help="WandB project name")
    return parser.parse_args()
def load_results(result_dir: str, is_write_out: bool, target_model: str) -> Dict[str, SingleTaskResult]:
    """Load the lm-evaluation-harness results.

    Returns:
        Dict[str, SingleTaskResult]: results keyed by task name
    """
    result_files = glob.glob(f"{result_dir}/*.json")
    results: Dict[str, SingleTaskResult] = {}
    for result_file in result_files:
        with open(result_file) as f:
            result = json.load(f)
        task_name = os.path.basename(result_file).replace(".json", "")
        task_scores = result["results"]
        config = result["config"]
        write_out_dir = os.path.join("write_out", target_model, task_name) if is_write_out else None
        results[task_name] = SingleTaskResult(
            results=task_scores,
            result_json_file=result_file,
            write_out_dir=write_out_dir,
            config=config,
        )
    return results
def upload_wandb(
    data: Dict[str, SingleTaskResult],
    entity_name: str,
    project_name: str,
    target_model: str,
    commit_id: str,
    elapsed_time: int,
    batch_size: int,
) -> None:
    """Upload the results to wandb.

    What gets stored in wandb:
    - each metric on the test data (including the average): stored as a Table
    - the list of lm-evaluation-harness json files: stored as an artifact
    - the lm-evaluation-harness commit-id: config
    - the lm-evaluation-harness elapsed time: Table
    - the lm-evaluation-harness arguments (model under evaluation, batch size): config (the model name is also stored in the Table)
    - the lm-evaluation-harness write_out output files: artifact
    """
    def post_process_results(lm_evaluation_results: Dict[str, Dict[str, float]], task_name: str) -> float:
        """Average the lm-evaluation-harness output into a single float value."""
        scores = []  # holds the scores of the configured metric
        for _, metric_score in lm_evaluation_results.items():
            target_metric = TASK_METRIC_MAPPING[task_name]
            score: float = metric_score[target_metric]
            if math.isnan(score):
                warnings.warn(f"Metric {target_metric} was NaN for task {task_name}.")
            scores.append(score)
        return sum(scores) / len(scores)

    def extract_dir_info(data: Dict[str, SingleTaskResult], data_type: str) -> str:
        """Extract the lm-evaluation-harness output directory."""
        if data_type == "result":
            return os.path.dirname(data[list(data.keys())[0]].result_json_file)
        elif data_type == "output":
            return os.path.dirname(data[list(data.keys())[0]].write_out_dir)
        else:
            raise ValueError(f"Invalid data_type: {data_type}")
    wandb_config = {
        "lm_evaluation_harness_commit_id": commit_id,
        "target_model": target_model,
        "batch_size": batch_size,
    }
    with wandb.init(
        entity=entity_name, project=project_name, name=target_model, config=wandb_config
    ) as run:
        columns = ["model_name", "Average"]  # model_name + avg_score + task_names + elapsed_time
        items = [target_model]  # model_name + avg_score (inserted below) + task_scores + elapsed_time
        for task_name, result in data.items():
            # make table
            columns.append(task_name)
            result_for_table = post_process_results(result.results, task_name)
            items.append(result_for_table)
        scores_without_nan = [score for score in items[1:] if not math.isnan(score)]
        avg_score = sum(scores_without_nan) / len(scores_without_nan)
        items.insert(1, avg_score)
        # Add time column
        columns.append("Elapsed Time")
        items.append(elapsed_time)
        # Add the 1D array and save the table
        lm_evaluation_harness_table = wandb.Table(columns=columns, data=[items])
        run.log({WANDB_TABLE_NAME: lm_evaluation_harness_table})
        # save artifacts
        result_json_dir = extract_dir_info(data, data_type="result")
        result_artifact_name = target_model.replace("/", ".") + ".result"
        result_artifact = wandb.Artifact(
            result_artifact_name, type="lm-evaluation-harness-result"
        )
        result_artifact.add_dir(result_json_dir)
        wandb.log_artifact(result_artifact)
        # write_out_dir is either set for every task or for none, so checking any task suffices
        if any(r.write_out_dir is not None for r in data.values()):
            result_output_dir = extract_dir_info(data, data_type="output")
            output_artifact_name = target_model.replace("/", ".") + ".output"
            output_artifact = wandb.Artifact(
                output_artifact_name, type="lm-evaluation-harness-output"
            )
            output_artifact.add_dir(result_output_dir)
            wandb.log_artifact(output_artifact)
    print("Finish Upload.")
def main():
    args = get_args()
    # load the lm-evaluation-harness results
    data = load_results(args.result_dir, args.is_write_out, args.target_model)
    # upload them to wandb
    upload_wandb(
        data, args.wandb_entity_name, args.wandb_project_name, args.target_model, args.commit_id, args.elapsed_time, args.batch_size
    )


if __name__ == "__main__":
    main()
Usage

For a PEFT model, use the following command:

bash run_open_llm_leaderboard.sh -w -l ${base_model_name_or_path} ${peft_model_name_or_path} ${batch_size} ${result_dir}
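For a plain (non-PEFT) model, the analogous invocation would be the following (a sketch inferred from the script's option handling, not a command shown in the original; omit -w if the write_out files are not needed):

bash run_open_llm_leaderboard.sh -w ${model_name_or_path} ${batch_size} ${result_dir}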