danyaljj/wmt21.yaml

## wmt21.yaml
show_unpublished_scores: true
datasets:
  blind_labels: danielk/genie_labels
evaluator:
  image: jbragg/genie-evaluator
  input_path: /preds/
  predictions_filename: predictions.json
  label_path: /labels/
  output_path: /results
  arguments:
    - python
    - /app/evaluator.py
    - '--gold_label_file'
    - /labels/wmt21_gold_outputs.json
    - '--prediction_file'
    - /preds/predictions.json
    - '--output'
    - /results/metrics.json
  # Submissions are ranked by the first metric.
  metrics:
    - key: human_mean
      display_name: "Human"
      supplemental: true
    - key: human_mean_ci_upper
      display_name: "Human (+err)"
      supplemental: true
    - key: human_mean_ci_lower
      display_name: "Human (-err)"
      supplemental: true
    - key: bert_score
      display_name: "BERTScore"
    - key: rouge
      display_name: "ROUGE"
    - key: meteor
      display_name: "METEOR"
    - key: sacrebleu
      display_name: "SacreBLEU"
    - key: bleurt
      display_name: "BLEURT"
metadata:
  tag_ids:
    - ai2
    - new
  logo: /assets/images/leaderboard/genie_mt/logo.svg
  short_name: GENIE - Machine Translation (WMT21)
  long_name: GENeratIve Evaluation (GENIE) - Machine Translation (WMT21)
  description:
    GENIE is an evaluation leaderboard for tasks that involve text generation on a diverse set of tasks.
    It contains a suite of seven existing datasets (direct-answer question answering, translation, common-sense reasoning, paraphrasing, etc).
    The goal of this leaderboard is to provide evaluation for generative tasks and to encourage the AI community
    to think about more difficult challenges.

    This is the Machine Translation leaderboard component of GENIE.
  example: |
    For example, see the [Huggingface dataset explorer](https://huggingface.co/nlp/viewer/?dataset=wmt21&config=cs-en).

    The above link is for the cs-en config, but you should download the de-en config instead.

    You may not use any reference translations from the test split.
  getting_the_data: |
    Download the German-English portion of WMT'21, for example from HuggingFace as
    ```
    import datasets
    data = datasets.load_dataset('wmt21', 'de-en', version='1.0.1')
    ```

  scoring: |
    We measure performance in multiple ways. First, we have common metrics for text
     generation tasks. In addition to the automatic metrics, we compute human scores
    for the predictions by using crowdsourced annotations.

  predictions_format: |
    Your model should produce a `predictions.json` file that is a mapping from id to generated output for the test split.
    Here's an example of the file:
    ```
    {
      "[ID1]": "[OUTPUT1]",
      "[ID2]": "[OUTPUT2]",
      ...
    }
    ```

    If you imported your data using
    ```
    data = datasets.load_dataset('wmt21', 'de-en', version='1.0.1')
    ```
    then the test split is available as: `data['test']`

    You may not use any reference translations from the test split.

    For further details, check [GENIE's website](https://genie.apps.allenai.org/submitting).


  example_models:
  team:
    name: AI2
    description: |
      The human evaluation on leaderboards is a joint project across different teams at the [Allen Institute for AI](https://allenai.org).

  purpose: |
    Part of AI2's mission is to measure the capabilities by state of the art AI systems.
    This leaderboard collects evaluations of current AI systems on various commonsense/reasoning tasks that measure both the knowledge that these systems possess as well as their
    ability to reason with and use that knowledge in context.
  show_compact_table_button: true
  metric_precision: 2
disable_publish_speed_bump: true
metrics_table:
  columns:
    - name: Human
      description: "Human evaluation score computed by GENIE."
      renderer: error
      metric_keys: ["human_mean", "human_mean_ci_upper", "human_mean_ci_lower"]
    - name: BERTScore
      renderer: simple
      metric_keys:
        - bert_score
    - name: ROUGE
      renderer: simple
      metric_keys:
        - rouge
    - name: METEOR
      renderer: simple
      metric_keys:
        - meteor
    - name: SacreBLEU
      renderer: simple
      metric_keys:
        - sacrebleu
    - name: BLEURT
      renderer: simple
      metric_keys:
        - bleurt
	show_unpublished_scores: true
	datasets:
	blind_labels: danielk/genie_labels
	evaluator:
	image: jbragg/genie-evaluator
	input_path: /preds/
	predictions_filename: predictions.json
	label_path: /labels/
	output_path: /results
	arguments:
	- python
	- /app/evaluator.py
	- '--gold_label_file'
	- /labels/wmt21_gold_outputs.json
	- '--prediction_file'
	- /preds/predictions.json
	- '--output'
	- /results/metrics.json
	# Submissions are ranked by the first metric.
	metrics:
	- key: human_mean
	display_name: "Human"
	supplemental: true
	- key: human_mean_ci_upper
	display_name: "Human (+err)"
	supplemental: true
	- key: human_mean_ci_lower
	display_name: "Human (-err)"
	supplemental: true
	- key: bert_score
	display_name: "BERTScore"
	- key: rouge
	display_name: "ROUGE"
	- key: meteor
	display_name: "METEOR"
	- key: sacrebleu
	display_name: "SacreBLEU"
	- key: bleurt
	display_name: "BLEURT"
	metadata:
	tag_ids:
	- ai2
	- new
	logo: /assets/images/leaderboard/genie_mt/logo.svg
	short_name: GENIE - Machine Translation (WMT21)
	long_name: GENeratIve Evaluation (GENIE) - Machine Translation (WMT21)
	description:
	GENIE is an evaluation leaderboard for tasks that involve text generation on a diverse set of tasks.
	It contains a suite of seven existing datasets (direct-answer question answering, translation, common-sense reasoning, paraphrasing, etc).
	The goal of this leaderboard is to provide evaluation for generative tasks and to encourage the AI community
	to think about more difficult challenges.

	This is the Machine Translation leaderboard component of GENIE.
	example: \|
	For example, see the [Huggingface dataset explorer](https://huggingface.co/nlp/viewer/?dataset=wmt21&config=cs-en).

	The above link is for the cs-en config, but you should download the de-en config instead.

	You may not use any reference translations from the test split.
	getting_the_data: \|
	Download the German-English portion of WMT'21, for example from HuggingFace as
	```
	import datasets
	data = datasets.load_dataset('wmt21', 'de-en', version='1.0.1')
	```

	scoring: \|
	We measure performance in multiple ways. First, we have common metrics for text
	generation tasks. In addition to the automatic metrics, we compute human scores
	for the predictions by using crowdsourced annotations.

	predictions_format: \|
	Your model should produce a `predictions.json` file that is a mapping from id to generated output for the test split.
	Here's an example of the file:
	```
	{
	"[ID1]": "[OUTPUT1]",
	"[ID2]": "[OUTPUT2]",
	...
	}
	```

	If you imported your data using
	```
	data = datasets.load_dataset('wmt21', 'de-en', version='1.0.1')
	```
	then the test split is available as: `data['test']`

	You may not use any reference translations from the test split.

	For further details, check [GENIE's website](https://genie.apps.allenai.org/submitting).


	example_models:
	team:
	name: AI2
	description: \|
	The human evaluation on leaderboards is a joint project across different teams at the [Allen Institute for AI](https://allenai.org).

	purpose: \|
	Part of AI2's mission is to measure the capabilities by state of the art AI systems.
	This leaderboard collects evaluations of current AI systems on various commonsense/reasoning tasks that measure both the knowledge that these systems possess as well as their
	ability to reason with and use that knowledge in context.
	show_compact_table_button: true
	metric_precision: 2
	disable_publish_speed_bump: true
	metrics_table:
	columns:
	- name: Human
	description: "Human evaluation score computed by GENIE."
	renderer: error
	metric_keys: ["human_mean", "human_mean_ci_upper", "human_mean_ci_lower"]
	- name: BERTScore
	renderer: simple
	metric_keys:
	- bert_score
	- name: ROUGE
	renderer: simple
	metric_keys:
	- rouge
	- name: METEOR
	renderer: simple
	metric_keys:
	- meteor
	- name: SacreBLEU
	renderer: simple
	metric_keys:
	- sacrebleu
	- name: BLEURT
	renderer: simple
	metric_keys:
	- bleurt