Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
import argparse
import pandas as pd
from datasets import load_metric
from transformers import AutoTokenizer
# Name of the pretrained tokenizer that main() passes to
# AutoTokenizer.from_pretrained() to turn text into token IDs.
TOKENIZER_MODEL = 'cl-tohoku/bert-large-japanese'
def main(args):
    """Score predictions against references with ROUGE and print the results.

    Every reference and prediction is encoded with the ``TOKENIZER_MODEL``
    tokenizer (without special tokens) and rewritten as a space-separated
    string of token IDs. The ``rouge`` metric is computed over those ID
    strings, and for each ROUGE variant the aggregated mid F-measure is
    printed, scaled by 100.

    Args:
        args: Parsed command-line arguments. ``args.ref_file`` is a CSV file
            with a ``label`` column holding the reference texts;
            ``args.pred_file`` is a headerless file read with
            ``pandas.read_table`` whose column is named ``label`` and holds
            the predicted texts.

    Raises:
        ValueError: If the two files yield a different number of rows.
    """
    # Read everything as plain text. Without dtype=str / keep_default_na=False
    # pandas turns empty fields and literals such as "NA" or "null" into NaN,
    # and all-numeric columns into numbers, none of which the tokenizer
    # accepts as input.
    df_ref = pd.read_csv(args.ref_file, dtype=str, keep_default_na=False)
    df_pred = pd.read_table(
        args.pred_file, names=['label'], dtype=str, keep_default_na=False
    )

    # Fail fast (before loading the tokenizer) when the inputs cannot be
    # aligned row by row.
    if len(df_pred) != len(df_ref):
        raise ValueError(
            f'Number of predictions ({len(df_pred)}) does not match number '
            f'of references ({len(df_ref)}); note that blank lines in the '
            'prediction file are skipped when it is read.'
        )

    tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_MODEL)

    def to_ids(text):
        """Return *text* as a space-separated string of tokenizer IDs."""
        # NOTE(review): presumably IDs are used because the ROUGE scorer's
        # own tokenization cannot segment Japanese text - confirm.
        token_ids = tokenizer.encode(text, add_special_tokens=False)
        return ' '.join(str(token_id) for token_id in token_ids)

    refs = [to_ids(text) for text in df_ref['label']]
    preds = [to_ids(text) for text in df_pred['label']]

    rouge = load_metric("rouge")
    rouge_output = rouge.compute(predictions=preds, references=refs)
    for name, score in rouge_output.items():
        print(name, score.mid.fmeasure * 100)
if __name__ == '__main__':
    # Command-line entry point: both file paths are mandatory options.
    cli = argparse.ArgumentParser()
    for flag in ('--ref-file', '--pred-file'):
        cli.add_argument(flag, required=True)
    main(cli.parse_args())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment