This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Corpus BLEU with arguments | |
# Run this file from CMD/Terminal | |
# Example Command: python3 compute-bleu-args.py test_file_name.txt mt_file_name.txt | |
import sys | |
import sacrebleu | |
from sacremoses import MosesDetokenizer | |
md = MosesDetokenizer(lang='en') |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Sentence WER | |
# WER for segment by segment with arguments | |
# Run this file from CMD/Terminal | |
# Example Command: python3 sentence-wer.py test_file_name.txt mt_file_name.txt | |
import sys | |
from jiwer import wer | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Corpus WER | |
# WER score for the whole corpus | |
# Run this file from CMD/Terminal | |
# Example Command: python3 corpus-wer.py test_file_name.txt mt_file_name.txt | |
import sys | |
from jiwer import wer | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# BLEU for segment by segment | |
import sacrebleu | |
from sacremoses import MosesDetokenizer | |
md = MosesDetokenizer(lang='en') | |
# Open the test dataset human translation file and detokenize the references | |
refs = [] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# BLEU for segment by segment with arguments | |
# Run this file from CMD/Terminal | |
# Example Command: python3 compute-bleu-sentence-args.py test_file_name.txt mt_file_name.txt | |
import sys | |
import sacrebleu | |
from sacremoses import MosesDetokenizer | |
md = MosesDetokenizer(lang='en') | |
target_test = sys.argv[1] # Test file argument |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sacrebleu | |
from sacremoses import MosesDetokenizer | |
md = MosesDetokenizer(lang='en') | |
# Open the test dataset human translation file and detokenize the references | |
refs = [] | |
with open("target.test") as test: | |
for line in test: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import ctranslate2 | |
def detokenize(result):
    """Join a sequence of tokens into one space-separated string.

    Args:
        result: An iterable of string tokens (for example, the tokens of
            one translation hypothesis).

    Returns:
        The tokens concatenated with a single space between each pair.
        An empty iterable yields an empty string.
    """
    # str.join consumes any iterable directly; no intermediate list copy
    # is needed.
    return " ".join(result)
def tokenize(input_sentence):
    """Split a sentence into whitespace-delimited tokens.

    Args:
        input_sentence: The text to split.

    Returns:
        A list of non-empty tokens. Empty or whitespace-only input
        yields an empty list.
    """
    # Splitting on any run of whitespace (rather than on a single " ")
    # avoids empty-string tokens from repeated, leading or trailing
    # spaces, and strips a trailing newline from the last token.
    return input_sentence.split()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
#OpenNMT-py GUI Alpha version by Yasmin Moslem | |
#Contact: yasmin {aatt} machinetranslation.io | |
#Built on OpenNMT-py v. 0.9.1 "translate.py" | |
from __future__ import unicode_literals | |
from itertools import repeat |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# https://webz.io/free-datasets/ | |
# Spanish: https://s3.amazonaws.com/webhose-archive/datasets/645_20170904091816.zip | |
# Extract text from the JSON files | |
import os | |
import json | |
from sentence_splitter import split_text_into_sentences | |
from tqdm import tqdm |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!pip install transformers sentencepiece torch -U -q | |
# Replace "test_source.txt" with your source file. | |
# Change src_lang, tgt_lang, and lang_code_to_id to the source and target languages you need. | |
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast | |
import torch | |
from tqdm import tqdm | |
# Function to split source lines into chunks to avoid out-of-memory errors |
OlderNewer