Yasmin Moslem (ymoslem)
@ymoslem
ymoslem / compute-bleu-args.py
Last active February 9, 2020 09:45
Compute BLEU with arguments
# Corpus BLEU with arguments
# Run this file from CMD/Terminal
# Example Command: python3 compute-bleu-args.py test_file_name.txt mt_file_name.txt
import sys
import sacrebleu
from sacremoses import MosesDetokenizer
md = MosesDetokenizer(lang='en')
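The preview stops after the imports. Continuing from them, a hedged sketch of how the script likely proceeds, following the argument order in the example command (reference file first, MT output second):

target_test = sys.argv[1]   # reference (human) translation file
target_pred = sys.argv[2]   # machine translation output file

# Detokenize both files line by line before scoring
with open(target_test) as test, open(target_pred) as pred:
    refs = [md.detokenize(line.strip().split()) for line in test]
    preds = [md.detokenize(line.strip().split()) for line in pred]

# sacrebleu expects a list of hypotheses and a list of reference streams
bleu = sacrebleu.corpus_bleu(preds, [refs])
print("BLEU:", round(bleu.score, 2))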
@ymoslem
ymoslem / sentence-wer.py
Last active March 4, 2020 16:35
Compute WER score for each sentence
# Sentence WER
# WER for segment by segment with arguments
# Run this file from CMD/Terminal
# Example Command: python3 sentence-wer.py test_file_name.txt mt_file_name.txt
import sys
from jiwer import wer
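Only the imports are shown. A minimal sketch of the likely remainder, assuming the same two-argument convention as the example command (reference file first, MT output second):

target_test = sys.argv[1]
target_pred = sys.argv[2]

# Score each segment pair individually with jiwer
with open(target_test) as test, open(target_pred) as pred:
    for i, (ref, hyp) in enumerate(zip(test, pred), start=1):
        score = wer(ref.strip(), hyp.strip())
        print(f"Segment {i}: WER = {round(score, 4)}")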
@ymoslem
ymoslem / corpus-wer.py
Last active March 4, 2020 16:36
Compute WER score for the whole dataset
# Corpus WER
# WER score for the whole corpus
# Run this file from CMD/Terminal
# Example Command: python3 corpus-wer.py test_file_name.txt mt_file_name.txt
import sys
from jiwer import wer
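Again only the imports survive in the preview. A hedged sketch of the corpus-level version, assuming the same argument order; jiwer.wer accepts lists of sentences and returns the word error rate over the whole set:

target_test = sys.argv[1]
target_pred = sys.argv[2]

with open(target_test) as test, open(target_pred) as pred:
    refs = [line.strip() for line in test]
    hyps = [line.strip() for line in pred]

# Corpus-level WER over all segments
score = wer(refs, hyps)
print("Corpus WER:", round(score, 4))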
@ymoslem
ymoslem / compute-bleu-sentence.py
Last active July 18, 2020 21:23
Calculate the BLEU score sentence by sentence and save the results to a file
# BLEU for segment by segment
import sacrebleu
from sacremoses import MosesDetokenizer
md = MosesDetokenizer(lang='en')
# Open the test dataset human translation file and detokenize the references
refs = []
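The preview ends at the empty refs list. A hedged sketch of the likely remainder; the file names "target.test", "target.pred", and "bleu-sentence.txt" are assumptions (only "target.test" appears elsewhere in these gists):

with open("target.test") as test:
    for line in test:
        refs.append(md.detokenize(line.strip().split()))

with open("target.pred") as pred:
    preds = [md.detokenize(line.strip().split()) for line in pred]

# Score each segment individually and save the results to a file
with open("bleu-sentence.txt", "w") as output:
    for ref, hyp in zip(refs, preds):
        score = sacrebleu.sentence_bleu(hyp, [ref]).score
        output.write(str(round(score, 2)) + "\n")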
@ymoslem
ymoslem / compute-bleu-sentence-args.py
Last active July 18, 2020 21:23
Calculate the BLEU score sentence by sentence and save the results to a file, using Python arguments for the file names
# BLEU for segment by segment with arguments
# Run this file from CMD/Terminal
# Example Command: python3 compute-bleu-sentence-args.py test_file_name.txt mt_file_name.txt
import sys
import sacrebleu
from sacremoses import MosesDetokenizer
md = MosesDetokenizer(lang='en')
target_test = sys.argv[1] # Test file argument
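Continuing from the test-file argument shown above, a hedged sketch of the rest; the second argument as the MT file and the output name "bleu-sentence.txt" are assumptions based on the example command and the related gist:

target_pred = sys.argv[2]   # MT output file argument

with open(target_test) as test, open(target_pred) as pred:
    refs = [md.detokenize(line.strip().split()) for line in test]
    preds = [md.detokenize(line.strip().split()) for line in pred]

# Write one sentence-level BLEU score per line
with open("bleu-sentence.txt", "w") as output:
    for ref, hyp in zip(refs, preds):
        output.write(str(round(sacrebleu.sentence_bleu(hyp, [ref]).score, 2)) + "\n")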
@ymoslem
ymoslem / compute-bleu.py
Last active February 4, 2021 01:48
Compute BLEU Score for Machine Translation
import sacrebleu
from sacremoses import MosesDetokenizer
md = MosesDetokenizer(lang='en')
# Open the test dataset human translation file and detokenize the references
refs = []
with open("target.test") as test:
    for line in test:
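The preview cuts off inside the loop that reads the reference file. A self-contained sketch of the likely full flow, assuming the MT output sits in a file named "target.pred":

refs = []
with open("target.test") as test:
    for line in test:
        refs.append(md.detokenize(line.strip().split()))

with open("target.pred") as pred:
    preds = [md.detokenize(line.strip().split()) for line in pred]

# Corpus BLEU over the whole test set
bleu = sacrebleu.corpus_bleu(preds, [refs])
print("BLEU:", round(bleu.score, 2))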
@ymoslem
ymoslem / CTranslate2-mwe.py
Created April 20, 2021 12:47
CTranslate2 MWE
import ctranslate2
def detokenize(result):
    translation = " ".join(result)
    return translation

def tokenize(input_sentence):
    tokens = input_sentence.split(" ")
    return tokens
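A hedged sketch of how the MWE likely continues. The model path and source sentence are placeholders, and the result access follows the CTranslate2 2.x+ Python API (translate_batch returns objects with a .hypotheses attribute); the original 2021 gist may have used the older dict-based return format:

translator = ctranslate2.Translator("ende_ctranslate2/", device="cpu")

source_sentence = "Hello world!"
tokens = tokenize(source_sentence)

# Translate one sentence and rebuild the output string from the tokens
results = translator.translate_batch([tokens])
translation = detokenize(results[0].hypotheses[0])
print(translation)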
@ymoslem
ymoslem / translate-gui.py
Last active August 25, 2021 18:37
OpenNMT-py Translate GUI
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#OpenNMT-py GUI Alpha version by Yasmin Moslem
#Contact: yasmin {aatt} machinetranslation.io
#Built on OpenNMT-py v. 0.9.1 "translate.py"
from __future__ import unicode_literals
from itertools import repeat
# https://webz.io/free-datasets/
# Spanish: https://s3.amazonaws.com/webhose-archive/datasets/645_20170904091816.zip
# Extract text from the JSON files
import os
import json
from sentence_splitter import split_text_into_sentences
from tqdm import tqdm
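Continuing from the imports shown above, a hedged sketch of the extraction loop; the extracted directory name, the "text" JSON field, and the output file name are assumptions based on the webz.io archive linked in the comments:

sentences = []
for file_name in tqdm(os.listdir("645_20170904091816")):
    if not file_name.endswith(".json"):
        continue
    with open(os.path.join("645_20170904091816", file_name)) as json_file:
        article = json.load(json_file)
    # Assumed field: the article body is stored under "text"
    text = article.get("text", "")
    sentences += split_text_into_sentences(text=text, language="es")

# Save one sentence per line
with open("webhose.es", "w") as output:
    output.write("\n".join(sentences))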
@ymoslem
ymoslem / mBART-example.py
Last active January 5, 2022 00:59
Use the mBART pre-trained multilingual model for translation
#!pip install transformers sentencepiece torch -U -q
# Replace "test_source.txt" with your source file.
# Change src_lang, tgt_lang, and lang_code_to_id to the source and target languages you need.
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
import torch
from tqdm import tqdm
# Function to split source lines into chunks to avoid out-of-memory errors
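Continuing from the imports and the chunking comment above, a hedged sketch of the likely remainder. The model name, batch size, and language codes below are illustrative assumptions, not the gist's original values:

model_name = "facebook/mbart-large-50-many-to-many-mmt"
tokenizer = MBart50TokenizerFast.from_pretrained(model_name, src_lang="en_XX")
model = MBartForConditionalGeneration.from_pretrained(model_name)
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

def chunks(lst, n):
    # Yield successive n-sized chunks of source lines to limit memory use
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

with open("test_source.txt") as source:
    lines = [line.strip() for line in source]

translations = []
for batch in tqdm(list(chunks(lines, 8))):
    inputs = tokenizer(batch, return_tensors="pt", padding=True).to(device)
    generated = model.generate(
        **inputs,
        # Force the target language token (assumed target: French)
        forced_bos_token_id=tokenizer.lang_code_to_id["fr_XX"],
    )
    translations += tokenizer.batch_decode(generated, skip_special_tokens=True)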