Skip to content

Instantly share code, notes, and snippets.

@goodmami
Created June 27, 2017 01:03
Show Gist options
  • Save goodmami/9e210a80f513f5b6e2f171aabbaee442 to your computer and use it in GitHub Desktop.
Save goodmami/9e210a80f513f5b6e2f171aabbaee442 to your computer and use it in GitHub Desktop.
Simple multi-bleu utility using the NLTK
#!/usr/bin/env python3
# Copyright 2017 Michael Wayne Goodman <goodman.m.w@gmail.com>
# Licensed under the MIT license: https://opensource.org/licenses/MIT
import sys
import os
import gzip
import docopt
from nltk.translate import bleu_score
from nltk.tokenize import (simple, treebank, moses, toktok)
# Command-line help text.  main() hands this string to docopt, which derives
# the argument parser from it, so the layout is significant: option lines
# must be indented beneath "Options:" and each option must be separated from
# its description by at least two spaces.  Without that layout docopt does
# not recognise the options or their [default: ...] values.
USAGE = '''
Usage: nltk-bleu.py [options] REFERENCE [-]

Arguments:
  REFERENCE                 path to reference sentence file; if the path
                            doesn't exist but path{0,1,...} do exist,
                            use them in multi-reference mode
  hypothesis                the hypothesis lines come from <stdin>

Options:
  -h, --help                display this help and exit
  --lc                      lowercase the references and hypotheses
  --smoothing-method=M      apply smoothing method M (0--7; 0 is no smoothing,
                            3 is NIST) [default: 3]
  --tokenizer=T             tokenize with T (simple, treebank, moses, toktok)
                            [default: moses]

For a description of the smoothing methods, see:
  http://www.nltk.org/api/nltk.translate.html
For a description of the tokenizers, see:
  http://www.nltk.org/api/nltk.tokenize.html
'''
# A single SmoothingFunction instance supplies every smoothing method.
_smoother = bleu_score.SmoothingFunction()

# Maps the --smoothing-method option value ('0' through '7') to the bound
# method with the same number on the shared SmoothingFunction instance.
_smoothers = {
    str(n): getattr(_smoother, 'method{}'.format(n))
    for n in range(8)
}
# Tokenizer instances keyed by the value accepted for --tokenizer.
# NOTE(review): newer NLTK releases appear to no longer ship
# nltk.tokenize.moses -- confirm the installed version still provides it.
_tokenizers = dict(
    simple=simple.SpaceTokenizer(),
    treebank=treebank.TreebankWordTokenizer(),
    moses=moses.MosesTokenizer(),
    toktok=toktok.ToktokTokenizer(),
)
def main():
    """Print the corpus BLEU score of stdin hypotheses against references.

    Parses the command line with docopt, reads and tokenizes every
    reference file found for REFERENCE (a single file, or numbered files
    in multi-reference mode), tokenizes the hypothesis lines read from
    standard input, and prints the corpus-level BLEU score.

    Exits through ``sys.exit`` with an explanatory message when the
    smoothing method or tokenizer is unknown, when no reference file
    exists, when the reference files differ in line count, or when the
    number of hypotheses differs from the number of reference lines.
    """
    args = docopt.docopt(USAGE)
    lowercase = args['--lc']
    stem = args['REFERENCE']

    smoother = _smoothers.get(args['--smoothing-method'])
    if smoother is None:
        sys.exit('unknown smoothing method: {}'.format(
            args['--smoothing-method']))

    tokenizer = _tokenizers.get(args['--tokenizer'])
    if tokenizer is None:
        sys.exit('unknown tokenizer: {}'.format(args['--tokenizer']))

    # One list of tokenized sentences per reference file.
    refs = [
        read(stem, suffix, lowercase, tokenizer)
        for suffix in ref_suffixes(stem)
    ]
    if not refs:
        # Neither the path itself nor path0 exists; without this check the
        # refs[0] lookup below would fail with an IndexError.
        sys.exit('no reference file found: {0} (or {0}0, {0}1, ...)'.format(
            stem))

    reflen = len(refs[0])
    if any(len(reflist) != reflen for reflist in refs):
        sys.exit('reference files do not have the same number of lines')

    # Transpose from one list per file to one tuple of alternative
    # references per sentence, which is the shape corpus_bleu takes.
    refs = list(zip(*refs))

    hyps = [prepare(line, lowercase, tokenizer) for line in sys.stdin]
    if len(hyps) != len(refs):
        # Report the mismatch here instead of letting corpus_bleu fail
        # with a bare assertion error.
        sys.exit(
            'number of hypotheses ({}) does not match number of '
            'reference lines ({})'.format(len(hyps), len(refs))
        )

    score = bleu_score.corpus_bleu(refs, hyps, smoothing_function=smoother)
    print('BLEU: {:4.2f}'.format(score))
def ref_suffixes(stem):
    """Yield the filename suffixes of the reference files for *stem*.

    If *stem* is itself an existing file, the empty string is the only
    suffix yielded.  Otherwise '0', '1', ... are yielded for as long as
    *stem* followed by that number is an existing file, stopping at the
    first number that is missing; nothing is yielded when *stem* + '0'
    does not exist.
    """
    if os.path.isfile(stem):
        yield ''
        return

    index = 0
    while True:
        suffix = str(index)
        if not os.path.isfile(stem + suffix):
            break
        yield suffix
        index += 1
def read(stem, suffix, lowercase, tokenizer):
    """Read one reference file and return its lines as token lists.

    Args:
        stem: reference path as given on the command line.
        suffix: string appended to *stem* to form the actual filename
            ('' or a number, as produced by ref_suffixes).
        lowercase: if true, each line is lowercased before tokenizing.
        tokenizer: object whose ``tokenize`` method splits a line into
            tokens.

    Returns:
        A list with one tokenized sentence per line of the file.

    The file is read through gzip when *stem* (not the suffixed filename)
    ends with '.gz'.
    """
    openfile = gzip.open if stem.endswith('.gz') else open
    # gzip.open defaults to binary mode, which would yield bytes lines;
    # 'rt' makes both openers return str lines.
    with openfile(stem + suffix, 'rt') as f:
        return [prepare(line, lowercase, tokenizer) for line in f]
def prepare(line, lowercase, tokenizer):
    """Return *line* tokenized by *tokenizer*.

    The line is lowercased before tokenizing when *lowercase* is true.
    """
    text = line.lower() if lowercase else line
    return tokenizer.tokenize(text)
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment