
@goodmami
Created June 27, 2017 01:03
Simple multi-bleu utility using the NLTK
#!/usr/bin/env python3
# Copyright 2017 Michael Wayne Goodman <goodman.m.w@gmail.com>
# Licensed under the MIT license: https://opensource.org/licenses/MIT
import sys
import os
import gzip
import docopt
from nltk.translate import bleu_score
from nltk.tokenize import (simple, treebank, moses, toktok)
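# Note: nltk.tokenize.moses was removed in NLTK 3.3; on newer NLTK
# versions the Moses tokenizer lives in the separate sacremoses package.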

USAGE = '''
Usage: nltk-bleu.py [options] REFERENCE [-]

Arguments:
  REFERENCE              path to reference sentence file; if the path
                         doesn't exist but path{0,1,...} do exist,
                         use them in multi-reference mode
  -                      the hypothesis lines come from <stdin>

Options:
  -h, --help             display this help and exit
  --lc                   lowercase the references and hypotheses
  --smoothing-method=M   apply smoothing method M (0--7; 0 is no smoothing,
                         3 is NIST) [default: 3]
  --tokenizer=T          tokenize with T (simple, treebank, moses, toktok)
                         [default: moses]

For a description of the smoothing methods, see:
  http://www.nltk.org/api/nltk.translate.html
For a description of the tokenizers, see:
  http://www.nltk.org/api/nltk.tokenize.html
'''
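
# Expose NLTK's SmoothingFunction methods by number; method0 applies no
# smoothing and method3 is NIST geometric sequence smoothing.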
_smoother = bleu_score.SmoothingFunction()
_smoothers = {
    '0': _smoother.method0,
    '1': _smoother.method1,
    '2': _smoother.method2,
    '3': _smoother.method3,
    '4': _smoother.method4,
    '5': _smoother.method5,
    '6': _smoother.method6,
    '7': _smoother.method7,
}
_tokenizers = {
    'simple': simple.SpaceTokenizer(),
    'treebank': treebank.TreebankWordTokenizer(),
    'moses': moses.MosesTokenizer(),
    'toktok': toktok.ToktokTokenizer()
}
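# (SpaceTokenizer splits on whitespace only; the others also separate
# punctuation, e.g. the treebank tokenizer turns "isn't" into ['is', "n't"].)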

def main():
    args = docopt.docopt(USAGE)
    smoother = _smoothers[args['--smoothing-method']]
    tokenizer = _tokenizers[args['--tokenizer']]
    # read each reference file (one per suffix) into its own list
    refs = []
    for suffix in ref_suffixes(args['REFERENCE']):
        refs.append(
            read(args['REFERENCE'], suffix, args['--lc'], tokenizer)
        )
    reflen = len(refs[0])
    if any(len(reflist) != reflen for reflist in refs):
        sys.exit('reference files do not have the same number of lines')
    # transpose per-file lists into per-sentence reference groups, the
    # shape that corpus_bleu() expects
    refs = list(zip(*refs))
    hyps = [prepare(line, args['--lc'], tokenizer) for line in sys.stdin]
    score = bleu_score.corpus_bleu(refs, hyps, smoothing_function=smoother)
    print('BLEU: {:4.2f}'.format(score))

def ref_suffixes(stem):
    if os.path.isfile(stem):
        yield ''
    else:
        i = 0
        while os.path.isfile(stem + str(i)):
            yield str(i)
            i += 1
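# e.g. ref_suffixes('ref') yields '0', '1', ... while files ref0, ref1, ...
# exist, and yields '' when the file 'ref' itself exists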

def read(stem, suffix, lowercase, tokenizer):
    openfile = gzip.open if stem.endswith('.gz') else open
    data = []
    # open in text mode ('rt') so gzip.open() also yields str, not bytes
    with openfile(stem + suffix, 'rt') as f:
        data.extend(prepare(line, lowercase, tokenizer) for line in f)
    return data

def prepare(line, lowercase, tokenizer):
    if lowercase:
        line = line.lower()
    return tokenizer.tokenize(line)


if __name__ == '__main__':
    main()
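
For reference, corpus_bleu() takes one group of tokenized references per
hypothesis, which is why main() transposes refs with zip(). A minimal,
self-contained sketch (the toy sentences are illustrative only):

    from nltk.translate import bleu_score

    # one hypothesis scored against two references, all pre-tokenized
    refs = [[['the', 'cat', 'sat', 'down'], ['a', 'cat', 'sat', 'down']]]
    hyps = [['the', 'cat', 'sat', 'down']]
    smoother = bleu_score.SmoothingFunction().method3
    print(bleu_score.corpus_bleu(refs, hyps, smoothing_function=smoother))  # 1.0, an exact match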