Created
December 6, 2018 19:47
-
-
Save h4ste/eb0865caab260472272460269eaae137 to your computer and use it in GitHub Desktop.
Language Utilities from DSRM project
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from __future__ import division | |
| from collections import Counter | |
| import numpy as np | |
| from six import iterkeys | |
def truncate_outputs(outputs, eos_id):
    """Truncate a sequence of outputs (vocabulary ids) to end before the end-of-sequence id (eos_id).

    :param outputs: sequence of outputs (vocabulary/word ids)
    :param eos_id: identifier for the end-of-sequence (eos) symbol
    :return: numpy array of outputs truncated to exclude the first eos_id and
        everything after it; returned unchanged (as an array) if eos_id is absent
    """
    outputs = np.asarray(outputs)
    # Single pass: np.where gives all eos positions; empty means no eos present.
    eos_positions = np.where(outputs == eos_id)[0]
    if eos_positions.size > 0:
        # Slice up to (NOT including) the first eos marker, matching the
        # documented contract ("end before the end-of-sequence id"); the
        # previous "+ 1" kept the eos token itself in the output.
        outputs = outputs[:eos_positions[0]]
    return outputs
def logits_to_outputs(output_logits, eos_id):
    """Convert a sequence of logits (e.g., softmax activations) to a sequence of outputs (word ids).

    Each step's output id is the arg-max of that step's logits; the resulting
    id sequence is then cut off at the end-of-sequence symbol.

    :param output_logits: sequence of output logits (neural network activations or weights)
    :param eos_id: identifier for the end-of-sequence (eos) symbol
    :return: sequence of word ids, truncated to end before the eos_id
    """
    predicted_ids = []
    for step_logits in output_logits:
        predicted_ids.append(int(np.argmax(step_logits)))
    return truncate_outputs(predicted_ids, eos_id)
def outputs_to_words(outputs, rev_vocab):
    """Convert a sequence of outputs (word ids) to words.

    :param outputs: sequence of outputs (word/vocabulary ids)
    :param rev_vocab: mapping from word ids to words (dict of id -> word, or a list
        indexed by id)
    :return: sequence of words obtained by mapping each output id through rev_vocab;
        ids with no entry are skipped after printing a warning
    """
    words = []
    for output in outputs:
        try:
            words.append(rev_vocab[output])
        except (KeyError, IndexError):
            # rev_vocab is documented as a dict, whose missing-key error is
            # KeyError; the original caught only IndexError, so unknown ids
            # crashed instead of being reported. Catch both so dict and list
            # vocabularies behave the same. (Also fixed 'ouput' typo below.)
            print('Failed to produce word for output', output, ' with vocabulary size ', len(rev_vocab))
    return words
def words_to_ngrams(words, n=2):
    """Convert a sequence of words to a sequence of overlapping n-grams.

    :param words: sequence of words
    :param n: size of n-gram (i.e., 1 = unigram, 2 = bigram, etc.)
    :return: list of overlapping n-grams, each joined into one string with "_"
    """
    # zip over n progressively-shifted views of the word list; zip stops at the
    # shortest view, so a sequence shorter than n yields no n-grams.
    shifted_views = [words[offset:] for offset in range(n)]
    return ["_".join(gram) for gram in zip(*shifted_views)]
def get_word_error_rate(guess, gold, n=1):
    """Compute the word error rate (WER), i.e., word-level Levenshtein distance, a metric used in speech recognition:
    https://en.wikipedia.org/wiki/Levenshtein_distance

    :param guess: sequence of words predicted by the model
    :param gold: sequence of words in the gold-standard
    :param n: n-gram size (defaults to 1, i.e., plain word-level WER)
    :return: n-gram-level word error rate (edit distance normalized by the number
        of gold n-grams)
    :raises ValueError: if gold yields no n-grams (empty, or shorter than n) --
        previously this surfaced as a cryptic ZeroDivisionError
    """
    guess = words_to_ngrams(guess, n)
    gold = words_to_ngrams(gold, n)
    if not gold:
        raise ValueError('gold sequence produced no %d-grams; WER is undefined' % n)
    # Imported locally so the third-party dependency is only required when WER
    # is actually computed.
    import editdistance
    return editdistance.eval(guess, gold) / len(gold)
def get_bleu(guess, gold, n=2):
    """Compute a simplified BLEU metric, a machine translation analogue for precision:
    https://en.wikipedia.org/wiki/BLEU

    :param guess: sequence of words predicted by the model
    :param gold: sequence of words in the gold-standard
    :param n: n-gram size (defaults to 2; the old docstring wrongly said 1)
    :return: BLEU-n score: clipped guess n-gram matches divided by the number of
        guess n-grams (np.float32), or 0 when guess yields no n-grams
    """
    guess = words_to_ngrams(guess, n)
    gold = words_to_ngrams(gold, n)
    if not guess:
        # No predicted n-grams: precision is undefined; report 0 up front
        # instead of guarding after the loop as the original did.
        return 0
    f_guess = Counter(guess)
    f_gold = Counter(gold)
    # Clipped counts: each guess n-gram is credited at most as many times as it
    # occurs in the gold standard (Counter returns 0 for missing keys).
    matched = sum(min(count, f_gold[ngram]) for ngram, count in f_guess.items())
    return np.float32(matched) / np.float32(len(guess))
def get_rogue(guess, gold, n=2):
    """Compute the ROUGE metric, a machine translation analogue for recall:
    https://en.wikipedia.org/wiki/ROUGE_(metric)

    (Function name keeps the original "rogue" spelling for caller compatibility.)

    :param guess: sequence of words predicted by the model
    :param gold: sequence of words in the gold-standard
    :param n: n-gram size (defaults to 2; the old docstring wrongly said 1)
    :return: ROUGE-n score: clipped gold n-gram matches divided by the number of
        gold n-grams (np.float32), or 0 when gold yields no n-grams
    """
    guess = words_to_ngrams(guess, n)
    gold = words_to_ngrams(gold, n)
    if not gold:
        # No reference n-grams: recall is undefined; report 0 up front instead
        # of guarding after the loop as the original did.
        return 0
    f_guess = Counter(guess)
    f_gold = Counter(gold)
    # Clipped counts: each gold n-gram is credited at most as many times as it
    # was actually predicted (Counter returns 0 for missing keys).
    matched = sum(min(f_guess[ngram], count) for ngram, count in f_gold.items())
    return np.float32(matched) / np.float32(len(gold))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment