Skip to content

Instantly share code, notes, and snippets.

@h4ste
Created December 6, 2018 19:47
Show Gist options
  • Select an option

  • Save h4ste/eb0865caab260472272460269eaae137 to your computer and use it in GitHub Desktop.

Select an option

Save h4ste/eb0865caab260472272460269eaae137 to your computer and use it in GitHub Desktop.
Language Utilities from DSRM project
from __future__ import division
from collections import Counter
import numpy as np
from six import iterkeys
def truncate_outputs(outputs, eos_id):
    """Truncate a sequence of outputs (vocabulary ids) to end before the end-of-sequence id (eos_id).

    :param outputs: sequence of outputs (vocabulary/word ids)
    :param eos_id: identifier for the end-of-sequence (eos) symbol
    :return: numpy array of outputs ending just before the first eos_id;
             the full sequence (as an array) when eos_id does not occur
    """
    outputs = np.asarray(outputs)
    if eos_id in outputs:
        # np.where returns a tuple of index arrays; take the first occurrence.
        eos_index = np.where(outputs == eos_id)[0][0]
        # Slice up to (but excluding) the eos symbol, matching the documented
        # contract ("end before the eos_id"). The original sliced to
        # eos_index + 1, which incorrectly kept the eos symbol in the output.
        outputs = outputs[:eos_index]
    return outputs
def logits_to_outputs(output_logits, eos_id):
    """Convert a sequence of logits (e.g., softmax activations) to a sequence of outputs (word ids).

    :param output_logits: sequence of output logits (neural network activations or weights)
    :param eos_id: identifier for the end-of-sequence (eos) symbol
    :return: sequence of word ids, truncated to end before the eos_id
    """
    word_ids = []
    for logits in output_logits:
        # Greedy decoding: pick the highest-scoring vocabulary entry per step.
        word_ids.append(int(np.argmax(logits)))
    return truncate_outputs(word_ids, eos_id)
def outputs_to_words(outputs, rev_vocab):
    """Convert a sequence of outputs (word ids) to words.

    :param outputs: sequence of outputs (word/vocabulary ids)
    :param rev_vocab: mapping (dict or list) from word ids to words
    :return: sequence of words obtained by mapping each output to its corresponding
             word in rev_vocab; ids with no entry are skipped with a warning
    """
    words = []
    for output in outputs:
        try:
            words.append(rev_vocab[output])
        # The original caught only IndexError, which covers a list-typed
        # rev_vocab but crashes with KeyError when rev_vocab is the dict the
        # docstring promises; handle both lookup failures the same way.
        except (KeyError, IndexError):
            print('Failed to produce word for output', output, ' with vocabulary size ', len(rev_vocab))
    return words
def words_to_ngrams(words, n=2):
    """Convert a sequence of words to a sequence of overlapping n-grams.

    :param words: sequence of words
    :param n: size of n-gram (i.e., 1 = unigram, 2 = bigram, etc.)
    :return: list of overlapping n-grams, each joined with underscores
    """
    # Degenerate sizes produce no n-grams, matching the zip-based formulation.
    if n <= 0:
        return []
    # There are len(words) - n + 1 full windows; the range is empty (so the
    # result is []) whenever the sequence is shorter than n.
    return ["_".join(words[start:start + n]) for start in range(len(words) - n + 1)]
def get_word_error_rate(guess, gold, n=1):
    """Compute the word error rate (WER), i.e., word-level Levenshtein distance,
    a metric used in speech recognition:
    https://en.wikipedia.org/wiki/Levenshtein_distance

    :param guess: sequence of words predicted by the model
    :param gold: sequence of words in the gold-standard
    :param n: n-gram size (default to 1)
    :return: n-level word error rate (edit distance normalized by gold length)
    """
    guess_ngrams = words_to_ngrams(guess, n)
    gold_ngrams = words_to_ngrams(gold, n)
    # Third-party dependency deliberately kept function-local, as in the
    # original module, so importing this file does not require editdistance.
    import editdistance
    distance = editdistance.eval(guess_ngrams, gold_ngrams)
    return distance / len(gold_ngrams)
def get_bleu(guess, gold, n=2):
    """Compute the (unsmoothed) BLEU-n metric, a machine translation analogue
    of n-gram precision:
    https://en.wikipedia.org/wiki/BLEU

    :param guess: sequence of words predicted by the model
    :param gold: sequence of words in the gold-standard
    :param n: n-gram size (default to 2)
    :return: BLEU-n score as np.float32, or int 0 when guess yields no n-grams
    """
    guess_ngrams = words_to_ngrams(guess, n)
    gold_ngrams = words_to_ngrams(gold, n)
    f_guess = Counter(guess_ngrams)
    f_gold = Counter(gold_ngrams)
    # Guard up front: avoids a pointless loop and a division by zero when the
    # guess is too short to produce any n-grams.
    if not f_guess:
        return 0
    # Clipped n-gram matches: each guessed n-gram is credited at most as many
    # times as it occurs in the gold standard (Counter returns 0 for misses).
    bleu_sum = sum(min(count, f_gold[ngram]) for ngram, count in f_guess.items())
    # Normalize by the total number of guessed n-grams (modified precision).
    return np.float32(bleu_sum) / np.float32(len(guess_ngrams))
def get_rogue(guess, gold, n=2):
    """Compute the ROUGE-n metric, a machine translation analogue of n-gram recall:
    https://en.wikipedia.org/wiki/ROUGE_(metric)

    (The function name's "rogue" spelling is kept for backward compatibility
    with existing callers.)

    :param guess: sequence of words predicted by the model
    :param gold: sequence of words in the gold-standard
    :param n: n-gram size (default to 2)
    :return: ROUGE-n score as np.float32, or int 0 when gold yields no n-grams
    """
    guess_ngrams = words_to_ngrams(guess, n)
    gold_ngrams = words_to_ngrams(gold, n)
    f_guess = Counter(guess_ngrams)
    f_gold = Counter(gold_ngrams)
    # Guard up front: avoids a pointless loop and a division by zero when the
    # gold standard is too short to produce any n-grams.
    if not f_gold:
        return 0
    # Clipped matches from the gold side: each gold n-gram is credited at most
    # as many times as the guess produced it (Counter returns 0 for misses).
    rouge_sum = sum(min(f_guess[ngram], count) for ngram, count in f_gold.items())
    # Normalize by the total number of gold n-grams (recall-oriented).
    return np.float32(rouge_sum) / np.float32(len(gold_ngrams))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment