Last active
June 24, 2020 17:03
-
-
Save mrityunjay-tripathi/18fa5f09e02b422c7347c6462874a096 to your computer and use it in GitHub Desktop.
BLEU score calculation in C++
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <math.h>

#include <algorithm>
#include <cmath>
#include <iostream>
#include <limits>
#include <map>
#include <numeric>
#include <string>
#include <vector>

using namespace std;
/// A tokenized sentence or n-gram: an ordered sequence of word tokens.
using WordVector = std::vector<std::string>;
/// Signed integer type used for n-gram counts and sentence/corpus lengths.
using LengthType = long int;
/**
 * Prints the elements of a container to std::cout as "(e0, e1, ..., eN)".
 *
 * An empty container is printed as "()". No trailing newline is written.
 *
 * @tparam VectorType Container providing size() and operator[] whose
 *                    elements can be streamed with operator<<.
 * @param v Container to print.
 */
template <typename VectorType = WordVector>
void printVector(const VectorType& v)
{
  std::cout << "(";
  for (size_t i = 0; i < v.size(); ++i)
  {
    // Emit the separator before every element except the first. This avoids
    // computing `size() - 1`, which wraps around for an empty container.
    if (i > 0)
      std::cout << ", ";
    std::cout << v[i];
  }
  std::cout << ")";
}
/**
 * Counts every contiguous n-gram of order 1..maxOrder in a token sequence.
 *
 * @param segment Tokenized sentence to extract n-grams from.
 * @param maxOrder Largest n-gram length to extract.
 * @return Map from each distinct n-gram to the number of times it occurs in
 *         `segment`. Empty when `segment` is empty or `maxOrder` is 0.
 */
std::map<WordVector, LengthType> getNGrams(const WordVector& segment, const size_t maxOrder)
{
  std::map<WordVector, LengthType> ngramsCount;

  // A segment of N tokens has no n-gram longer than N. Capping the order up
  // front keeps the unsigned expression `size - order + 1` from wrapping
  // around, which would otherwise walk iterators past the end of `segment`.
  const size_t longestOrder = maxOrder < segment.size() ? maxOrder : segment.size();

  for (size_t order = 1; order <= longestOrder; ++order)
  {
    // Number of start positions for a window of `order` tokens.
    const size_t windowCount = segment.size() - order + 1;
    for (size_t start = 0; start < windowCount; ++start)
    {
      const auto first = segment.begin() + start;
      ++ngramsCount[WordVector(first, first + order)];
    }
  }
  return ngramsCount;
}
/**
 * Result of a corpus-level BLEU evaluation, as filled in by computeBLEU().
 *
 * Every member has a default initializer so that a default-constructed
 * object holds well-defined values.
 */
struct BLEUMetric
{
  /// Final score: geometric mean of the precisions times the brevity penalty.
  float bleuScore = 0.0f;
  /// Multiplicative penalty applied to the geometric mean of the precisions.
  float brevityPenalty = 0.0f;
  /// translationLength divided by referenceLength.
  float ratio = 0.0f;
  /// Total number of tokens across all translations.
  LengthType translationLength = 0;
  /// Sum, over sentences, of the length of the shortest reference.
  LengthType referenceLength = 0;
  /// N-gram precision for each order; index i holds order i + 1.
  std::vector<float> precisions;
};
template <typename ReferenceCorpusType = std::vector<std::vector<WordVector>>, | |
typename TranslationCorpusType = std::vector<WordVector>> | |
struct BLEUMetric | |
computeBLEU(ReferenceCorpusType& referenceCorpus, | |
TranslationCorpusType& translationCorpus, | |
size_t maxOrder = 4, | |
bool smooth = false) | |
{ | |
std::vector<LengthType> matchesByOrder(maxOrder, 0); | |
std::vector<LengthType> possibleMatchesByOrder(maxOrder, 0); | |
LengthType referenceLength = 0, translationLength = 0; | |
auto refIt = referenceCorpus.cbegin(); | |
auto trIt = translationCorpus.cbegin(); | |
for(; refIt != referenceCorpus.cend(), trIt != translationCorpus.cend(); ++refIt, ++trIt) | |
{ | |
LengthType min = std::numeric_limits<LengthType>::max(); | |
for (auto t: *refIt) | |
{ | |
if (min > t.size()) | |
{ | |
min = t.size(); | |
} | |
} | |
referenceLength += min; | |
translationLength += std::size(*trIt); | |
std::map<WordVector, LengthType> mergedRefNGramCounts; | |
for (auto t: *refIt) | |
{ | |
const std::map<WordVector, LengthType> ngrams = getNGrams(t, maxOrder); | |
for (auto it = ngrams.cbegin(); it != ngrams.cend(); ++it) | |
{ | |
if (!mergedRefNGramCounts[it->first]) | |
mergedRefNGramCounts[it->first] = it->second; | |
else | |
mergedRefNGramCounts[it->first] = std::max(mergedRefNGramCounts[it->first], it->second); | |
} | |
} | |
std::map<WordVector, LengthType> translationNGramCounts = getNGrams(*trIt, maxOrder); | |
std::map<WordVector, LengthType> overlap; | |
for (auto it = mergedRefNGramCounts.cbegin(); it != mergedRefNGramCounts.cend(); ++it) | |
{ | |
if (translationNGramCounts[it->first]) | |
overlap[it->first] = std::min(translationNGramCounts[it->first], it->second); | |
} | |
for (auto it = overlap.cbegin(); it != overlap.cend(); ++it) | |
{ | |
matchesByOrder[it->first.size() - 1] += it->second; | |
} | |
for (size_t order = 1; order < maxOrder + 1; ++order) | |
{ | |
LengthType possibleMatches = std::size(*trIt) - order + 1; | |
if (possibleMatches > 0) | |
{ | |
possibleMatchesByOrder[order - 1] += possibleMatches; | |
} | |
} | |
} | |
struct BLEUMetric bleu; | |
std::vector<float> precisions(maxOrder, 0.0); | |
float minPrecision = std::numeric_limits<float>::max(); | |
for (size_t i = 0; i < maxOrder; ++i) | |
{ | |
if (smooth) | |
precisions[i] = (matchesByOrder[i] + 1.0) / (possibleMatchesByOrder[i] + 1.0); | |
else | |
{ | |
if (possibleMatchesByOrder[i] > 0.0) | |
{ | |
precisions[i] = float(matchesByOrder[i]) / possibleMatchesByOrder[i]; | |
} | |
else | |
precisions[i] = 0.0; | |
} | |
if (minPrecision > precisions[i]) | |
minPrecision = precisions[i]; | |
} | |
float geoMean; | |
if (minPrecision > 0) | |
{ | |
float pLogSum = std::accumulate(precisions.begin(), precisions.end(), 0.0, | |
[maxOrder](float x, float precision){ | |
return x + (1.0 / maxOrder) * std::log(precision); }); | |
geoMean = std::exp(pLogSum); | |
} | |
bleu.ratio = float(translationLength) / referenceLength; | |
if (bleu.ratio > 1.0) | |
bleu.brevityPenalty = 1.0; | |
else | |
bleu.brevityPenalty = std::exp(1.0 - 1.0 / bleu.ratio); | |
bleu.bleuScore = geoMean * bleu.brevityPenalty; | |
bleu.translationLength = translationLength; | |
bleu.referenceLength = referenceLength; | |
bleu.precisions = precisions; | |
return bleu; | |
} | |
int main() | |
{ | |
std::vector<std::vector<WordVector>> referenceCorpus | |
= {{{"this", "is", "my", "house"}, | |
{"this", "is", "my", "car"}, | |
{"this", "is", "my", "bike"}}, | |
{{"this", "is", "my", "table"}, | |
{"this", "is", "my", "chair"}, | |
{"this", "is", "my", "laptop"}}, | |
{{"this", "is", "my", "table"}, | |
{"this", "is", "your", "car"}, | |
{"this", "is", "my", "notebook"}}}; | |
std::vector<WordVector> translationCorpus | |
= {{"this", "is", "my", "book"}, | |
{"this", "is", "your", "car"}, | |
{"this", "is", "my", "watch"}}; | |
struct BLEUMetric bleu = computeBLEU(referenceCorpus, translationCorpus); | |
std::cout << "BLEU Score: " << bleu.bleuScore << std::endl; | |
std::cout << "Brevity Penalty: " << bleu.brevityPenalty << std::endl; | |
std::cout << "Ratio: " << bleu.ratio << std::endl; | |
std::cout << "Translation Length: " << bleu.translationLength << std::endl; | |
std::cout << "Reference Length: " << bleu.referenceLength << std::endl; | |
std::cout << "Precisions: "; | |
printVector<std::vector<float>>(bleu.precisions); | |
std::cout << std::endl; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment