Skip to content

Instantly share code, notes, and snippets.

@mrityunjay-tripathi
Last active June 24, 2020 17:03
Show Gist options
  • Save mrityunjay-tripathi/18fa5f09e02b422c7347c6462874a096 to your computer and use it in GitHub Desktop.
Save mrityunjay-tripathi/18fa5f09e02b422c7347c6462874a096 to your computer and use it in GitHub Desktop.
BLEU score calculation in C++
#include <math.h>

#include <algorithm>
#include <cmath>
#include <iostream>
#include <limits>
#include <map>
#include <numeric>
#include <string>
#include <vector>
using namespace std;
// A tokenized sentence (or a single n-gram): an ordered list of words.
using WordVector = std::vector<std::string>;
// Signed integer type used for all lengths and n-gram counts.
using LengthType = long;
/**
 * Writes the elements of a vector-like container to std::cout in the form
 * "(e0, e1, ..., eN)". An empty container is written as "()".
 *
 * @tparam VectorType Container exposing size() and operator[], whose
 *         elements can be streamed with operator<<.
 * @param v Container to print; taken by const reference so it is not copied.
 */
template <typename VectorType = WordVector>
void printVector(const VectorType& v)
{
  std::cout << "(";
  // Emit the separator *before* every element except the first. This avoids
  // computing `v.size() - 1`, which wraps around for an empty container.
  for (size_t i = 0; i < v.size(); ++i)
  {
    if (i > 0)
    {
      std::cout << ", ";
    }
    std::cout << v[i];
  }
  std::cout << ")";
}
std::map<WordVector, LengthType> getNGrams(const WordVector& segment, const size_t maxOrder)
{
std::map<WordVector, LengthType> ngramsCount;
for (size_t order = 1; order < maxOrder + 1; ++order)
{
for (LengthType i = 0; i < segment.size() - order + 1; ++i)
{
WordVector seq = WordVector(segment.begin() + i, segment.begin() + i + order);
ngramsCount[seq]++;
}
}
return ngramsCount;
}
/**
 * Result of a corpus-level BLEU computation, as filled in by computeBLEU().
 *
 * Every scalar field has a default initializer so that a default-constructed
 * object never carries indeterminate values.
 */
struct BLEUMetric
{
  // Final score: geometric mean of the n-gram precisions times brevityPenalty.
  float bleuScore = 0.0f;
  // Penalty factor applied to the geometric mean, derived from `ratio`.
  float brevityPenalty = 0.0f;
  // translationLength divided by referenceLength.
  float ratio = 0.0f;
  // Total number of tokens over all translated segments.
  LengthType translationLength = 0;
  // Sum, over all segments, of the length of the shortest reference.
  LengthType referenceLength = 0;
  // One precision value per n-gram order; index 0 holds the unigram precision.
  std::vector<float> precisions;
};
template <typename ReferenceCorpusType = std::vector<std::vector<WordVector>>,
typename TranslationCorpusType = std::vector<WordVector>>
struct BLEUMetric
computeBLEU(ReferenceCorpusType& referenceCorpus,
TranslationCorpusType& translationCorpus,
size_t maxOrder = 4,
bool smooth = false)
{
std::vector<LengthType> matchesByOrder(maxOrder, 0);
std::vector<LengthType> possibleMatchesByOrder(maxOrder, 0);
LengthType referenceLength = 0, translationLength = 0;
auto refIt = referenceCorpus.cbegin();
auto trIt = translationCorpus.cbegin();
for(; refIt != referenceCorpus.cend(), trIt != translationCorpus.cend(); ++refIt, ++trIt)
{
LengthType min = std::numeric_limits<LengthType>::max();
for (auto t: *refIt)
{
if (min > t.size())
{
min = t.size();
}
}
referenceLength += min;
translationLength += std::size(*trIt);
std::map<WordVector, LengthType> mergedRefNGramCounts;
for (auto t: *refIt)
{
const std::map<WordVector, LengthType> ngrams = getNGrams(t, maxOrder);
for (auto it = ngrams.cbegin(); it != ngrams.cend(); ++it)
{
if (!mergedRefNGramCounts[it->first])
mergedRefNGramCounts[it->first] = it->second;
else
mergedRefNGramCounts[it->first] = std::max(mergedRefNGramCounts[it->first], it->second);
}
}
std::map<WordVector, LengthType> translationNGramCounts = getNGrams(*trIt, maxOrder);
std::map<WordVector, LengthType> overlap;
for (auto it = mergedRefNGramCounts.cbegin(); it != mergedRefNGramCounts.cend(); ++it)
{
if (translationNGramCounts[it->first])
overlap[it->first] = std::min(translationNGramCounts[it->first], it->second);
}
for (auto it = overlap.cbegin(); it != overlap.cend(); ++it)
{
matchesByOrder[it->first.size() - 1] += it->second;
}
for (size_t order = 1; order < maxOrder + 1; ++order)
{
LengthType possibleMatches = std::size(*trIt) - order + 1;
if (possibleMatches > 0)
{
possibleMatchesByOrder[order - 1] += possibleMatches;
}
}
}
struct BLEUMetric bleu;
std::vector<float> precisions(maxOrder, 0.0);
float minPrecision = std::numeric_limits<float>::max();
for (size_t i = 0; i < maxOrder; ++i)
{
if (smooth)
precisions[i] = (matchesByOrder[i] + 1.0) / (possibleMatchesByOrder[i] + 1.0);
else
{
if (possibleMatchesByOrder[i] > 0.0)
{
precisions[i] = float(matchesByOrder[i]) / possibleMatchesByOrder[i];
}
else
precisions[i] = 0.0;
}
if (minPrecision > precisions[i])
minPrecision = precisions[i];
}
float geoMean;
if (minPrecision > 0)
{
float pLogSum = std::accumulate(precisions.begin(), precisions.end(), 0.0,
[maxOrder](float x, float precision){
return x + (1.0 / maxOrder) * std::log(precision); });
geoMean = std::exp(pLogSum);
}
bleu.ratio = float(translationLength) / referenceLength;
if (bleu.ratio > 1.0)
bleu.brevityPenalty = 1.0;
else
bleu.brevityPenalty = std::exp(1.0 - 1.0 / bleu.ratio);
bleu.bleuScore = geoMean * bleu.brevityPenalty;
bleu.translationLength = translationLength;
bleu.referenceLength = referenceLength;
bleu.precisions = precisions;
return bleu;
}
int main()
{
std::vector<std::vector<WordVector>> referenceCorpus
= {{{"this", "is", "my", "house"},
{"this", "is", "my", "car"},
{"this", "is", "my", "bike"}},
{{"this", "is", "my", "table"},
{"this", "is", "my", "chair"},
{"this", "is", "my", "laptop"}},
{{"this", "is", "my", "table"},
{"this", "is", "your", "car"},
{"this", "is", "my", "notebook"}}};
std::vector<WordVector> translationCorpus
= {{"this", "is", "my", "book"},
{"this", "is", "your", "car"},
{"this", "is", "my", "watch"}};
struct BLEUMetric bleu = computeBLEU(referenceCorpus, translationCorpus);
std::cout << "BLEU Score: " << bleu.bleuScore << std::endl;
std::cout << "Brevity Penalty: " << bleu.brevityPenalty << std::endl;
std::cout << "Ratio: " << bleu.ratio << std::endl;
std::cout << "Translation Length: " << bleu.translationLength << std::endl;
std::cout << "Reference Length: " << bleu.referenceLength << std::endl;
std::cout << "Precisions: ";
printVector<std::vector<float>>(bleu.precisions);
std::cout << std::endl;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment