mrityunjay-tripathi/bleu.cpp

## bleu.cpp
#include <math.h>

#include <algorithm>
#include <cmath>
#include <iostream>
#include <limits>
#include <map>
#include <numeric>
#include <string>
#include <vector>

using namespace std;
// A sequence of word strings (one segment / sentence).
using WordVector = std::vector<std::string>;
// Signed integer type used for lengths and n-gram counts.
using LengthType = long int;

/**
 * Writes the elements of `v` to std::cout as "(e0, e1, ..., eN)".
 *
 * An empty container is printed as "()". The container is taken by const
 * reference so it is not copied just to be printed. No trailing newline is
 * written.
 *
 * @tparam VectorType Container providing size() and operator[] whose
 *         elements can be streamed with operator<<.
 * @param v Container to print.
 */
template <typename VectorType = WordVector>
void printVector(const VectorType& v)
{
  std::cout << "(";
  for (size_t i = 0; i < v.size(); ++i)
  {
    // Separator goes before every element except the first; this avoids the
    // `size() - 1` arithmetic that wraps around for an empty container.
    if (i > 0)
      std::cout << ", ";
    std::cout << v[i];
  }
  std::cout << ")";
}

std::map<WordVector, LengthType> getNGrams(const WordVector& segment, const size_t maxOrder)
{
  std::map<WordVector, LengthType> ngramsCount;
  for (size_t order = 1; order < maxOrder + 1; ++order)
  {
    for (LengthType i = 0; i < segment.size() - order + 1; ++i)
    {
      WordVector seq = WordVector(segment.begin() + i, segment.begin() + i + order);
      ngramsCount[seq]++;
    }
  }
  return ngramsCount;
}

/**
 * Result bundle filled in by computeBLEU.
 *
 * All scalar members default to zero so a default-constructed object never
 * exposes indeterminate values.
 */
struct BLEUMetric
{
  // Geometric mean of the n-gram precisions multiplied by the brevity penalty.
  float bleuScore = 0.0f;
  // Penalty factor applied when the translation is not longer than the reference.
  float brevityPenalty = 0.0f;
  // translationLength divided by referenceLength.
  float ratio = 0.0f;
  // Total number of words over all translated segments.
  LengthType translationLength = 0;
  // Sum over segments of the shortest reference length.
  LengthType referenceLength = 0;
  // Precision per n-gram order; index i holds the precision for order i + 1.
  std::vector<float> precisions;
};

template <typename ReferenceCorpusType = std::vector<std::vector<WordVector>>,
          typename TranslationCorpusType = std::vector<WordVector>>
struct BLEUMetric
computeBLEU(ReferenceCorpusType& referenceCorpus,
            TranslationCorpusType& translationCorpus,
            size_t maxOrder = 4,
            bool smooth = false)
{
  std::vector<LengthType> matchesByOrder(maxOrder, 0);
  std::vector<LengthType> possibleMatchesByOrder(maxOrder, 0);
  LengthType referenceLength = 0, translationLength = 0;

  auto refIt = referenceCorpus.cbegin();
  auto trIt = translationCorpus.cbegin();
  for(; refIt != referenceCorpus.cend(), trIt != translationCorpus.cend(); ++refIt, ++trIt)
  {
    LengthType min = std::numeric_limits<LengthType>::max();
    for (auto t: *refIt)
    {
      if (min > t.size())
      {
        min = t.size();
      }
    }
    referenceLength += min;
    translationLength += std::size(*trIt);

    std::map<WordVector, LengthType> mergedRefNGramCounts;
    for (auto t: *refIt)
    {
      const std::map<WordVector, LengthType> ngrams = getNGrams(t, maxOrder);
      for (auto it = ngrams.cbegin(); it != ngrams.cend(); ++it)
      {
        if (!mergedRefNGramCounts[it->first])
          mergedRefNGramCounts[it->first] = it->second;
        else
          mergedRefNGramCounts[it->first] = std::max(mergedRefNGramCounts[it->first], it->second);
      }
    }

    std::map<WordVector, LengthType> translationNGramCounts = getNGrams(*trIt, maxOrder);
    std::map<WordVector, LengthType> overlap;
    for (auto it = mergedRefNGramCounts.cbegin(); it != mergedRefNGramCounts.cend(); ++it)
    {
        if (translationNGramCounts[it->first])
          overlap[it->first] = std::min(translationNGramCounts[it->first], it->second);
    }

    for (auto it = overlap.cbegin(); it != overlap.cend(); ++it)
    {
      matchesByOrder[it->first.size() - 1] += it->second;
    }

    for (size_t order = 1; order < maxOrder + 1; ++order)
    {
      LengthType possibleMatches = std::size(*trIt) - order + 1;
      if (possibleMatches > 0)
      {
        possibleMatchesByOrder[order - 1] += possibleMatches;
      }
    }
  }

  struct BLEUMetric bleu;

  std::vector<float> precisions(maxOrder, 0.0);
  float minPrecision = std::numeric_limits<float>::max();
  for (size_t i = 0; i < maxOrder; ++i)
  {
    if (smooth)
      precisions[i] = (matchesByOrder[i] + 1.0) / (possibleMatchesByOrder[i] + 1.0);
    else
    {
      if (possibleMatchesByOrder[i] > 0.0)
      {
        precisions[i] = float(matchesByOrder[i]) / possibleMatchesByOrder[i];
      }
      else
        precisions[i] = 0.0;
    }
    if (minPrecision > precisions[i])
      minPrecision = precisions[i];
  }

  float geoMean;
  if (minPrecision > 0)
  {
    float pLogSum = std::accumulate(precisions.begin(), precisions.end(), 0.0,
            [maxOrder](float x, float precision){
            return x + (1.0 / maxOrder) * std::log(precision); });
    geoMean = std::exp(pLogSum);
  }
  bleu.ratio = float(translationLength) / referenceLength;
  if (bleu.ratio > 1.0)
    bleu.brevityPenalty = 1.0;
  else
    bleu.brevityPenalty = std::exp(1.0 - 1.0 / bleu.ratio);

  bleu.bleuScore = geoMean * bleu.brevityPenalty;
  bleu.translationLength = translationLength;
  bleu.referenceLength = referenceLength;
  bleu.precisions = precisions;
  return bleu;
}

int main()
{
  std::vector<std::vector<WordVector>> referenceCorpus
      = {{{"this", "is", "my", "house"},
          {"this", "is", "my", "car"},
          {"this", "is", "my", "bike"}},

          {{"this", "is", "my", "table"},
          {"this", "is", "my", "chair"},
          {"this", "is", "my", "laptop"}},

          {{"this", "is", "my", "table"},
          {"this", "is", "your", "car"},
          {"this", "is", "my", "notebook"}}};

  std::vector<WordVector> translationCorpus
      = {{"this", "is", "my", "book"},
         {"this", "is", "your", "car"},
         {"this", "is", "my", "watch"}};
  struct BLEUMetric bleu = computeBLEU(referenceCorpus, translationCorpus);
  std::cout << "BLEU Score: " << bleu.bleuScore << std::endl;
  std::cout << "Brevity Penalty: " << bleu.brevityPenalty << std::endl;
  std::cout << "Ratio: " << bleu.ratio << std::endl;
  std::cout << "Translation Length: " << bleu.translationLength << std::endl;
  std::cout << "Reference Length: " << bleu.referenceLength << std::endl;
  std::cout << "Precisions: ";
  printVector<std::vector<float>>(bleu.precisions);
  std::cout << std::endl;
}
	#include <iostream>
	#include <string>
	#include <map>
	#include <vector>
	#include <limits>
	#include <math.h>
	#include <numeric>

	using namespace std;
	// A sequence of word strings (one segment / sentence).
	using WordVector = std::vector<std::string>;
	// Signed integer type used for lengths and n-gram counts.
	using LengthType = long int;

	/**
	 * Writes the elements of `v` to std::cout as "(e0, e1, ..., eN)".
	 *
	 * An empty container is printed as "()". The container is taken by const
	 * reference so it is not copied just to be printed. No trailing newline is
	 * written.
	 *
	 * @tparam VectorType Container providing size() and operator[] whose
	 *         elements can be streamed with operator<<.
	 * @param v Container to print.
	 */
	template <typename VectorType = WordVector>
	void printVector(const VectorType& v)
	{
	  std::cout << "(";
	  for (size_t i = 0; i < v.size(); ++i)
	  {
	    // Separator goes before every element except the first; this avoids the
	    // `size() - 1` arithmetic that wraps around for an empty container.
	    if (i > 0)
	      std::cout << ", ";
	    std::cout << v[i];
	  }
	  std::cout << ")";
	}

	std::map<WordVector, LengthType> getNGrams(const WordVector& segment, const size_t maxOrder)
	{
	std::map<WordVector, LengthType> ngramsCount;
	for (size_t order = 1; order < maxOrder + 1; ++order)
	{
	for (LengthType i = 0; i < segment.size() - order + 1; ++i)
	{
	WordVector seq = WordVector(segment.begin() + i, segment.begin() + i + order);
	ngramsCount[seq]++;
	}
	}
	return ngramsCount;
	}

	/**
	 * Result bundle filled in by computeBLEU.
	 *
	 * All scalar members default to zero so a default-constructed object never
	 * exposes indeterminate values.
	 */
	struct BLEUMetric
	{
	  // Geometric mean of the n-gram precisions multiplied by the brevity penalty.
	  float bleuScore = 0.0f;
	  // Penalty factor applied when the translation is not longer than the reference.
	  float brevityPenalty = 0.0f;
	  // translationLength divided by referenceLength.
	  float ratio = 0.0f;
	  // Total number of words over all translated segments.
	  LengthType translationLength = 0;
	  // Sum over segments of the shortest reference length.
	  LengthType referenceLength = 0;
	  // Precision per n-gram order; index i holds the precision for order i + 1.
	  std::vector<float> precisions;
	};

	template <typename ReferenceCorpusType = std::vector<std::vector<WordVector>>,
	typename TranslationCorpusType = std::vector<WordVector>>
	struct BLEUMetric
	computeBLEU(ReferenceCorpusType& referenceCorpus,
	TranslationCorpusType& translationCorpus,
	size_t maxOrder = 4,
	bool smooth = false)
	{
	std::vector<LengthType> matchesByOrder(maxOrder, 0);
	std::vector<LengthType> possibleMatchesByOrder(maxOrder, 0);
	LengthType referenceLength = 0, translationLength = 0;

	auto refIt = referenceCorpus.cbegin();
	auto trIt = translationCorpus.cbegin();
	for(; refIt != referenceCorpus.cend(), trIt != translationCorpus.cend(); ++refIt, ++trIt)
	{
	LengthType min = std::numeric_limits<LengthType>::max();
	for (auto t: *refIt)
	{
	if (min > t.size())
	{
	min = t.size();
	}
	}
	referenceLength += min;
	translationLength += std::size(*trIt);

	std::map<WordVector, LengthType> mergedRefNGramCounts;
	for (auto t: *refIt)
	{
	const std::map<WordVector, LengthType> ngrams = getNGrams(t, maxOrder);
	for (auto it = ngrams.cbegin(); it != ngrams.cend(); ++it)
	{
	if (!mergedRefNGramCounts[it->first])
	mergedRefNGramCounts[it->first] = it->second;
	else
	mergedRefNGramCounts[it->first] = std::max(mergedRefNGramCounts[it->first], it->second);
	}
	}

	std::map<WordVector, LengthType> translationNGramCounts = getNGrams(*trIt, maxOrder);
	std::map<WordVector, LengthType> overlap;
	for (auto it = mergedRefNGramCounts.cbegin(); it != mergedRefNGramCounts.cend(); ++it)
	{
	if (translationNGramCounts[it->first])
	overlap[it->first] = std::min(translationNGramCounts[it->first], it->second);
	}

	for (auto it = overlap.cbegin(); it != overlap.cend(); ++it)
	{
	matchesByOrder[it->first.size() - 1] += it->second;
	}

	for (size_t order = 1; order < maxOrder + 1; ++order)
	{
	LengthType possibleMatches = std::size(*trIt) - order + 1;
	if (possibleMatches > 0)
	{
	possibleMatchesByOrder[order - 1] += possibleMatches;
	}
	}
	}

	struct BLEUMetric bleu;

	std::vector<float> precisions(maxOrder, 0.0);
	float minPrecision = std::numeric_limits<float>::max();
	for (size_t i = 0; i < maxOrder; ++i)
	{
	if (smooth)
	precisions[i] = (matchesByOrder[i] + 1.0) / (possibleMatchesByOrder[i] + 1.0);
	else
	{
	if (possibleMatchesByOrder[i] > 0.0)
	{
	precisions[i] = float(matchesByOrder[i]) / possibleMatchesByOrder[i];
	}
	else
	precisions[i] = 0.0;
	}
	if (minPrecision > precisions[i])
	minPrecision = precisions[i];
	}

	float geoMean;
	if (minPrecision > 0)
	{
	float pLogSum = std::accumulate(precisions.begin(), precisions.end(), 0.0,
	[maxOrder](float x, float precision){
	return x + (1.0 / maxOrder) * std::log(precision); });
	geoMean = std::exp(pLogSum);
	}
	bleu.ratio = float(translationLength) / referenceLength;
	if (bleu.ratio > 1.0)
	bleu.brevityPenalty = 1.0;
	else
	bleu.brevityPenalty = std::exp(1.0 - 1.0 / bleu.ratio);

	bleu.bleuScore = geoMean * bleu.brevityPenalty;
	bleu.translationLength = translationLength;
	bleu.referenceLength = referenceLength;
	bleu.precisions = precisions;
	return bleu;
	}

	int main()
	{
	std::vector<std::vector<WordVector>> referenceCorpus
	= {{{"this", "is", "my", "house"},
	{"this", "is", "my", "car"},
	{"this", "is", "my", "bike"}},

	{{"this", "is", "my", "table"},
	{"this", "is", "my", "chair"},
	{"this", "is", "my", "laptop"}},

	{{"this", "is", "my", "table"},
	{"this", "is", "your", "car"},
	{"this", "is", "my", "notebook"}}};

	std::vector<WordVector> translationCorpus
	= {{"this", "is", "my", "book"},
	{"this", "is", "your", "car"},
	{"this", "is", "my", "watch"}};
	struct BLEUMetric bleu = computeBLEU(referenceCorpus, translationCorpus);
	std::cout << "BLEU Score: " << bleu.bleuScore << std::endl;
	std::cout << "Brevity Penalty: " << bleu.brevityPenalty << std::endl;
	std::cout << "Ratio: " << bleu.ratio << std::endl;
	std::cout << "Translation Length: " << bleu.translationLength << std::endl;
	std::cout << "Reference Length: " << bleu.referenceLength << std::endl;
	std::cout << "Precisions: ";
	printVector<std::vector<float>>(bleu.precisions);
	std::cout << std::endl;
	}