/**
 * @file naive_bayes_classifier_impl.hpp
 * @author Parikshit Ram (pram@cc.gatech.edu)
 * @author Vahab Akbarzadeh (v.akbarzadeh@gmail.com)
 * @author Shihao Jing (shihao.jing810@gmail.com)
 *
 * A Naive Bayes Classifier which parametrically estimates the distribution of
 * the features.  This classifier makes its predictions based on the assumption
 * that the features have been sampled from a set of Gaussians with diagonal
 * covariance.
 *
 * mlpack is free software; you may redistribute it and/or modify it under the
 * terms of the 3-clause BSD license.  You should have received a copy of the
 * 3-clause BSD license along with mlpack.  If not, see
 * http://www.opensource.org/licenses/BSD-3-Clause for more information.
 */
#ifndef MLPACK_METHODS_NAIVE_BAYES_NAIVE_BAYES_CLASSIFIER_IMPL_HPP
#define MLPACK_METHODS_NAIVE_BAYES_NAIVE_BAYES_CLASSIFIER_IMPL_HPP

#include <mlpack/prereqs.hpp>

// In case it hasn't been included already.
#include "naive_bayes_classifier.hpp"

namespace mlpack {
namespace naive_bayes {
template<typename MatType>
NaiveBayesClassifier<MatType>::NaiveBayesClassifier(
    const MatType& data,
    const arma::Row<size_t>& labels,
    const size_t classes,
    const bool incremental) :
    trainingPoints(0) // Set when we call Train().
{
  const size_t dimensionality = data.n_rows;

  // Perform training, after initializing the model to 0 (that is, if Train()
  // won't do that for us, which it won't if we're using the incremental
  // algorithm).
  if (incremental)
  {
    probabilities.zeros(classes);
    means.zeros(dimensionality, classes);
    variances.zeros(dimensionality, classes);
  }
  else
  {
    probabilities.set_size(classes);
    means.set_size(dimensionality, classes);
    variances.set_size(dimensionality, classes);
  }

  Train(data, labels, incremental);
}
template<typename MatType>
NaiveBayesClassifier<MatType>::NaiveBayesClassifier(
    const size_t dimensionality,
    const size_t classes) :
    trainingPoints(0)
{
  // Initialize the model to 0.
  probabilities.zeros(classes);
  means.zeros(dimensionality, classes);
  variances.zeros(dimensionality, classes);
}
template<typename MatType>
void NaiveBayesClassifier<MatType>::Train(const MatType& data,
                                          const arma::Row<size_t>& labels,
                                          const bool incremental)
{
  // Calculate the class probabilities as well as the sample mean and variance
  // for each of the features with respect to each of the labels.
  if (incremental)
  {
    // Use the incremental algorithm.
    // First, un-normalize the probabilities.
    probabilities *= trainingPoints;
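    // From here this is Welford's online update: after un-normalizing,
    // probabilities[label] holds the running count of points seen for each
    // class, and each new point shifts the class mean by delta / count while
    // accumulating squared deviations into the (still unnormalized) variance.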
    for (size_t j = 0; j < data.n_cols; ++j)
    {
      const size_t label = labels[j];
      ++probabilities[label];

      arma::vec delta = data.col(j) - means.col(label);
      means.col(label) += delta / probabilities[label];
      variances.col(label) += delta % (data.col(j) - means.col(label));
    }

    for (size_t i = 0; i < probabilities.n_elem; ++i)
    {
      if (probabilities[i] > 2)
        variances.col(i) /= (probabilities[i] - 1);
    }
  }
  else
  {
    // Set all parameters to zero.
    probabilities.zeros();
    means.zeros();
    variances.zeros();

    // Don't use the incremental algorithm.  This is a two-pass algorithm.  It
    // is possible to calculate the means and variances using a faster
    // one-pass algorithm, but there are some precision and stability issues.
    // If this is too slow, it's an option to use the faster algorithm by
    // default and then have this (and the incremental algorithm) be other
    // options.

    // Calculate the means.
    for (size_t j = 0; j < data.n_cols; ++j)
    {
      const size_t label = labels[j];
      ++probabilities[label];
      means.col(label) += data.col(j);
    }

    // Normalize the means.
    for (size_t i = 0; i < probabilities.n_elem; ++i)
      if (probabilities[i] != 0.0)
        means.col(i) /= probabilities[i];

    // Calculate the variances.
    for (size_t j = 0; j < data.n_cols; ++j)
    {
      const size_t label = labels[j];
      variances.col(label) += square(data.col(j) - means.col(label));
    }

    // Normalize the variances.
    for (size_t i = 0; i < probabilities.n_elem; ++i)
      if (probabilities[i] > 1)
        variances.col(i) /= (probabilities[i] - 1);
  }

  // Ensure that the variances are invertible.
  for (size_t i = 0; i < variances.n_elem; ++i)
    if (variances[i] == 0.0)
      variances[i] = 1e-50;

  probabilities /= data.n_cols;
  trainingPoints += data.n_cols;
}

template<typename MatType>
template<typename VecType>
void NaiveBayesClassifier<MatType>::Train(const VecType& point,
                                          const size_t label)
{
  // We must use the incremental algorithm here.
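  // The stored variances are kept normalized by (count - 1), so we first
  // rescale them back to unnormalized sums of squared deviations, apply
  // Welford's update, and then re-normalize.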
  probabilities *= trainingPoints;
  probabilities[label]++;

  arma::vec delta = point - means.col(label);
  means.col(label) += delta / probabilities[label];

  if (probabilities[label] > 2)
    variances.col(label) *= (probabilities[label] - 2);
  variances.col(label) += (delta % (point - means.col(label)));
  if (probabilities[label] > 1)
    variances.col(label) /= probabilities[label] - 1;

  trainingPoints++;
  probabilities /= trainingPoints;
}

template<typename MatType>
template<typename VecType>
void NaiveBayesClassifier<MatType>::LogLikelihood(
    const VecType& point,
    arma::vec& logLikelihoods) const
{
  // Check that the number of features in the test data is the same as in the
  // training data.
  Log::Assert(point.n_rows == means.n_rows);

  logLikelihoods = arma::log(probabilities);
  arma::mat invVar = 1.0 / variances;

  // Calculate the joint log likelihood of the point for each of the
  // means.n_cols classes.
  // Loop over every class.
  for (size_t i = 0; i < means.n_cols; i++)
  {
    // This is an adaptation of gmm::phi() for the case where the covariance
    // is a diagonal matrix.
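    // For a Gaussian with diagonal covariance, the log-density is
    //   log N(x; mu, var) = -(d / 2) log(2 * pi) - (1 / 2) sum(log(var))
    //                       - (1 / 2) sum((x - mu)^2 / var),
    // which the next few lines compute term by term.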
    arma::vec diffs = point - means.col(i);
    arma::vec rhs = -0.5 * arma::diagmat(invVar.col(i)) * diffs;
    double exponent = arma::accu(diffs % rhs); // log(exp(value)) == value.

    // Calculate the point's log likelihood as a sum of logs to decrease
    // floating point errors.
    logLikelihoods(i) += (point.n_rows / -2.0 * log(2 * M_PI) - 0.5 *
        log(arma::det(arma::diagmat(variances.col(i)))) + exponent);
  }
}

template<typename MatType>
void NaiveBayesClassifier<MatType>::LogLikelihood(
    const MatType& data,
    arma::mat& logLikelihoods) const
{
  logLikelihoods.set_size(means.n_cols, data.n_cols);

  for (size_t i = 0; i < data.n_cols; ++i)
  {
    arma::vec v = logLikelihoods.unsafe_col(i);
    LogLikelihood(data.col(i), v);
  }
}

template<typename MatType>
template<typename VecType>
size_t NaiveBayesClassifier<MatType>::Classify(const VecType& point) const
{
  // Find the label (class) with the maximum log likelihood.
  arma::vec logLikelihoods;
  LogLikelihood(point, logLikelihoods);

  arma::uword maxIndex = 0;
  logLikelihoods.max(maxIndex);
  return maxIndex;
}

template<typename MatType>
template<typename VecType>
void NaiveBayesClassifier<MatType>::Classify(const VecType& point,
                                             size_t& prediction,
                                             arma::vec& probabilities) const
{
  // log(Prob(Y|X)) = Log(Prob(X|Y)) + Log(Prob(Y)) - Log(Prob(X));
  // but LogLikelihood() gives us the unnormalized log likelihood, which is
  // Log(Prob(X|Y)) + Log(Prob(Y)), so we need to subtract the normalization
  // term.
  arma::vec logLikelihoods;
  LogLikelihood(point, logLikelihoods);
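  // Note: summing exp(logLikelihoods) directly can underflow when the
  // log-likelihoods are very negative; shifting by the maximum first (the
  // log-sum-exp trick) is a standard safeguard.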
  const double logProbX = log(arma::accu(exp(logLikelihoods))); // Log(Prob(X)).
  logLikelihoods -= logProbX;

  arma::uword maxIndex = 0;
  logLikelihoods.max(maxIndex);
  prediction = (size_t) maxIndex;
  probabilities = exp(logLikelihoods); // log(exp(value)) == value.
}

template<typename MatType>
void NaiveBayesClassifier<MatType>::Classify(
    const MatType& data,
    arma::Row<size_t>& predictions) const
{
  predictions.set_size(data.n_cols);

  arma::mat logLikelihoods;
  LogLikelihood(data, logLikelihoods);

  for (size_t i = 0; i < data.n_cols; ++i)
  {
    arma::uword maxIndex = 0;
    logLikelihoods.unsafe_col(i).max(maxIndex);
    predictions[i] = maxIndex;
  }
}

template<typename MatType>
void NaiveBayesClassifier<MatType>::Classify(
    const MatType& data,
    arma::Row<size_t>& predictions,
    arma::mat& predictionProbs) const
{
  predictions.set_size(data.n_cols);

  arma::mat logLikelihoods;
  LogLikelihood(data, logLikelihoods);

  arma::vec logProbX(data.n_cols); // log(Prob(X)) for each point.
  for (size_t j = 0; j < data.n_cols; ++j)
  {
    logProbX(j) = log(arma::accu(exp(logLikelihoods.col(j))));
    logLikelihoods.col(j) -= logProbX(j);
  }
  predictionProbs = arma::exp(logLikelihoods);

  // Now calculate the maximum-probability class for each point.
  for (size_t i = 0; i < data.n_cols; ++i)
  {
    arma::uword maxIndex;
    logLikelihoods.unsafe_col(i).max(maxIndex);
    predictions[i] = maxIndex;
  }
}

template<typename MatType>
template<typename Archive>
void NaiveBayesClassifier<MatType>::Serialize(Archive& ar,
                                              const unsigned int /* version */)
{
  ar & data::CreateNVP(means, "means");
  ar & data::CreateNVP(variances, "variances");
  ar & data::CreateNVP(probabilities, "probabilities");
}

} // namespace naive_bayes
} // namespace mlpack

#endif
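For context, a minimal driver in the spirit of the MiniBooNE benchmark mentioned in the comments below might look like this. It is only a sketch: the file names and the two-class setup are assumptions, and labels are loaded as a plain matrix and converted.

#include <mlpack/core.hpp>
#include "naive_bayes_classifier.hpp"

using namespace mlpack;

int main()
{
  // Hypothetical file names; mlpack stores points as columns.
  arma::mat trainData, testData;
  data::Load("train.csv", trainData, true);
  data::Load("test.csv", testData, true);

  // Labels in 0..classes-1, loaded as a matrix and converted to a row.
  arma::mat labelsIn;
  data::Load("train_labels.csv", labelsIn, true);
  arma::Row<size_t> trainLabels =
      arma::conv_to<arma::Row<size_t>>::from(labelsIn.row(0));

  // Train on construction (two classes, batch algorithm).
  naive_bayes::NaiveBayesClassifier<> nbc(trainData, trainLabels, 2);

  // Classify the test points.
  arma::Row<size_t> predictions;
  nbc.Classify(testData, predictions);

  return 0;
}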
/**
 * @file parallel_naive_bayes_classifier_impl.hpp
 * @author Parikshit Ram (pram@cc.gatech.edu)
 * @author Vahab Akbarzadeh (v.akbarzadeh@gmail.com)
 * @author Shihao Jing (shihao.jing810@gmail.com)
 *
 * A Naive Bayes Classifier which parametrically estimates the distribution of
 * the features.  This classifier makes its predictions based on the assumption
 * that the features have been sampled from a set of Gaussians with diagonal
 * covariance.
 *
 * mlpack is free software; you may redistribute it and/or modify it under the
 * terms of the 3-clause BSD license.  You should have received a copy of the
 * 3-clause BSD license along with mlpack.  If not, see
 * http://www.opensource.org/licenses/BSD-3-Clause for more information.
 */
#ifndef MLPACK_METHODS_NAIVE_BAYES_PARALLEL_NAIVE_BAYES_CLASSIFIER_IMPL_HPP
#define MLPACK_METHODS_NAIVE_BAYES_PARALLEL_NAIVE_BAYES_CLASSIFIER_IMPL_HPP

#include <mlpack/prereqs.hpp>

// In case it hasn't been included already.
#include "parallel_naive_bayes_classifier.hpp"
namespace mlpack {
namespace naive_bayes {

template<typename MatType>
ParallelNaiveBayesClassifier<MatType>::ParallelNaiveBayesClassifier(
    const MatType& data,
    const arma::Row<size_t>& labels,
    const size_t classes,
    const bool incremental) :
    trainingPoints(0) // Set when we call Train().
{
  const size_t dimensionality = data.n_rows;

  // Perform training, after initializing the model to 0 (that is, if Train()
  // won't do that for us, which it won't if we're using the incremental
  // algorithm).
  if (incremental)
  {
    probabilities.zeros(classes);
    means.zeros(dimensionality, classes);
    variances.zeros(dimensionality, classes);
  }
  else
  {
    probabilities.set_size(classes);
    means.set_size(dimensionality, classes);
    variances.set_size(dimensionality, classes);
  }

  Train(data, labels, incremental);
}

template<typename MatType>
ParallelNaiveBayesClassifier<MatType>::ParallelNaiveBayesClassifier(
    const size_t dimensionality,
    const size_t classes) :
    trainingPoints(0)
{
  // Initialize the model to 0.
  probabilities.zeros(classes);
  means.zeros(dimensionality, classes);
  variances.zeros(dimensionality, classes);
}

template<typename MatType>
void ParallelNaiveBayesClassifier<MatType>::Train(
    const MatType& data,
    const arma::Row<size_t>& labels,
    const bool incremental)
{
  // Calculate the class probabilities as well as the sample mean and variance
  // for each of the features with respect to each of the labels.
  if (incremental)
  {
    // Use the incremental algorithm.
    // First, un-normalize the probabilities.
    probabilities *= trainingPoints;

    for (size_t j = 0; j < data.n_cols; ++j)
    {
      const size_t label = labels[j];
      ++probabilities[label];

      arma::vec delta = data.col(j) - means.col(label);
      means.col(label) += delta / probabilities[label];
      variances.col(label) += delta % (data.col(j) - means.col(label));
    }

    for (size_t i = 0; i < probabilities.n_elem; ++i)
    {
      if (probabilities[i] > 2)
        variances.col(i) /= (probabilities[i] - 1);
    }
  }
  else
  {
    // Set all parameters to zero.
    probabilities.zeros();
    means.zeros();
    variances.zeros();

    // Don't use the incremental algorithm.  This is a two-pass algorithm.  It
    // is possible to calculate the means and variances using a faster
    // one-pass algorithm, but there are some precision and stability issues.
    // If this is too slow, it's an option to use the faster algorithm by
    // default and then have this (and the incremental algorithm) be other
    // options.
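    // Each thread below works on a disjoint set of labels, so the writes to
    // probabilities[label] and means.col(label) do not race.  Note that the
    // pragma only takes effect if this translation unit is compiled with
    // OpenMP support (e.g. -fopenmp); otherwise it is silently ignored.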
    // Parallel loop over the labels for calculating the means.
    #pragma omp parallel for schedule(dynamic) shared(probabilities, means, data)
    for (size_t label = 0; label < probabilities.n_elem; ++label)
    {
      for (size_t i = 0; i < data.n_cols; ++i)
      {
        if (label == labels[i])
        {
          ++probabilities[label];
          means.col(label) += data.col(i);
        }
      }

      // Normalize the mean for this label.
      if (probabilities[label] != 0.0)
        means.col(label) /= probabilities[label];
    }

    // Calculate the variances.
    for (size_t j = 0; j < data.n_cols; ++j)
    {
      const size_t label = labels[j];
      variances.col(label) += square(data.col(j) - means.col(label));
    }

    // Normalize the variances.
    for (size_t i = 0; i < probabilities.n_elem; ++i)
      if (probabilities[i] > 1)
        variances.col(i) /= (probabilities[i] - 1);
  }

  // Ensure that the variances are invertible.
  for (size_t i = 0; i < variances.n_elem; ++i)
    if (variances[i] == 0.0)
      variances[i] = 1e-50;

  probabilities /= data.n_cols;
  trainingPoints += data.n_cols;
}
template<typename MatType>
template<typename VecType>
void ParallelNaiveBayesClassifier<MatType>::Train(const VecType& point,
                                                  const size_t label)
{
  // We must use the incremental algorithm here.
  probabilities *= trainingPoints;
  probabilities[label]++;

  arma::vec delta = point - means.col(label);
  means.col(label) += delta / probabilities[label];

  if (probabilities[label] > 2)
    variances.col(label) *= (probabilities[label] - 2);
  variances.col(label) += (delta % (point - means.col(label)));
  if (probabilities[label] > 1)
    variances.col(label) /= probabilities[label] - 1;

  trainingPoints++;
  probabilities /= trainingPoints;
}

template<typename MatType>
template<typename VecType>
void ParallelNaiveBayesClassifier<MatType>::LogLikelihood(
    const VecType& point,
    arma::vec& logLikelihoods) const
{
  // Check that the number of features in the test data is the same as in the
  // training data.
  Log::Assert(point.n_rows == means.n_rows);

  logLikelihoods = arma::log(probabilities);
  arma::mat invVar = 1.0 / variances;

  // Calculate the joint log likelihood of the point for each of the
  // means.n_cols classes.
  // Loop over every class.
  for (size_t i = 0; i < means.n_cols; i++)
  {
    // This is an adaptation of gmm::phi() for the case where the covariance
    // is a diagonal matrix.
    arma::vec diffs = point - means.col(i);
    arma::vec rhs = -0.5 * arma::diagmat(invVar.col(i)) * diffs;
    double exponent = arma::accu(diffs % rhs); // log(exp(value)) == value.

    // Calculate the point's log likelihood as a sum of logs to decrease
    // floating point errors.
    logLikelihoods(i) += (point.n_rows / -2.0 * log(2 * M_PI) - 0.5 *
        log(arma::det(arma::diagmat(variances.col(i)))) + exponent);
  }
}

template<typename MatType>
void ParallelNaiveBayesClassifier<MatType>::LogLikelihood(
    const MatType& data,
    arma::mat& logLikelihoods) const
{
  logLikelihoods.set_size(means.n_cols, data.n_cols);

  for (size_t i = 0; i < data.n_cols; ++i)
  {
    arma::vec v = logLikelihoods.unsafe_col(i);
    LogLikelihood(data.col(i), v);
  }
}
template<typename MatType>
template<typename VecType>
size_t ParallelNaiveBayesClassifier<MatType>::Classify(
    const VecType& point) const
{
  // Find the label (class) with the maximum log likelihood.
  arma::vec logLikelihoods;
  LogLikelihood(point, logLikelihoods);

  arma::uword maxIndex = 0;
  logLikelihoods.max(maxIndex);
  return maxIndex;
}

template<typename MatType>
template<typename VecType>
void ParallelNaiveBayesClassifier<MatType>::Classify(
    const VecType& point,
    size_t& prediction,
    arma::vec& probabilities) const
{
  // log(Prob(Y|X)) = Log(Prob(X|Y)) + Log(Prob(Y)) - Log(Prob(X));
  // but LogLikelihood() gives us the unnormalized log likelihood, which is
  // Log(Prob(X|Y)) + Log(Prob(Y)), so we need to subtract the normalization
  // term.
  arma::vec logLikelihoods;
  LogLikelihood(point, logLikelihoods);

  const double logProbX = log(arma::accu(exp(logLikelihoods))); // Log(Prob(X)).
  logLikelihoods -= logProbX;

  arma::uword maxIndex = 0;
  logLikelihoods.max(maxIndex);
  prediction = (size_t) maxIndex;
  probabilities = exp(logLikelihoods); // log(exp(value)) == value.
}

template<typename MatType>
void ParallelNaiveBayesClassifier<MatType>::Classify(
    const MatType& data,
    arma::Row<size_t>& predictions) const
{
  predictions.set_size(data.n_cols);

  arma::mat logLikelihoods;
  LogLikelihood(data, logLikelihoods);
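  // Each iteration writes a distinct element of predictions and reads a
  // distinct column of logLikelihoods, so the loop is safe to parallelize.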
  #pragma omp parallel for
  for (size_t i = 0; i < data.n_cols; ++i)
  {
    arma::uword maxIndex = 0;
    logLikelihoods.unsafe_col(i).max(maxIndex);
    predictions[i] = maxIndex;
  }
}

template<typename MatType>
void ParallelNaiveBayesClassifier<MatType>::Classify(
    const MatType& data,
    arma::Row<size_t>& predictions,
    arma::mat& predictionProbs) const
{
  predictions.set_size(data.n_cols);

  arma::mat logLikelihoods;
  LogLikelihood(data, logLikelihoods);

  arma::vec logProbX(data.n_cols); // log(Prob(X)) for each point.
  for (size_t j = 0; j < data.n_cols; ++j)
  {
    logProbX(j) = log(arma::accu(exp(logLikelihoods.col(j))));
    logLikelihoods.col(j) -= logProbX(j);
  }
  predictionProbs = arma::exp(logLikelihoods);

  // Now calculate the maximum-probability class for each point.
  for (size_t i = 0; i < data.n_cols; ++i)
  {
    arma::uword maxIndex;
    logLikelihoods.unsafe_col(i).max(maxIndex);
    predictions[i] = maxIndex;
  }
}

template<typename MatType>
template<typename Archive>
void ParallelNaiveBayesClassifier<MatType>::Serialize(
    Archive& ar,
    const unsigned int /* version */)
{
  ar & data::CreateNVP(means, "means");
  ar & data::CreateNVP(variances, "variances");
  ar & data::CreateNVP(probabilities, "probabilities");
}

} // namespace naive_bayes
} // namespace mlpack

#endif
Yes, in this form the loop can be easily parallelized, but the algorithmic complexity goes up. The sequential version finishes in O(nd) (n points, d dimensions), whereas this version takes around O(Lnd/t) (L classes, t threads); for example, with L = 2 and t = 4 that is about O(nd/2), so at most a ~2x improvement. I'll implement and test this right now.
I made the changes in both places and ran the classifier on the MiniBooNE dataset benchmark written earlier. It doesn't use more than 1 thread, and I can't figure out why; I exported OMP_NUM_THREADS=4 before executing the program (see the OpenMP check sketched after the listings below). Here are the runs from the current NBC and the version above:
λ nbc/MiniBooNE ∴ perf stat ./parallel -v
[INFO ] Loading './data/MiniBooNE_PID.txt' as raw ASCII formatted data. Size is 50x 130064.
Precision : 0.289867
Recall : 1
[INFO ]
[INFO ] Execution parameters:
[INFO ] help: 0
[INFO ] info:
[INFO ] verbose: 1
[INFO ] version: 0
[INFO ] Program timers:
[INFO ] CV-partition: 0.001710s
[INFO ] Classify: 0.031765s
[INFO ] Train: 0.013651s
[INFO ] loading_data: 2.725474s
[INFO ]   total_time: 2.773458s

Performance counter stats for './parallel -v':

      3054.002563      task-clock (msec)         #    1.099 CPUs utilized
               48      context-switches          #    0.016 K/sec
                4      cpu-migrations            #    0.001 K/sec
            2,370      page-faults               #    0.776 K/sec
  11,01,85,23,328      cycles                    #    3.608 GHz
  25,71,23,01,197      instructions              #    2.33  insn per cycle
   6,00,74,52,265      branches                  # 1967.075 M/sec
      1,43,34,142      branch-misses             #    0.24% of all branches

      2.779093968 seconds time elapsed
λ nbc/MiniBooNE ∴ perf stat ./serial -v
[INFO ] Loading './data/MiniBooNE_PID.txt' as raw ASCII formatted data. Size is 50x 130064.
Precision : 0.280485
Recall : 0.999285
[INFO ]
[INFO ] Execution parameters:
[INFO ] help: 0
[INFO ] info:
[INFO ] verbose: 1
[INFO ] version: 0
[INFO ] Program timers:
[INFO ] CV-partition: 0.001492s
[INFO ] Classify: 0.030842s
[INFO ] Train: 0.008971s
[INFO ] loading_data: 2.721527s
[INFO ]   total_time: 2.763616s

Performance counter stats for './serial -v':

      2975.846853      task-clock (msec)         #    1.075 CPUs utilized
               50      context-switches          #    0.017 K/sec
                2      cpu-migrations            #    0.001 K/sec
            3,385      page-faults               #    0.001 M/sec
  10,89,25,83,327      cycles                    #    3.660 GHz
  25,82,14,31,973      instructions              #    2.37  insn per cycle
   6,02,90,69,939      branches                  # 2026.001 M/sec
      1,43,14,804      branch-misses             #    0.24% of all branches

      2.767538280 seconds time elapsed
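One quick way to rule out a build problem: if the binary is compiled without OpenMP support (e.g. a missing -fopenmp for GCC/Clang), every #pragma omp is silently ignored and OMP_NUM_THREADS has no effect, which would match the ~1 CPU utilized above. A minimal check along these lines (a sketch, not part of the gist):

#include <iostream>
#ifdef _OPENMP
  #include <omp.h>
#endif

int main()
{
  // _OPENMP is only defined when the compiler was invoked with OpenMP
  // enabled; omp_get_max_threads() reports the thread limit it will use.
#ifdef _OPENMP
  std::cout << "OpenMP enabled, max threads: " << omp_get_max_threads()
            << std::endl;
#else
  std::cout << "OpenMP not enabled at compile time." << std::endl;
#endif
  return 0;
}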
Here's a thought for the first one. I am not sure if it will work in C++; I've played around a bit with it in MATLAB and it gives a nice result.

Instead of iterating over the points, iterate over the labels. Then probabilities[label] will be equal to the number of points with labels == label. Also, means.col(label) will be equal to the sum of the columns of data for which labels == label. In MATLAB code:

probabilities(label) = sum(labels == label);
means(:, label) = sum(data(:, labels == label), 2);

In MATLAB this is much faster because of the fewer iterations (which are costly in MATLAB). I doubt the vectorization by itself will give us any significant boost in C++. It may actually be slower, because now we are doing one N-sized loop per label (because of labels == label), where before we only did one such loop. However, in this form, the loop is embarrassingly parallel.
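For reference, the same label-major form translates almost directly to Armadillo. A rough sketch, using the member names from the classifier above (whether it beats the plain loop in C++ is exactly the open question):

// Label-major pass: one find() and one batched sum per class.
for (size_t label = 0; label < probabilities.n_elem; ++label)
{
  // Indices of all points belonging to this class.
  const arma::uvec idx = arma::find(labels == label);
  probabilities[label] = idx.n_elem;
  if (idx.n_elem > 0)
    means.col(label) = arma::sum(data.cols(idx), 1) / idx.n_elem;
}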