@shikharbhardwaj
Last active June 9, 2017 15:26
Implementation of the parallel Naive Bayes classifier in mlpack
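
Before the code, a minimal usage sketch (hypothetical: random data and dummy labels, and assuming the class template is instantiated with arma::mat; this is not part of the gist):

#include <mlpack/core.hpp>
#include "parallel_naive_bayes_classifier.hpp"

using namespace mlpack::naive_bayes;

int main()
{
  // mlpack convention: one point per column, so this is 1000 points in 5
  // dimensions, with 3 classes.
  arma::mat data(5, 1000, arma::fill::randu);
  arma::Row<size_t> labels(1000);
  for (size_t i = 0; i < labels.n_elem; ++i)
    labels[i] = i % 3; // Dummy labels cycling over the 3 classes.

  // The constructor trains immediately (batch mode: incremental = false).
  ParallelNaiveBayesClassifier<arma::mat> nbc(data, labels, 3, false);

  // Classify the training set.
  arma::Row<size_t> predictions;
  nbc.Classify(data, predictions);

  return 0;
}
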
/**
 * @file naive_bayes_classifier_impl.hpp
 * @author Parikshit Ram (pram@cc.gatech.edu)
 * @author Vahab Akbarzadeh (v.akbarzadeh@gmail.com)
 * @author Shihao Jing (shihao.jing810@gmail.com)
 *
 * A Naive Bayes Classifier which parametrically estimates the distribution of
 * the features. This classifier makes its predictions based on the assumption
 * that the features have been sampled from a set of Gaussians with diagonal
 * covariance.
 *
 * mlpack is free software; you may redistribute it and/or modify it under the
 * terms of the 3-clause BSD license. You should have received a copy of the
 * 3-clause BSD license along with mlpack. If not, see
 * http://www.opensource.org/licenses/BSD-3-Clause for more information.
 */
#ifndef MLPACK_METHODS_NAIVE_BAYES_NAIVE_BAYES_CLASSIFIER_IMPL_HPP
#define MLPACK_METHODS_NAIVE_BAYES_NAIVE_BAYES_CLASSIFIER_IMPL_HPP

#include <mlpack/prereqs.hpp>

// In case it hasn't been included already.
#include "naive_bayes_classifier.hpp"

namespace mlpack {
namespace naive_bayes {
template<typename MatType>
NaiveBayesClassifier<MatType>::NaiveBayesClassifier(
    const MatType& data,
    const arma::Row<size_t>& labels,
    const size_t classes,
    const bool incremental) :
    trainingPoints(0) // Set when we call Train().
{
  const size_t dimensionality = data.n_rows;

  // Perform training, after initializing the model to 0 (that is, if Train()
  // won't do that for us, which it won't if we're using the incremental
  // algorithm).
  if (incremental)
  {
    probabilities.zeros(classes);
    means.zeros(dimensionality, classes);
    variances.zeros(dimensionality, classes);
  }
  else
  {
    probabilities.set_size(classes);
    means.set_size(dimensionality, classes);
    variances.set_size(dimensionality, classes);
  }

  Train(data, labels, incremental);
}
template<typename MatType>
NaiveBayesClassifier<MatType>::NaiveBayesClassifier(
    const size_t dimensionality,
    const size_t classes) :
    trainingPoints(0)
{
  // Initialize model to 0.
  probabilities.zeros(classes);
  means.zeros(dimensionality, classes);
  variances.zeros(dimensionality, classes);
}
template<typename MatType>
void NaiveBayesClassifier<MatType>::Train(const MatType& data,
                                          const arma::Row<size_t>& labels,
                                          const bool incremental)
{
  // Calculate the class probabilities as well as the sample mean and variance
  // for each of the features with respect to each of the labels.
  if (incremental)
  {
    // Use the incremental algorithm.
    // First, de-normalize probabilities.
    probabilities *= trainingPoints;

    for (size_t j = 0; j < data.n_cols; ++j)
    {
      const size_t label = labels[j];
      ++probabilities[label];
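      // Welford-style online update: shift the running mean towards the new
      // point by delta / count, and accumulate the unnormalized variance as
      // (x - oldMean) % (x - newMean).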
      arma::vec delta = data.col(j) - means.col(label);
      means.col(label) += delta / probabilities[label];
      variances.col(label) += delta % (data.col(j) - means.col(label));
    }

    for (size_t i = 0; i < probabilities.n_elem; ++i)
    {
      if (probabilities[i] > 2)
        variances.col(i) /= (probabilities[i] - 1);
    }
  }
  else
  {
    // Set all parameters to zero.
    probabilities.zeros();
    means.zeros();
    variances.zeros();

    // Don't use the incremental algorithm. This is a two-pass algorithm. It
    // is possible to calculate the means and variances using a faster
    // one-pass algorithm, but there are some precision and stability issues.
    // If this is too slow, it's an option to use the faster algorithm by
    // default and then have this (and the incremental algorithm) be other
    // options.

    // Calculate the means.
    for (size_t j = 0; j < data.n_cols; ++j)
    {
      const size_t label = labels[j];
      ++probabilities[label];
      means.col(label) += data.col(j);
    }

    // Normalize means.
    for (size_t i = 0; i < probabilities.n_elem; ++i)
      if (probabilities[i] != 0.0)
        means.col(i) /= probabilities[i];

    // Calculate variances.
    for (size_t j = 0; j < data.n_cols; ++j)
    {
      const size_t label = labels[j];
      variances.col(label) += square(data.col(j) - means.col(label));
    }

    // Normalize variances.
    for (size_t i = 0; i < probabilities.n_elem; ++i)
      if (probabilities[i] > 1)
        variances.col(i) /= (probabilities[i] - 1);
  }

  // Ensure that the variances are invertible.
  for (size_t i = 0; i < variances.n_elem; ++i)
    if (variances[i] == 0.0)
      variances[i] = 1e-50;

  probabilities /= data.n_cols;
  trainingPoints += data.n_cols;
}
template<typename MatType>
template<typename VecType>
void NaiveBayesClassifier<MatType>::Train(const VecType& point,
                                          const size_t label)
{
  // We must use the incremental algorithm here.
  probabilities *= trainingPoints;
  probabilities[label]++;

  arma::vec delta = point - means.col(label);
  means.col(label) += delta / probabilities[label];

  if (probabilities[label] > 2)
    variances.col(label) *= (probabilities[label] - 2);
  variances.col(label) += (delta % (point - means.col(label)));
  if (probabilities[label] > 1)
    variances.col(label) /= probabilities[label] - 1;

  trainingPoints++;
  probabilities /= trainingPoints;
}
template<typename MatType>
template<typename VecType>
void NaiveBayesClassifier<MatType>::LogLikelihood(
    const VecType& point,
    arma::vec& logLikelihoods) const
{
  // Check that the number of features in the test data is the same as in the
  // training data.
  Log::Assert(point.n_rows == means.n_rows);

  logLikelihoods = arma::log(probabilities);
  arma::mat invVar = 1.0 / variances;

  // Calculate the joint log likelihood of the point for each of the
  // means.n_cols classes.
  // Loop over every class.
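  // For each class i, the quantity added below is the Gaussian log-density
  // with diagonal covariance,
  //   log N(x; mu_i, diag(var_i)) = -(d / 2) log(2 * pi)
  //       - (1 / 2) log|diag(var_i)| - (1 / 2) (x - mu_i)' inv(diag(var_i)) (x - mu_i),
  // which is added to the log prior log(P(class i)) initialized above.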
  for (size_t i = 0; i < means.n_cols; i++)
  {
    // This is an adaptation of gmm::phi() for the case where the covariance
    // is a diagonal matrix.
    arma::vec diffs = point - means.col(i);
    arma::vec rhs = -0.5 * arma::diagmat(invVar.col(i)) * diffs;
    double exponent = arma::accu(diffs % rhs); // log(exp(value)) == value
    // Calculate point log likelihood as sum of logs to decrease floating
    // point errors.
    logLikelihoods(i) += (point.n_rows / -2.0 * log(2 * M_PI) - 0.5 *
        log(arma::det(arma::diagmat(variances.col(i)))) + exponent);
  }
}
template<typename MatType>
void NaiveBayesClassifier<MatType>::LogLikelihood(
    const MatType& data,
    arma::mat& logLikelihoods) const
{
  logLikelihoods.set_size(means.n_cols, data.n_cols);
  for (size_t i = 0; i < data.n_cols; ++i)
  {
    arma::vec v = logLikelihoods.unsafe_col(i);
    LogLikelihood(data.col(i), v);
  }
}
template<typename MatType>
template<typename VecType>
size_t NaiveBayesClassifier<MatType>::Classify(const VecType& point) const
{
  // Find the label (class) with the maximum log likelihood.
  arma::vec logLikelihoods;
  LogLikelihood(point, logLikelihoods);

  arma::uword maxIndex = 0;
  logLikelihoods.max(maxIndex);
  return maxIndex;
}
template<typename MatType>
template<typename VecType>
void NaiveBayesClassifier<MatType>::Classify(const VecType& point,
                                             size_t& prediction,
                                             arma::vec& probabilities) const
{
  // log(Prob(Y|X)) = Log(Prob(X|Y)) + Log(Prob(Y)) - Log(Prob(X));
  // But LogLikelihood() gives us the unnormalized log likelihood, which is
  // Log(Prob(X|Y)) + Log(Prob(Y)), so we need to subtract the normalization
  // term.
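  // Note: summing exp(logLikelihoods) directly can underflow to zero when the
  // log likelihoods are strongly negative; a log-sum-exp formulation
  // (subtracting the maximum log likelihood before exponentiating) would be a
  // numerically safer way to compute Log(Prob(X)).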
  arma::vec logLikelihoods;
  LogLikelihood(point, logLikelihoods);

  const double logProbX = log(arma::accu(exp(logLikelihoods))); // Log(Prob(X)).
  logLikelihoods -= logProbX;

  arma::uword maxIndex = 0;
  logLikelihoods.max(maxIndex);
  prediction = (size_t) maxIndex;

  probabilities = exp(logLikelihoods); // exp(log(value)) == value.
}
template<typename MatType>
void NaiveBayesClassifier<MatType>::Classify(
    const MatType& data,
    arma::Row<size_t>& predictions) const
{
  predictions.set_size(data.n_cols);

  arma::mat logLikelihoods;
  LogLikelihood(data, logLikelihoods);

  for (size_t i = 0; i < data.n_cols; ++i)
  {
    arma::uword maxIndex = 0;
    logLikelihoods.unsafe_col(i).max(maxIndex);
    predictions[i] = maxIndex;
  }
}
template<typename MatType>
void NaiveBayesClassifier<MatType>::Classify(
    const MatType& data,
    arma::Row<size_t>& predictions,
    arma::mat& predictionProbs) const
{
  predictions.set_size(data.n_cols);

  arma::mat logLikelihoods;
  LogLikelihood(data, logLikelihoods);

  arma::vec logProbX(data.n_cols); // log(Prob(X)) for each point.
  for (size_t j = 0; j < data.n_cols; ++j)
  {
    logProbX(j) = log(arma::accu(exp(logLikelihoods.col(j))));
    logLikelihoods.col(j) -= logProbX(j);
  }
  predictionProbs = arma::exp(logLikelihoods);

  // Now calculate maximum probabilities for each point.
  for (size_t i = 0; i < data.n_cols; ++i)
  {
    arma::uword maxIndex;
    logLikelihoods.unsafe_col(i).max(maxIndex);
    predictions[i] = maxIndex;
  }
}
template<typename MatType>
template<typename Archive>
void NaiveBayesClassifier<MatType>::Serialize(Archive& ar,
                                              const unsigned int /* version */)
{
  ar & data::CreateNVP(means, "means");
  ar & data::CreateNVP(variances, "variances");
  ar & data::CreateNVP(probabilities, "probabilities");
}

} // namespace naive_bayes
} // namespace mlpack

#endif

/**
 * @file parallel_naive_bayes_classifier_impl.hpp
 * @author Parikshit Ram (pram@cc.gatech.edu)
 * @author Vahab Akbarzadeh (v.akbarzadeh@gmail.com)
 * @author Shihao Jing (shihao.jing810@gmail.com)
 *
 * A parallel Naive Bayes Classifier which parametrically estimates the
 * distribution of the features. This classifier makes its predictions based
 * on the assumption that the features have been sampled from a set of
 * Gaussians with diagonal covariance.
 *
 * mlpack is free software; you may redistribute it and/or modify it under the
 * terms of the 3-clause BSD license. You should have received a copy of the
 * 3-clause BSD license along with mlpack. If not, see
 * http://www.opensource.org/licenses/BSD-3-Clause for more information.
 */
#ifndef MLPACK_METHODS_NAIVE_BAYES_PARALLEL_NAIVE_BAYES_CLASSIFIER_IMPL_HPP
#define MLPACK_METHODS_NAIVE_BAYES_PARALLEL_NAIVE_BAYES_CLASSIFIER_IMPL_HPP

#include <mlpack/prereqs.hpp>

// In case it hasn't been included already.
#include "parallel_naive_bayes_classifier.hpp"

namespace mlpack {
namespace naive_bayes {
template<typename MatType>
ParallelNaiveBayesClassifier<MatType>::ParallelNaiveBayesClassifier(
    const MatType& data,
    const arma::Row<size_t>& labels,
    const size_t classes,
    const bool incremental) :
    trainingPoints(0) // Set when we call Train().
{
  const size_t dimensionality = data.n_rows;

  // Perform training, after initializing the model to 0 (that is, if Train()
  // won't do that for us, which it won't if we're using the incremental
  // algorithm).
  if (incremental)
  {
    probabilities.zeros(classes);
    means.zeros(dimensionality, classes);
    variances.zeros(dimensionality, classes);
  }
  else
  {
    probabilities.set_size(classes);
    means.set_size(dimensionality, classes);
    variances.set_size(dimensionality, classes);
  }

  Train(data, labels, incremental);
}
template<typename MatType>
ParallelNaiveBayesClassifier<MatType>::ParallelNaiveBayesClassifier(
    const size_t dimensionality,
    const size_t classes) :
    trainingPoints(0)
{
  // Initialize model to 0.
  probabilities.zeros(classes);
  means.zeros(dimensionality, classes);
  variances.zeros(dimensionality, classes);
}
template<typename MatType>
void ParallelNaiveBayesClassifier<MatType>::Train(
    const MatType& data,
    const arma::Row<size_t>& labels,
    const bool incremental)
{
  // Calculate the class probabilities as well as the sample mean and variance
  // for each of the features with respect to each of the labels.
  if (incremental)
  {
    // Use the incremental algorithm.
    // First, de-normalize probabilities.
    probabilities *= trainingPoints;

    for (size_t j = 0; j < data.n_cols; ++j)
    {
      const size_t label = labels[j];
      ++probabilities[label];

      arma::vec delta = data.col(j) - means.col(label);
      means.col(label) += delta / probabilities[label];
      variances.col(label) += delta % (data.col(j) - means.col(label));
    }

    for (size_t i = 0; i < probabilities.n_elem; ++i)
    {
      if (probabilities[i] > 2)
        variances.col(i) /= (probabilities[i] - 1);
    }
  }
  else
  {
    // Set all parameters to zero.
    probabilities.zeros();
    means.zeros();
    variances.zeros();

    // Don't use the incremental algorithm. This is a two-pass algorithm. It
    // is possible to calculate the means and variances using a faster
    // one-pass algorithm, but there are some precision and stability issues.
    // If this is too slow, it's an option to use the faster algorithm by
    // default and then have this (and the incremental algorithm) be other
    // options.

    // Parallel pass over the data to calculate the means: each thread owns a
    // distinct set of labels, so probabilities and means can be updated
    // without any synchronization. The tradeoff is one full pass over the
    // data per label.
    #pragma omp parallel for schedule(dynamic) shared(probabilities, means, data)
    for (size_t label = 0; label < probabilities.n_elem; ++label)
    {
      for (size_t i = 0; i < data.n_cols; ++i)
      {
        if (label == labels[i])
        {
          ++probabilities[label];
          means.col(label) += data.col(i);
        }
      }

      // Normalize means.
      if (probabilities[label] != 0.0)
        means.col(label) /= probabilities[label];
    }

    // Calculate variances.
    for (size_t j = 0; j < data.n_cols; ++j)
    {
      const size_t label = labels[j];
      variances.col(label) += square(data.col(j) - means.col(label));
    }

    // Normalize variances.
    for (size_t i = 0; i < probabilities.n_elem; ++i)
      if (probabilities[i] > 1)
        variances.col(i) /= (probabilities[i] - 1);
  }

  // Ensure that the variances are invertible.
  for (size_t i = 0; i < variances.n_elem; ++i)
    if (variances[i] == 0.0)
      variances[i] = 1e-50;

  probabilities /= data.n_cols;
  trainingPoints += data.n_cols;
}
template<typename MatType>
template<typename VecType>
void ParallelNaiveBayesClassifier<MatType>::Train(const VecType& point,
                                                  const size_t label)
{
  // We must use the incremental algorithm here.
  probabilities *= trainingPoints;
  probabilities[label]++;

  arma::vec delta = point - means.col(label);
  means.col(label) += delta / probabilities[label];

  if (probabilities[label] > 2)
    variances.col(label) *= (probabilities[label] - 2);
  variances.col(label) += (delta % (point - means.col(label)));
  if (probabilities[label] > 1)
    variances.col(label) /= probabilities[label] - 1;

  trainingPoints++;
  probabilities /= trainingPoints;
}
template<typename MatType>
template<typename VecType>
void ParallelNaiveBayesClassifier<MatType>::LogLikelihood(
    const VecType& point,
    arma::vec& logLikelihoods) const
{
  // Check that the number of features in the test data is the same as in the
  // training data.
  Log::Assert(point.n_rows == means.n_rows);

  logLikelihoods = arma::log(probabilities);
  arma::mat invVar = 1.0 / variances;

  // Calculate the joint log likelihood of the point for each of the
  // means.n_cols classes.
  // Loop over every class.
  for (size_t i = 0; i < means.n_cols; i++)
  {
    // This is an adaptation of gmm::phi() for the case where the covariance
    // is a diagonal matrix.
    arma::vec diffs = point - means.col(i);
    arma::vec rhs = -0.5 * arma::diagmat(invVar.col(i)) * diffs;
    double exponent = arma::accu(diffs % rhs); // log(exp(value)) == value
    // Calculate point log likelihood as sum of logs to decrease floating
    // point errors.
    logLikelihoods(i) += (point.n_rows / -2.0 * log(2 * M_PI) - 0.5 *
        log(arma::det(arma::diagmat(variances.col(i)))) + exponent);
  }
}
template<typename MatType>
void ParallelNaiveBayesClassifier<MatType>::LogLikelihood(
    const MatType& data,
    arma::mat& logLikelihoods) const
{
  logLikelihoods.set_size(means.n_cols, data.n_cols);
  for (size_t i = 0; i < data.n_cols; ++i)
  {
    arma::vec v = logLikelihoods.unsafe_col(i);
    LogLikelihood(data.col(i), v);
  }
}
template<typename MatType>
template<typename VecType>
size_t ParallelNaiveBayesClassifier<MatType>::Classify(const VecType& point) const
{
  // Find the label (class) with the maximum log likelihood.
  arma::vec logLikelihoods;
  LogLikelihood(point, logLikelihoods);

  arma::uword maxIndex = 0;
  logLikelihoods.max(maxIndex);
  return maxIndex;
}
template<typename MatType>
template<typename VecType>
void ParallelNaiveBayesClassifier<MatType>::Classify(const VecType& point,
                                                     size_t& prediction,
                                                     arma::vec& probabilities) const
{
  // log(Prob(Y|X)) = Log(Prob(X|Y)) + Log(Prob(Y)) - Log(Prob(X));
  // But LogLikelihood() gives us the unnormalized log likelihood, which is
  // Log(Prob(X|Y)) + Log(Prob(Y)), so we need to subtract the normalization
  // term.
  arma::vec logLikelihoods;
  LogLikelihood(point, logLikelihoods);

  const double logProbX = log(arma::accu(exp(logLikelihoods))); // Log(Prob(X)).
  logLikelihoods -= logProbX;

  arma::uword maxIndex = 0;
  logLikelihoods.max(maxIndex);
  prediction = (size_t) maxIndex;

  probabilities = exp(logLikelihoods); // exp(log(value)) == value.
}
template<typename MatType>
void ParallelNaiveBayesClassifier<MatType>::Classify(
    const MatType& data,
    arma::Row<size_t>& predictions) const
{
  predictions.set_size(data.n_cols);

  arma::mat logLikelihoods;
  LogLikelihood(data, logLikelihoods);

  #pragma omp parallel for
  for (size_t i = 0; i < data.n_cols; ++i)
  {
    arma::uword maxIndex = 0;
    logLikelihoods.unsafe_col(i).max(maxIndex);
    predictions[i] = maxIndex;
  }
}
template<typename MatType>
void ParallelNaiveBayesClassifier<MatType>::Classify(
    const MatType& data,
    arma::Row<size_t>& predictions,
    arma::mat& predictionProbs) const
{
  predictions.set_size(data.n_cols);

  arma::mat logLikelihoods;
  LogLikelihood(data, logLikelihoods);

  arma::vec logProbX(data.n_cols); // log(Prob(X)) for each point.
  for (size_t j = 0; j < data.n_cols; ++j)
  {
    logProbX(j) = log(arma::accu(exp(logLikelihoods.col(j))));
    logLikelihoods.col(j) -= logProbX(j);
  }
  predictionProbs = arma::exp(logLikelihoods);

  // Now calculate maximum probabilities for each point.
  for (size_t i = 0; i < data.n_cols; ++i)
  {
    arma::uword maxIndex;
    logLikelihoods.unsafe_col(i).max(maxIndex);
    predictions[i] = maxIndex;
  }
}
template<typename MatType>
template<typename Archive>
void ParallelNaiveBayesClassifier<MatType>::Serialize(Archive& ar,
                                                      const unsigned int /* version */)
{
  ar & data::CreateNVP(means, "means");
  ar & data::CreateNVP(variances, "variances");
  ar & data::CreateNVP(probabilities, "probabilities");
}

} // namespace naive_bayes
} // namespace mlpack

#endif
@shikharbhardwaj (Author):

The primary regions for potential parallelisation are Lines 110-115 and Lines 254-258.

For the first loop (in Train), simple SPMD would not work, as data from a label could then be altered by two threads simultaneously. Adding a critical section to this tight loop would essentially serialize it. We could instead split up the data according to the number of threads available and then sum up the partial results, as sketched below. In the absence of custom reductions, this requires a bit of multithreading-specific code.
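
A minimal sketch of that split-and-merge idea (a hypothetical helper, not part of the gist): each thread accumulates class counts and per-class feature sums into private buffers, and the partials are merged in a short critical section at the end. With OpenMP user-defined reductions, the merge could be expressed declaratively instead.

#include <omp.h>
#include <armadillo>

void AccumulateCountsAndSums(const arma::mat& data,
                             const arma::Row<size_t>& labels,
                             const size_t classes,
                             arma::vec& counts,
                             arma::mat& sums)
{
  counts.zeros(classes);
  sums.zeros(data.n_rows, classes);

  #pragma omp parallel
  {
    // Thread-local partial results; no locking needed while accumulating.
    arma::vec localCounts(classes, arma::fill::zeros);
    arma::mat localSums(data.n_rows, classes, arma::fill::zeros);

    #pragma omp for nowait
    for (size_t j = 0; j < data.n_cols; ++j)
    {
      ++localCounts[labels[j]];
      localSums.col(labels[j]) += data.col(j);
    }

    // Merge the partials; only this short section is serialized.
    #pragma omp critical
    {
      counts += localCounts;
      sums += localSums;
    }
  }
}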

@mentekid commented Jun 7, 2017:

Lines 110-115: I agree, this is write-heavy, and therefore will have either races or locks. I wonder if we could somehow do per-label parallelization. If each thread could work on its own set of distinct labels, that would remove all locking. I can't see any way to do that though without passing over the array at least once... which defeats the purpose.

Lines 254-258: This, however, is very simple. I think you could start with this - see if you get any speedup, and if it is worth spending more brainpower on Train().

@mentekid commented Jun 7, 2017:

Here's a thought for the first one. I am not sure it will carry over to C++, but I've played around a bit with it in MATLAB and it has a nice result.

Instead of iterating over the points, iterate over the labels. Then, probabilities[label] will be equal to the number of entries where labels == label. Also, means.col(label) will be equal to the sum of the columns of data for which labels == label.

In MATLAB code:

% labels - 1-by-N row vector
% data - d-by-N real matrix
% probabilities - 1-by-L row vector (L = number of distinct labels)
% means - d-by-L real matrix

for label = 1:L % Iterate over labels, not data.
    probabilities(label) = sum(labels == label);
    means(:, label) = sum(data(:, labels == label), 2); % "2" sums across columns; by default, sum works down each column in MATLAB.
end

In MATLAB this is much faster because of fewer iterations (which are costly in MATLAB). I doubt the vectorization by itself will give us any significant boost in C++. It may actually be slower because now we are doing one N-sized loop per label (because of labels == label), where before we only did 1 such loop. However, in this form, the loop is embarrassingly parallel.
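
(In C++, this per-label form is essentially what the #pragma omp parallel for loop over the labels in the parallel Train() above implements.)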

@shikharbhardwaj (Author):

Yes, in this form the loop can be easily parallelized, but the algorithmic complexity of this method increases: the sequential version finishes in O(nd), whereas this would take time around O(Lnd/t) with t threads. I'll implement and test this right now.
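
(For a binary problem like MiniBooNE, L = 2, so with t = 4 threads O(Lnd/t) = O(nd/2), and the label-wise form should still come out ahead, provided the threads are actually used.)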

@shikharbhardwaj (Author):

I made the changes in both places and ran the classifier on the MiniBooNE dataset benchmark written earlier. It doesn't use more than 1 thread, and I can't figure out why. I exported OMP_NUM_THREADS=4 before executing the program. Here are the runs from the current NBC and the version above:

λ nbc/MiniBooNE ∴ perf stat ./parallel -v
[INFO ] Loading './data/MiniBooNE_PID.txt' as raw ASCII formatted data. Size is 50 x 130064.
Precision : 0.289867
Recall : 1
[INFO ]
[INFO ] Execution parameters:
[INFO ] help: 0
[INFO ] info:
[INFO ] verbose: 1
[INFO ] version: 0
[INFO ] Program timers:
[INFO ] CV-partition: 0.001710s
[INFO ] Classify: 0.031765s
[INFO ] Train: 0.013651s
[INFO ] loading_data: 2.725474s
[INFO ] total_time: 2.773458s

Performance counter stats for './parallel -v':

   3054.002563      task-clock (msec)         #    1.099 CPUs utilized
            48      context-switches          #    0.016 K/sec
             4      cpu-migrations            #    0.001 K/sec
         2,370      page-faults               #    0.776 K/sec

11,01,85,23,328      cycles                    #    3.608 GHz
25,71,23,01,197      instructions              #    2.33  insn per cycle
 6,00,74,52,265      branches                  # 1967.075 M/sec
    1,43,34,142      branch-misses             #    0.24% of all branches

   2.779093968 seconds time elapsed

λ nbc/MiniBooNE ∴ perf stat ./serial -v
[INFO ] Loading './data/MiniBooNE_PID.txt' as raw ASCII formatted data. Size is 50 x 130064.
Precision : 0.280485
Recall : 0.999285
[INFO ]
[INFO ] Execution parameters:
[INFO ] help: 0
[INFO ] info:
[INFO ] verbose: 1
[INFO ] version: 0
[INFO ] Program timers:
[INFO ] CV-partition: 0.001492s
[INFO ] Classify: 0.030842s
[INFO ] Train: 0.008971s
[INFO ] loading_data: 2.721527s
[INFO ] total_time: 2.763616s

Performance counter stats for './serial -v':

   2975.846853      task-clock (msec)         #    1.075 CPUs utilized
            50      context-switches          #    0.017 K/sec
             2      cpu-migrations            #    0.001 K/sec
         3,385      page-faults               #    0.001 M/sec

10,89,25,83,327      cycles                    #    3.660 GHz
25,82,14,31,973      instructions              #    2.37  insn per cycle
 6,02,90,69,939      branches                  # 2026.001 M/sec
    1,43,14,804      branch-misses             #    0.24% of all branches

   2.767538280 seconds time elapsed
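
One thing worth ruling out here (a hypothetical diagnostic, not from the benchmark): if the binary was not compiled with OpenMP enabled (e.g. -fopenmp for GCC/Clang), the pragmas are silently ignored and OMP_NUM_THREADS has no effect. A quick check:

#include <cstdio>
#ifdef _OPENMP
#include <omp.h>
#endif

int main()
{
#ifdef _OPENMP
  // _OPENMP is only defined when the compiler actually enables OpenMP.
  std::printf("OpenMP enabled, max threads = %d\n", omp_get_max_threads());
#else
  std::printf("OpenMP not enabled; all omp pragmas are ignored.\n");
#endif
  return 0;
}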
