@shikharbhardwaj
Last active June 9, 2017 15:26
Implementation of a parallel Naive Bayes classifier in mlpack
/**
* @file naive_bayes_classifier_impl.hpp
* @author Parikshit Ram (pram@cc.gatech.edu)
* @author Vahab Akbarzadeh (v.akbarzadeh@gmail.com)
* @author Shihao Jing (shihao.jing810@gmail.com)
*
* A Naive Bayes Classifier which parametrically estimates the distribution of
* the features. This classifier makes its predictions based on the assumption
* that the features have been sampled from a set of Gaussians with diagonal
* covariance.
*
* mlpack is free software; you may redistribute it and/or modify it under the
* terms of the 3-clause BSD license. You should have received a copy of the
* 3-clause BSD license along with mlpack. If not, see
* http://www.opensource.org/licenses/BSD-3-Clause for more information.
*/
#ifndef MLPACK_METHODS_NAIVE_BAYES_NAIVE_BAYES_CLASSIFIER_IMPL_HPP
#define MLPACK_METHODS_NAIVE_BAYES_NAIVE_BAYES_CLASSIFIER_IMPL_HPP
#include <mlpack/prereqs.hpp>
// In case it hasn't been included already.
#include "naive_bayes_classifier.hpp"
namespace mlpack {
namespace naive_bayes {
template<typename MatType>
NaiveBayesClassifier<MatType>::NaiveBayesClassifier(
const MatType& data,
const arma::Row<size_t>& labels,
const size_t classes,
const bool incremental) :
trainingPoints(0) // Set when we call Train().
{
const size_t dimensionality = data.n_rows;
// Perform training, after initializing the model to 0 if necessary: the
// batch version of Train() zeros the model itself, but the incremental
// algorithm does not, so we must do it here.
if (incremental)
{
probabilities.zeros(classes);
means.zeros(dimensionality, classes);
variances.zeros(dimensionality, classes);
}
else
{
probabilities.set_size(classes);
means.set_size(dimensionality, classes);
variances.set_size(dimensionality, classes);
}
Train(data, labels, incremental);
}
template<typename MatType>
NaiveBayesClassifier<MatType>::NaiveBayesClassifier(const size_t dimensionality,
const size_t classes) :
trainingPoints(0)
{
// Initialize model to 0.
probabilities.zeros(classes);
means.zeros(dimensionality, classes);
variances.zeros(dimensionality, classes);
}
template<typename MatType>
void NaiveBayesClassifier<MatType>::Train(const MatType& data,
const arma::Row<size_t>& labels,
const bool incremental)
{
// Calculate the class probabilities as well as the sample mean and variance
// for each of the features with respect to each of the labels.
if (incremental)
{
// Use incremental algorithm.
// First, de-normalize the probabilities back to per-class counts.
probabilities *= trainingPoints;
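// Each step of this loop is Welford's online update: the running mean is
// shifted by delta / n, and the sum of squared deviations is accumulated as
// delta * (x - updated mean), avoiding the catastrophic cancellation of the
// naive sum-of-squares formula.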
for (size_t j = 0; j < data.n_cols; ++j)
{
const size_t label = labels[j];
++probabilities[label];
arma::vec delta = data.col(j) - means.col(label);
means.col(label) += delta / probabilities[label];
variances.col(label) += delta % (data.col(j) - means.col(label));
}
for (size_t i = 0; i < probabilities.n_elem; ++i)
{
if (probabilities[i] > 2)
variances.col(i) /= (probabilities[i] - 1);
}
}
else
{
// Set all parameters to zero.
probabilities.zeros();
means.zeros();
variances.zeros();
// Don't use the incremental algorithm. This is a two-pass algorithm. It is
// possible to calculate the means and variances with a faster one-pass
// algorithm, but it has precision and stability issues. If this is too
// slow, the faster algorithm could become the default, with this and the
// incremental algorithm as options.
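// (The one-pass formula sum(x^2) / n - mean^2 can lose all significant
// digits when the variance is small relative to the mean, hence the two
// passes here.)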
// Calculate the means.
for (size_t j = 0; j < data.n_cols; ++j)
{
const size_t label = labels[j];
++probabilities[label];
means.col(label) += data.col(j);
}
// Normalize means.
for (size_t i = 0; i < probabilities.n_elem; ++i)
if (probabilities[i] != 0.0)
means.col(i) /= probabilities[i];
// Calculate variances.
for (size_t j = 0; j < data.n_cols; ++j)
{
const size_t label = labels[j];
variances.col(label) += square(data.col(j) - means.col(label));
}
// Normalize variances.
for (size_t i = 0; i < probabilities.n_elem; ++i)
if (probabilities[i] > 1)
variances.col(i) /= (probabilities[i] - 1);
}
// Ensure that the variances are invertible.
for (size_t i = 0; i < variances.n_elem; ++i)
if (variances[i] == 0.0)
variances[i] = 1e-50;
probabilities /= data.n_cols;
trainingPoints += data.n_cols;
}
template<typename MatType>
template<typename VecType>
void NaiveBayesClassifier<MatType>::Train(const VecType& point,
const size_t label)
{
// We must use the incremental algorithm here.
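// During this update, probabilities temporarily holds raw per-class counts.
// variances.col(label) is rescaled from a sample variance (normalized by
// n - 1) back to a sum of squared deviations, updated with the new point,
// and then re-normalized.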
probabilities *= trainingPoints;
probabilities[label]++;
arma::vec delta = point - means.col(label);
means.col(label) += delta / probabilities[label];
if (probabilities[label] > 2)
variances.col(label) *= (probabilities[label] - 2);
variances.col(label) += (delta % (point - means.col(label)));
if (probabilities[label] > 1)
variances.col(label) /= probabilities[label] - 1;
trainingPoints++;
probabilities /= trainingPoints;
}
template<typename MatType>
template<typename VecType>
void NaiveBayesClassifier<MatType>::LogLikelihood(
const VecType& point,
arma::vec& logLikelihoods) const
{
// Check that the number of features in the test data is the same as in the
// training data.
Log::Assert(point.n_rows == means.n_rows);
logLikelihoods = arma::log(probabilities);
arma::mat invVar = 1.0 / variances;
// Calculate the joint log likelihood of the point for each class (one
// column of means per class).
for (size_t i = 0; i < means.n_cols; i++)
{
// This is an adaptation of gmm::phi() for the case where the covariance is
// a diagonal matrix.
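// For a diagonal covariance the density factorizes, so
//   log N(x; mu, Sigma) = -(d/2) log(2 pi) - (1/2) sum_j log(sigma_j^2)
//                         - (1/2) sum_j (x_j - mu_j)^2 / sigma_j^2,
// which is what the three terms added below compute.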
arma::vec diffs = point - means.col(i);
arma::vec rhs = -0.5 * arma::diagmat(invVar.col(i)) * diffs;
double exponent = arma::accu(diffs % rhs); // log(exp(value)) == value
// Calculate point log likelihood as sum of logs to decrease floating point
// errors.
logLikelihoods(i) += (point.n_rows / -2.0 * log(2 * M_PI) - 0.5 *
log(arma::det(arma::diagmat(variances.col(i)))) + exponent);
}
}
template<typename MatType>
void NaiveBayesClassifier<MatType>::LogLikelihood(
const MatType& data,
arma::mat& logLikelihoods) const
{
logLikelihoods.set_size(means.n_cols, data.n_cols);
for (size_t i = 0; i < data.n_cols; ++i)
{
arma::vec v = logLikelihoods.unsafe_col(i);
LogLikelihood(data.col(i), v);
}
}
template<typename MatType>
template<typename VecType>
size_t NaiveBayesClassifier<MatType>::Classify(const VecType& point) const
{
// Find the label (class) with the maximum log likelihood.
arma::vec logLikelihoods;
LogLikelihood(point, logLikelihoods);
arma::uword maxIndex = 0;
logLikelihoods.max(maxIndex);
return maxIndex;
}
template<typename MatType>
template<typename VecType>
void NaiveBayesClassifier<MatType>::Classify(const VecType& point,
size_t& prediction,
arma::vec& probabilities) const
{
// log(Prob(Y|X)) = log(Prob(X|Y)) + log(Prob(Y)) - log(Prob(X)).
// LogLikelihood() gives us the unnormalized log likelihood, which is
// log(Prob(X|Y)) + log(Prob(Y)), so we need to subtract the normalization
// term log(Prob(X)).
arma::vec logLikelihoods;
LogLikelihood(point, logLikelihoods);
const double logProbX = log(arma::accu(exp(logLikelihoods))); // Log(Prob(X)).
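// Note: exp() of very negative log likelihoods can underflow to zero, making
// logProbX -inf. A more stable alternative is the log-sum-exp trick:
//   logProbX = m + log(arma::accu(exp(logLikelihoods - m))),
// where m is the maximum log likelihood.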
logLikelihoods -= logProbX;
arma::uword maxIndex = 0;
logLikelihoods.max(maxIndex);
prediction = (size_t) maxIndex;
probabilities = exp(logLikelihoods); // Convert log probabilities back to probabilities.
}
template<typename MatType>
void NaiveBayesClassifier<MatType>::Classify(
const MatType& data,
arma::Row<size_t>& predictions) const
{
predictions.set_size(data.n_cols);
arma::mat logLikelihoods;
LogLikelihood(data, logLikelihoods);
for (size_t i = 0; i < data.n_cols; ++i)
{
arma::uword maxIndex = 0;
logLikelihoods.unsafe_col(i).max(maxIndex);
predictions[i] = maxIndex;
}
}
template<typename MatType>
void NaiveBayesClassifier<MatType>::Classify(
const MatType& data,
arma::Row<size_t>& predictions,
arma::mat& predictionProbs) const
{
predictions.set_size(data.n_cols);
arma::mat logLikelihoods;
LogLikelihood(data, logLikelihoods);
arma::vec logProbX(data.n_cols); // log(Prob(X)) for each point.
for (size_t j = 0; j < data.n_cols; ++j)
{
logProbX(j) = log(arma::accu(exp(logLikelihoods.col(j))));
logLikelihoods.col(j) -= logProbX(j);
}
predictionProbs = arma::exp(logLikelihoods);
// Now calculate maximum probabilities for each point.
for (size_t i = 0; i < data.n_cols; ++i)
{
arma::uword maxIndex;
logLikelihoods.unsafe_col(i).max(maxIndex);
predictions[i] = maxIndex;
}
}
template<typename MatType>
template<typename Archive>
void NaiveBayesClassifier<MatType>::Serialize(Archive& ar,
const unsigned int /* version */)
{
ar & data::CreateNVP(means, "means");
ar & data::CreateNVP(variances, "variances");
ar & data::CreateNVP(probabilities, "probabilities");
}
} // namespace naive_bayes
} // namespace mlpack
#endif
/**
* @file parallel_naive_bayes_classifier_impl.hpp
* @author Parikshit Ram (pram@cc.gatech.edu)
* @author Vahab Akbarzadeh (v.akbarzadeh@gmail.com)
* @author Shihao Jing (shihao.jing810@gmail.com)
*
* A parallel (OpenMP-based) Naive Bayes Classifier which parametrically
* estimates the distribution of the features. This classifier makes its
* predictions based on the assumption that the features have been sampled
* from a set of Gaussians with diagonal covariance.
*
* mlpack is free software; you may redistribute it and/or modify it under the
* terms of the 3-clause BSD license. You should have received a copy of the
* 3-clause BSD license along with mlpack. If not, see
* http://www.opensource.org/licenses/BSD-3-Clause for more information.
*/
#ifndef MLPACK_METHODS_NAIVE_BAYES_PARALLEL_NAIVE_BAYES_CLASSIFIER_IMPL_HPP
#define MLPACK_METHODS_NAIVE_BAYES_PARALLEL_NAIVE_BAYES_CLASSIFIER_IMPL_HPP
#include <mlpack/prereqs.hpp>
// In case it hasn't been included already.
#include "parallel_naive_bayes_classifier.hpp"
namespace mlpack {
namespace naive_bayes {
template<typename MatType>
ParallelNaiveBayesClassifier<MatType>::ParallelNaiveBayesClassifier(
const MatType& data,
const arma::Row<size_t>& labels,
const size_t classes,
const bool incremental) :
trainingPoints(0) // Set when we call Train().
{
const size_t dimensionality = data.n_rows;
// Perform training, after initializing the model to 0 if necessary: the
// batch version of Train() zeros the model itself, but the incremental
// algorithm does not, so we must do it here.
if (incremental)
{
probabilities.zeros(classes);
means.zeros(dimensionality, classes);
variances.zeros(dimensionality, classes);
}
else
{
probabilities.set_size(classes);
means.set_size(dimensionality, classes);
variances.set_size(dimensionality, classes);
}
Train(data, labels, incremental);
}
template<typename MatType>
ParallelNaiveBayesClassifier<MatType>::ParallelNaiveBayesClassifier(const size_t dimensionality,
const size_t classes) :
trainingPoints(0)
{
// Initialize model to 0.
probabilities.zeros(classes);
means.zeros(dimensionality, classes);
variances.zeros(dimensionality, classes);
}
template<typename MatType>
void ParallelNaiveBayesClassifier<MatType>::Train(const MatType& data,
const arma::Row<size_t>& labels,
const bool incremental)
{
// Calculate the class probabilities as well as the sample mean and variance
// for each of the features with respect to each of the labels.
if (incremental)
{
// Use incremental algorithm.
// First, de-normalize the probabilities back to per-class counts.
probabilities *= trainingPoints;
for (size_t j = 0; j < data.n_cols; ++j)
{
const size_t label = labels[j];
++probabilities[label];
arma::vec delta = data.col(j) - means.col(label);
means.col(label) += delta / probabilities[label];
variances.col(label) += delta % (data.col(j) - means.col(label));
}
for (size_t i = 0; i < probabilities.n_elem; ++i)
{
if (probabilities[i] > 2)
variances.col(i) /= (probabilities[i] - 1);
}
}
else
{
// Set all parameters to zero.
probabilities.zeros();
means.zeros();
variances.zeros();
// Don't use the incremental algorithm. This is a two-pass algorithm. It is
// possible to calculate the means and variances with a faster one-pass
// algorithm, but it has precision and stability issues. If this is too
// slow, the faster algorithm could become the default, with this and the
// incremental algorithm as options.
// Parallel loop over the classes: accumulate the per-class counts and mean
// sums, then normalize the means per class.
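// Each thread owns whole classes, so there are no concurrent writes to
// probabilities[label] or means.col(label); the trade-off is that at most
// `classes` iterations exist to share, and every thread scans the entire
// dataset. An unsigned loop index (size_t) also requires OpenMP 3.0+.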
#pragma omp parallel for schedule(dynamic) shared(probabilities, means, data)
for (size_t label = 0; label < probabilities.n_elem; ++label)
{
for (size_t i = 0; i < data.n_cols; ++i)
{
if (label == labels[i])
{
++probabilities[label];
means.col(label) += data.col(i);
}
}
// Normalize means.
if (probabilities[label] != 0.0)
means.col(label) /= probabilities[label];
}
// Calculate variances.
for (size_t j = 0; j < data.n_cols; ++j)
{
const size_t label = labels[j];
variances.col(label) += square(data.col(j) - means.col(label));
}
// Normalize variances.
for (size_t i = 0; i < probabilities.n_elem; ++i)
if (probabilities[i] > 1)
variances.col(i) /= (probabilities[i] - 1);
}
// Ensure that the variances are invertible.
for (size_t i = 0; i < variances.n_elem; ++i)
if (variances[i] == 0.0)
variances[i] = 1e-50;
probabilities /= data.n_cols;
trainingPoints += data.n_cols;
}
template<typename MatType>
template<typename VecType>
void ParallelNaiveBayesClassifier<MatType>::Train(const VecType& point,
const size_t label)
{
// We must use the incremental algorithm here.
probabilities *= trainingPoints;
probabilities[label]++;
arma::vec delta = point - means.col(label);
means.col(label) += delta / probabilities[label];
if (probabilities[label] > 2)
variances.col(label) *= (probabilities[label] - 2);
variances.col(label) += (delta % (point - means.col(label)));
if (probabilities[label] > 1)
variances.col(label) /= probabilities[label] - 1;
trainingPoints++;
probabilities /= trainingPoints;
}
template<typename MatType>
template<typename VecType>
void ParallelNaiveBayesClassifier<MatType>::LogLikelihood(
const VecType& point,
arma::vec& logLikelihoods) const
{
// Check that the number of features in the test data is the same as in the
// training data.
Log::Assert(point.n_rows == means.n_rows);
logLikelihoods = arma::log(probabilities);
arma::mat invVar = 1.0 / variances;
// Calculate the joint log likelihood of the point for each class (one
// column of means per class).
for (size_t i = 0; i < means.n_cols; i++)
{
// This is an adaptation of gmm::phi() for the case where the covariance is
// a diagonal matrix.
arma::vec diffs = point - means.col(i);
arma::vec rhs = -0.5 * arma::diagmat(invVar.col(i)) * diffs;
double exponent = arma::accu(diffs % rhs); // log(exp(value)) == value
// Calculate point log likelihood as sum of logs to decrease floating point
// errors.
logLikelihoods(i) += (point.n_rows / -2.0 * log(2 * M_PI) - 0.5 *
log(arma::det(arma::diagmat(variances.col(i)))) + exponent);
}
}
template<typename MatType>
void ParallelNaiveBayesClassifier<MatType>::LogLikelihood(
const MatType& data,
arma::mat& logLikelihoods) const
{
logLikelihoods.set_size(means.n_cols, data.n_cols);
for (size_t i = 0; i < data.n_cols; ++i)
{
arma::vec v = logLikelihoods.unsafe_col(i);
LogLikelihood(data.col(i), v);
}
}
template<typename MatType>
template<typename VecType>
size_t ParallelNaiveBayesClassifier<MatType>::Classify(const VecType& point) const
{
// Find the label (class) with the maximum log likelihood.
arma::vec logLikelihoods;
LogLikelihood(point, logLikelihoods);
arma::uword maxIndex = 0;
logLikelihoods.max(maxIndex);
return maxIndex;
}
template<typename MatType>
template<typename VecType>
void ParallelNaiveBayesClassifier<MatType>::Classify(const VecType& point,
size_t& prediction,
arma::vec& probabilities) const
{
// log(Prob(Y|X)) = log(Prob(X|Y)) + log(Prob(Y)) - log(Prob(X)).
// LogLikelihood() gives us the unnormalized log likelihood, which is
// log(Prob(X|Y)) + log(Prob(Y)), so we need to subtract the normalization
// term log(Prob(X)).
arma::vec logLikelihoods;
LogLikelihood(point, logLikelihoods);
const double logProbX = log(arma::accu(exp(logLikelihoods))); // Log(Prob(X)).
logLikelihoods -= logProbX;
arma::uword maxIndex = 0;
logLikelihoods.max(maxIndex);
prediction = (size_t) maxIndex;
probabilities = exp(logLikelihoods); // Convert log probabilities back to probabilities.
}
template<typename MatType>
void ParallelNaiveBayesClassifier<MatType>::Classify(
const MatType& data,
arma::Row<size_t>& predictions) const
{
predictions.set_size(data.n_cols);
arma::mat logLikelihoods;
LogLikelihood(data, logLikelihoods);
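// Each iteration writes a distinct element of predictions and reads only its
// own column of logLikelihoods, so this loop parallelizes without locks.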
#pragma omp parallel for
for (size_t i = 0; i < data.n_cols; ++i)
{
arma::uword maxIndex = 0;
logLikelihoods.unsafe_col(i).max(maxIndex);
predictions[i] = maxIndex;
}
}
template<typename MatType>
void ParallelNaiveBayesClassifier<MatType>::Classify(
const MatType& data,
arma::Row<size_t>& predictions,
arma::mat& predictionProbs) const
{
predictions.set_size(data.n_cols);
arma::mat logLikelihoods;
LogLikelihood(data, logLikelihoods);
arma::vec logProbX(data.n_cols); // log(Prob(X)) for each point.
for (size_t j = 0; j < data.n_cols; ++j)
{
logProbX(j) = log(arma::accu(exp(logLikelihoods.col(j))));
logLikelihoods.col(j) -= logProbX(j);
}
predictionProbs = arma::exp(logLikelihoods);
// Now calculate maximum probabilities for each point.
for (size_t i = 0; i < data.n_cols; ++i)
{
arma::uword maxIndex;
logLikelihoods.unsafe_col(i).max(maxIndex);
predictions[i] = maxIndex;
}
}
template<typename MatType>
template<typename Archive>
void ParallelNaiveBayesClassifier<MatType>::Serialize(Archive& ar,
const unsigned int /* version */)
{
ar & data::CreateNVP(means, "means");
ar & data::CreateNVP(variances, "variances");
ar & data::CreateNVP(probabilities, "probabilities");
}
} // namespace naive_bayes
} // namespace mlpack
#endif
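
For reference, here is a minimal driver sketch (not part of the gist; the data is synthetic and everything outside the two headers above is an assumption) showing how the classifier is trained and queried on mlpack's column-major data layout, compiled with -fopenmp:

#include <iostream>
#include <mlpack/core.hpp>
#include "parallel_naive_bayes_classifier.hpp"

using namespace mlpack::naive_bayes;

int main()
{
  // Each column is one 5-dimensional point; labels are 0-based class indices.
  arma::mat data(5, 1000, arma::fill::randu);
  arma::Row<size_t> labels(1000);
  for (size_t i = 0; i < labels.n_elem; ++i)
    labels[i] = i % 2;

  // Train with the two-pass batch algorithm (incremental = false).
  ParallelNaiveBayesClassifier<arma::mat> nbc(data, labels, 2, false);

  // Classify the training points and print the first prediction.
  arma::Row<size_t> predictions;
  nbc.Classify(data, predictions);
  std::cout << "First prediction: " << predictions[0] << std::endl;
  return 0;
}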
@shikharbhardwaj (Author)
I made the changes in both places and ran the classifier on the MiniBooNE dataset benchmark written earlier. It doesn't use more than one thread, and I can't figure out why; I exported OMP_NUM_THREADS=4 before executing the program. Here are the runs from the current NBC and the version above:

λ nbc/MiniBooNE ∴ perf stat ./parallel -v
[INFO ] Loading './data/MiniBooNE_PID.txt' as raw ASCII formatted data. Size is 50x 130064.
Precision : 0.289867
Recall : 1
[INFO ]
[INFO ] Execution parameters:
[INFO ] help: 0
[INFO ] info:
[INFO ] verbose: 1
[INFO ] version: 0
[INFO ] Program timers:
[INFO ] CV-partition: 0.001710s
[INFO ] Classify: 0.031765s
[INFO ] Train: 0.013651s
[INFO ] loading_data: 2.725474s
[INFO ] total_time: 2.773458s

Performance counter stats for './parallel -v':

   3054.002563      task-clock (msec)         #    1.099 CPUs utilized
            48      context-switches          #    0.016 K/sec
             4      cpu-migrations            #    0.001 K/sec
         2,370      page-faults               #    0.776 K/sec

11,01,85,23,328      cycles                    #    3.608 GHz
25,71,23,01,197      instructions              #    2.33  insn per cycle
 6,00,74,52,265      branches                  # 1967.075 M/sec
    1,43,34,142      branch-misses             #    0.24% of all branches

   2.779093968 seconds time elapsed

λ nbc/MiniBooNE ∴ perf stat ./serial -v
[INFO ] Loading './data/MiniBooNE_PID.txt' as raw ASCII formatted data. Size is 50x 130064.
Precision : 0.280485
Recall : 0.999285
[INFO ]
[INFO ] Execution parameters:
[INFO ] help: 0
[INFO ] info:
[INFO ] verbose: 1
[INFO ] version: 0
[INFO ] Program timers:
[INFO ] CV-partition: 0.001492s
[INFO ] Classify: 0.030842s
[INFO ] Train: 0.008971s
[INFO ] loading_data: 2.721527s
[INFO ] total_time: 2.763616s

Performance counter stats for './serial -v':

   2975.846853      task-clock (msec)         #    1.075 CPUs utilized
            50      context-switches          #    0.017 K/sec
             2      cpu-migrations            #    0.001 K/sec
         3,385      page-faults               #    0.001 M/sec

10,89,25,83,327      cycles                    #    3.660 GHz
25,82,14,31,973      instructions              #    2.37  insn per cycle
 6,02,90,69,939      branches                  # 2026.001 M/sec
    1,43,14,804      branch-misses             #    0.24% of all branches

   2.767538280 seconds time elapsed
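
One thing worth checking: if the benchmark binary was built without -fopenmp, the compiler silently ignores the omp pragmas and OMP_NUM_THREADS has no effect, which would match the ~1.0 CPUs utilized above. A quick probe (a sketch, assuming GCC or Clang) confirms whether OpenMP is enabled and how many threads the runtime will hand out:

#include <cstdio>
#ifdef _OPENMP
#include <omp.h>
#endif

int main()
{
#ifdef _OPENMP
  // _OPENMP is only defined when the translation unit is compiled with
  // OpenMP support (e.g. -fopenmp).
  std::printf("OpenMP version macro: %d, max threads: %d\n",
              _OPENMP, omp_get_max_threads());
  #pragma omp parallel
  {
    #pragma omp single
    std::printf("threads in parallel region: %d\n", omp_get_num_threads());
  }
#else
  std::printf("compiled without OpenMP support\n");
#endif
  return 0;
}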
