Skip to content

Instantly share code, notes, and snippets.

@RhysU
Last active November 27, 2018 18:53
Show Gist options
  • Save RhysU/5196928 to your computer and use it in GitHub Desktop.
Save RhysU/5196928 to your computer and use it in GitHub Desktop.
In his article "Accurately computing running variance", John D. Cook presents a small class for computing a running mean and variance using an algorithm with favorable numerical properties reported by Knuth in TAOCP. Departing from Cook's sample, my rewrite below includes templating on the floating point type, permitting accumulating multiple st…
//--------------------------------------------------------------------------
//
// Copyright (C) 2012, 2013 Rhys Ulerich
// Please see http://pecos.ices.utexas.edu for more information on PECOS.
//
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/.
//
//--------------------------------------------------------------------------
#ifndef RUNNING_STATISTICS_HPP
#define RUNNING_STATISTICS_HPP
/** @file
* Provides statistical accumulators for special cases of interest.
* Though Boost Accumulators is generally preferred, some use cases can
* benefit from custom logic.
*/
#include <algorithm>
#include <cassert>
#include <cmath>
#include <limits>
// TODO Assertions in running_statistics complain loudly when N == 0
// TODO Permit serialization and reconstruction using a raw buffer
// TODO Provide MPI reduction operator using serialization capability
// TODO Defend against overflowing the counter, especially for MPI reduction
/**
* Accumulates running minimum, maximum, mean, and variance details from a data
* stream. Adapted from http://www.johndcook.com/standard_deviation.html.
* Extended to track a fixed number of distinct quantities with concurrently
* provided samples and to permit merging statistics from multiple instances.
* Storage overhead reduced relative to Cook's presentation.
*
* @tparam Real Floating point type used for input and accumulation
* @tparam N Number of distinct quantities to simultaneously track
*/
template <typename Real, std::size_t N>
class running_statistics
{
public:
/** Default constructor. */
running_statistics();
/** Provide quantity samples in locations <tt>x[0], ..., x[N-1]</tt>. */
running_statistics& operator()(const Real x[N]);
/** Incorporate running information from another instance. */
running_statistics& operator()(const running_statistics& o);
/** Obtain the running number of samples provided thus far. */
std::size_t count() const
{ return n_; }
/** Obtain the minimum sample observed for quantity \c i. */
Real min(std::size_t i) const
{ assert(i < N); return min_[i]; }
/** Obtain the maximum sample observed for quantity \c i. */
Real max(std::size_t i) const
{ assert(i < N); return max_[i]; }
/** Obtain the running mean for quantity \c i. */
Real avg(std::size_t i) const
{ assert(i < N); return M_[i]; }
/** Obtain the running sample variance for quantity \c i. */
Real var(std::size_t i) const
{ assert(i < N); return n_ == 1 ? 0 : S_[i] / (n_ - 1); }
/** Obtain the running sample standard deviation for quantity \c i. */
Real std(std::size_t i) const
{ using std::sqrt; return sqrt(var(i)); }
/** Reset the instance to its newly constructed state. */
void clear();
/** Number of quantities per <tt>operator()(const T*)</tt> invocation */
static const std::size_t static_size = N; // Compile time
/** Number of quantities per <tt>operator()(const T*)</tt> invocation */
std::size_t size() const { return static_size; } // Run time
private:
Real M_[N], S_[N], min_[N], max_[N];
std::size_t n_;
};
template <typename Real, std::size_t N>
running_statistics<Real,N>::running_statistics()
{
clear();
}
template <typename Real, std::size_t N>
running_statistics<Real,N>& running_statistics<Real,N>::operator()(
const Real x[N])
{
// Algorithm from Knuth TAOCP vol 2, 3rd edition, page 232.
// Knuth shows better behavior than Welford 1962 on test data.
using std::max;
using std::min;
++n_;
if (n_ > 1) { // Second and subsequent invocation
const Real inv_n = Real(1) / n_;
for (std::size_t i = 0; i < N; ++i) {
const Real d = x[i] - M_[i];
M_[i] += d * inv_n;
S_[i] += d * (x[i] - M_[i]);
min_[i] = min(min_[i], x[i]);
max_[i] = max(max_[i], x[i]);
}
} else { // First invocation requires special treatment
for (std::size_t i = 0; i < N; ++i) {
M_[i] = x[i];
S_[i] = 0;
min_[i] = x[i];
max_[i] = x[i];
}
}
return *this;
}
template <typename Real, std::size_t N>
running_statistics<Real,N>& running_statistics<Real,N>::operator()(
const running_statistics& o)
{
using std::max;
using std::min;
const std::size_t total = n_ + o.n_; // How many samples in combined data?
if (o.n_ == 0) return *this; // Other contains no data; run away
if (n_ == 0) return *this = o; // We contain no data; simply copy
assert(total >= 1); // Voila, degeneracy sidestepped
// Combine statistics from both instances into this
for (std::size_t i = 0; i < N; ++i) {
const Real dM = M_[i] - o.M_[i]; // Cancellation issues?
M_[i] = (n_ * avg(i) + o.n_ * o.avg(i)) / total;
S_[i] = (n_ == 1 ? 0 : S_[i])
+ (o.n_ == 1 ? 0 : o.S_[i])
+ ((dM * dM) * (n_ * o.n_)) / total;
min_[i] = min(min_[i], o.min_[i]);
max_[i] = max(max_[i], o.max_[i]);
}
n_ = total;
return *this;
}
template <typename Real, std::size_t N>
void running_statistics<Real,N>::clear()
{
using std::numeric_limits;
for (std::size_t i = 0; i < N; ++i)
M_[i] = S_[i] = min_[i] = max_[i] = numeric_limits<Real>::quiet_NaN();
n_ = 0;
}
#endif // RUNNING_STATISTICS_HPP
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment