mkolod/two_pass_variance_vs_welfords_method.py

## two_pass_variance_vs_welfords_method.py
# The two-pass and Welford's method implementations were taken
# verbatim from Wikipedia:
#
# https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
#
# Note that in addition to the numerical stability benefits of Welford's
# algorithm, the other benefit is that it's an online algorithm, so
# variance update with additional data has an incremental cost,
# without the need to recompute on all the data.
#
# For a detailed discussion of Welford's algorithm, see
# B. P. Welford (1962). "Note on a method for calculating
# corrected sums of squares and products". Technometrics 4(3): 419-420.
#
# For numerical comparisons across different algorithms, see:
#
# 1) Ling, Robert F. (1974). Comparison of Several Algorithms for Computing
#    Sample Means and Variances. Journal of the American Statistical Association,
#    Vol. 69, No. 348, 859-866. doi:10.2307/2286154
#
# 2) Chan, Tony F.; Golub, Gene H.; LeVeque, Randall J. (1983).
#    Algorithms for Computing the Sample Variance: Analysis and Recommendations.
#    The American Statistician 37, 242-247.

from random import random, seed

seed(42)

def two_pass(data):
    n = sum1 = sum2 = 0

    for x in data:
        n += 1
        sum1 += x

    mean = sum1 / n

    for x in data:
        sum2 += (x - mean)*(x - mean)

    variance = sum2 / (n - 1)
    return variance


def welfords_method(data):
    n = 0
    mean = M2 = 0.0

    for x in data:
        n += 1
        delta = x - mean
        mean += delta/n
        delta2 = x - mean
        M2 += delta*delta2

    if n < 2:
        return float('nan')
    else:
        return M2 / (n - 1)

def compare(data, name):
    print("\n" + name)
    tp = two_pass(data)
    wm = welfords_method(data)
    print("two_pass: %f" % tp)
    print("welfords_method: %f" % wm)
    print("absolute difference: %f" % abs(tp - wm))


if __name__ == '__main__':
    num_el = 1000000
    small_range = [random() for x in range(num_el)]
    big_range = [small_range[x] + 1e12 for x in range(num_el)]
    compare(small_range, "small range")
    compare(big_range, "big range")
	# The two-pass and Welford's method implementations were taken
	# verbatim from Wikipedia:
	#
	# https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
	#
	# Note that in addition to the numerical stability benefits of Welford's
	# algorithm, the other benefit is that it's an online algorithm, so
	# variance update with additional data has an incremental cost,
	# without the need to recompute on all the data.
	#
	# For a detailed discussion of Welford's algorithm, see
	# B. P. Welford (1962). "Note on a method for calculating
	# corrected sums of squares and products". Technometrics 4(3): 419-420.
	#
	# For numerical comparisons across different algorithms, see:
	#
	# 1) Ling, Robert F. (1974). Comparison of Several Algorithms for Computing
	# Sample Means and Variances. Journal of the American Statistical Association,
	# Vol. 69, No. 348, 859-866. doi:10.2307/2286154
	#
	# 2) Chan, Tony F.; Golub, Gene H.; LeVeque, Randall J. (1983).
	# Algorithms for Computing the Sample Variance: Analysis and Recommendations.
	# The American Statistician 37, 242-247.

	from random import random, seed

	seed(42)

	def two_pass(data):
	n = sum1 = sum2 = 0

	for x in data:
	n += 1
	sum1 += x

	mean = sum1 / n

	for x in data:
	sum2 += (x - mean)*(x - mean)

	variance = sum2 / (n - 1)
	return variance


	def welfords_method(data):
	n = 0
	mean = M2 = 0.0

	for x in data:
	n += 1
	delta = x - mean
	mean += delta/n
	delta2 = x - mean
	M2 += delta*delta2

	if n < 2:
	return float('nan')
	else:
	return M2 / (n - 1)

	def compare(data, name):
	print("\n" + name)
	tp = two_pass(data)
	wm = welfords_method(data)
	print("two_pass: %f" % tp)
	print("welfords_method: %f" % wm)
	print("absolute difference: %f" % abs(tp - wm))


	if __name__ == '__main__':
	num_el = 1000000
	small_range = [random() for x in range(num_el)]
	big_range = [small_range[x] + 1e12 for x in range(num_el)]
	compare(small_range, "small range")
	compare(big_range, "big range")