Last active
November 2, 2017 09:14
-
-
Save mkolod/f15727a381c537a87f4368f4ecf39022 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# The two-pass and Welford's method implementations were taken | |
# verbatim from Wikipedia: | |
# | |
# https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance | |
# | |
# Note that in addition to the numerical stability benefits of Welford's | |
# algorithm, the other benefit is that it's an online algorithm, so | |
# variance update with additional data has an incremental cost, | |
# without the need to recompute on all the data. | |
# | |
# For a detailed discussion of Welford's algorithm, see | |
# B. P. Welford (1962). "Note on a method for calculating | |
# corrected sums of squares and products". Technometrics 4(3): 419-420. | |
# | |
# For numerical comparisons across different algorithms, see: | |
# | |
# 1) Ling, Robert F. (1974). Comparison of Several Algorithms for Computing | |
# Sample Means and Variances. Journal of the American Statistical Association, | |
# Vol. 69, No. 348, 859-866. doi:10.2307/2286154 | |
# | |
# 2) Chan, Tony F.; Golub, Gene H.; LeVeque, Randall J. (1983). | |
# Algorithms for Computing the Sample Variance: Analysis and Recommendations. | |
# The American Statistician 37, 242-247. | |
from random import random, seed | |
seed(42) | |
def two_pass(data): | |
n = sum1 = sum2 = 0 | |
for x in data: | |
n += 1 | |
sum1 += x | |
mean = sum1 / n | |
for x in data: | |
sum2 += (x - mean)*(x - mean) | |
variance = sum2 / (n - 1) | |
return variance | |
def welfords_method(data): | |
n = 0 | |
mean = M2 = 0.0 | |
for x in data: | |
n += 1 | |
delta = x - mean | |
mean += delta/n | |
delta2 = x - mean | |
M2 += delta*delta2 | |
if n < 2: | |
return float('nan') | |
else: | |
return M2 / (n - 1) | |
def compare(data, name): | |
print("\n" + name) | |
tp = two_pass(data) | |
wm = welfords_method(data) | |
print("two_pass: %f" % tp) | |
print("welfords_method: %f" % wm) | |
print("absolute difference: %f" % abs(tp - wm)) | |
if __name__ == '__main__': | |
num_el = 1000000 | |
small_range = [random() for x in range(num_el)] | |
big_range = [small_range[x] + 1e12 for x in range(num_el)] | |
compare(small_range, "small range") | |
compare(big_range, "big range") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Output:
small range
two_pass: 0.083229
welfords_method: 0.083229
absolute difference: 0.000000
big range
two_pass: 0.328626
welfords_method: 0.083236
absolute difference: 0.245390