This script demonstrates the importance of properly treating data uncertainties when performing linear regression for statistical inference (i.e. how strongly y depends on x).
"""Demonstrate how to calculate various linear regression estimates. | |
Copyright (C) 2019 Mikko Pitkanen | |
This program is free software: you can redistribute it and/or modify | |
it under the terms of the GNU General Public License as published by | |
the Free Software Foundation, either version 3 of the License, or | |
(at your option) any later version. | |
This program is distributed in the hope that it will be useful, | |
but WITHOUT ANY WARRANTY; without even the implied warranty of | |
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
GNU General Public License for more details. | |
You should have received a copy of the GNU General Public License | |
along with this program. If not, see <http://www.gnu.org/licenses/>. | |
""" | |
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pystan
import scipy.odr
import statsmodels.formula.api as smf
from sklearn.decomposition import PCA
def bayes_ols_fit(xi, yi):
    """Perform non-weighted OLS regression using Bayesian inference.

    This model does not take into account the uncertainties in the data. For
    a model that properly takes the uncertainties into account see, for
    example, the Stan manual:
    https://mc-stan.org/docs/2_19/stan-users-guide/bayesian-measurement-error-model.html
    (valid on 2019-05-18)
    """
    # define the Stan model
    model = """
    data {
        int<lower=0> N;       // number of cases
        vector[N] x;          // predictor (covariate)
        vector[N] y;          // outcome (variate)
        real<lower=0> sigma;  // outcome noise
    }
    parameters {
        real intercept;
        real slope;
    }
    model {
        y ~ normal(intercept + slope * x, sigma);
    }
    """
    # formalize the input data
    data = {
        'N': len(xi),
        'x': xi,
        'y': yi,
        'sigma': np.std(yi)}
    sm_OLS = pystan.StanModel(model_code=model)
    # make an OLS fit to get initial guesses for the slope and intercept
    slope_ols, intercept_ols = np.polyfit(xi, yi, 1)
    fit = sm_OLS.sampling(
        data=data,
        iter=1000,
        chains=4,
        init=lambda: {
            'slope': slope_ols,
            'intercept': intercept_ols},
        algorithm="NUTS",
        n_jobs=4)
    # find the index of the maximum a posteriori (MAP) sample
    samples = fit.extract(permuted=True)
    MAPindex = np.argmax(samples['lp__'])
    # use the MAP values for the slope and intercept
    slope = samples['slope'][MAPindex]
    intercept = samples['intercept'][MAPindex]
    return slope, intercept
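

# The docstring above points to the Stan measurement-error model. The string
# below is a minimal sketch of that variant (an illustration based on the
# referenced manual page, not used by bayes_ols_fit): the observed x values
# are treated as noisy measurements of latent true values x_true, so the
# x uncertainty propagates into the slope and intercept posteriors.
MEASUREMENT_ERROR_MODEL_SKETCH = """
data {
    int<lower=0> N;          // number of cases
    vector[N] x_obs;         // measured x values
    vector[N] y;             // outcome (variate)
    real<lower=0> tau;       // measurement noise in x
    real<lower=0> sigma;     // outcome noise
}
parameters {
    vector[N] x_true;        // latent true x values
    real intercept;
    real slope;
}
model {
    x_obs ~ normal(x_true, tau);
    y ~ normal(intercept + slope * x_true, sigma);
}
"""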
def bivariate_fit(xi, yi, dxi, dyi, ri=0.0, b0=1.0, maxIter=1e6):
    """Perform bivariate regression after York et al. (2004).

    This is an implementation of the line fitting algorithm presented in:
    York, D. et al., Unified equations for the slope, intercept, and standard
    errors of the best straight line, American Journal of Physics, 2004, 72,
    3, 367-375, doi: 10.1119/1.1632486

    See especially Section III and Table I. The enumerated steps below are
    citations to Section III.

    Parameters:
        xi, yi      np.array, x and y values
        dxi, dyi    np.array, errors of the data points xi, yi
        ri          float, correlation coefficient of the weights
        b0          float, initial guess for the slope (currently unused;
                    the OLS slope is used as the initial value instead)
        maxIter     float, maximum allowed number of iterations; this is to
                    escape possible non-converging iteration loops

    Returns:
        b           slope estimate
        a           intercept estimate
        S           goodness-of-fit estimate
        cov         covariance matrix of the estimated slope and intercept
    """
    # (1) Choose an approximate initial value of b:
    # make an OLS fit to get the initial guess for the slope
    b, _ = np.polyfit(xi, yi, 1)
    # (2) Determine the weights wxi, wyi, for each point.
    wxi = 1.0 / dxi**2.0
    wyi = 1.0 / dyi**2.0
    alphai = (wxi * wyi)**0.5
    b_diff = 999.0
    # tolerance of the fit: when b changes by less than tol between two
    # consecutive iterations, the fit is considered converged
    tol = 1.0e-8
    # iterate until b changes by less than tol
    iIter = 1
    while (abs(b_diff) >= tol) and (iIter <= maxIter):
        b_prev = b
        # (3) Use these weights wxi, wyi to evaluate Wi for each point.
        Wi = (wxi * wyi) / (wxi + b**2.0 * wyi - 2.0*b*ri*alphai)
        # (4) Use the observed points (xi, yi) and Wi to calculate x_bar and
        # y_bar, from which Ui and Vi, and hence betai, can be evaluated for
        # each point.
        x_bar = np.sum(Wi * xi) / np.sum(Wi)
        y_bar = np.sum(Wi * yi) / np.sum(Wi)
        Ui = xi - x_bar
        Vi = yi - y_bar
        betai = Wi * (Ui / wyi + b*Vi / wxi - (b*Ui + Vi) * ri / alphai)
        # (5) Use Wi, Ui, Vi, and betai to calculate an improved estimate
        # of b.
        b = np.sum(Wi * betai * Vi) / np.sum(Wi * betai * Ui)
        # (6) Use the new b and repeat steps (3), (4), and (5) until
        # successive estimates of b agree within the desired tolerance tol.
        b_diff = b - b_prev
        iIter += 1
    # (7) From this final value of b, together with the final x_bar and
    # y_bar, calculate a from
    a = y_bar - b * x_bar
    # goodness of fit
    S = np.sum(Wi * (yi - b*xi - a)**2.0)
    # (8) For each point (xi, yi), calculate the adjusted values xi_adj.
    xi_adj = x_bar + betai
    # (9) Use xi_adj, together with Wi, to calculate xi_adj_bar and thence ui.
    xi_adj_bar = np.sum(Wi * xi_adj) / np.sum(Wi)
    ui = xi_adj - xi_adj_bar
    # (10) From Wi, xi_adj_bar and ui, calculate sigma_b, and then sigma_a
    # (the standard uncertainties of the fitted parameters).
    sigma_b = np.sqrt(1.0 / np.sum(Wi * ui**2))
    sigma_a = np.sqrt(1.0 / np.sum(Wi) + xi_adj_bar**2 * sigma_b**2)
    # calculate the covariance of the slope and intercept (York et al.,
    # Section II)
    cov = -xi_adj_bar * sigma_b**2
    # [[var(b), cov], [cov, var(a)]]
    cov_matrix = np.array(
        [[sigma_b**2, cov], [cov, sigma_a**2]])
    if abs(b_diff) < tol:
        return b, a, S, cov_matrix
    else:
        print("bivariate_fit() exceeded the maximum number of iterations, "
              "maxIter = {:}".format(maxIter))
        return np.nan, np.nan, np.nan, np.nan
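

# A hypothetical usage sketch (not called anywhere in this script): the
# standard errors of the York slope and intercept are the square roots of
# the diagonal of the returned covariance matrix.
def _bivariate_fit_example(xi, yi, dxi, dyi):
    """Sketch: run the York fit and unpack the parameter uncertainties."""
    b, a, S, cov = bivariate_fit(xi, yi, dxi, dyi)
    sigma_b = np.sqrt(cov[0, 0])  # standard error of the slope
    sigma_a = np.sqrt(cov[1, 1])  # standard error of the intercept
    return b, sigma_b, a, sigma_a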
def deming_fit(xi, yi):
    """Perform Deming regression.

    The nomenclature follows:
    Francq, Bernard G., and Bernadette B. Govaerts. 2014. "Measurement
    Methods Comparison with Errors-in-Variables Regressions. From Horizontal
    to Vertical OLS Regression, Review and New Perspectives." Chemometrics
    and Intelligent Laboratory Systems. Elsevier.
    doi: 10.1016/j.chemolab.2014.03.006

    Parameters:
        xi, yi      np.array, x and y values

    Returns:
        slope       regression slope estimate
        intercept   regression intercept estimate
    """
    Sxx = np.sum((xi - np.mean(xi)) ** 2)
    Syy = np.sum((yi - np.mean(yi)) ** 2)
    Sxy = np.sum((xi - np.mean(xi)) * (yi - np.mean(yi)))
    # ratio of the y and x error variances, here estimated from the sample
    lambda_xy = (np.var(yi) / np.size(yi)) / (np.var(xi) / np.size(xi))
    slope = (Syy - lambda_xy * Sxx +
             np.sqrt((Syy - lambda_xy * Sxx) ** 2 +
                     4 * lambda_xy * (Sxy ** 2))) / (2 * Sxy)
    intercept = np.mean(yi) - slope * np.mean(xi)
    return slope, intercept
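

# deming_fit above estimates lambda_xy from the sample variances of the data.
# When the measurement error variances are known, the conventional choice is
# their ratio; the sketch below (hypothetical, not called by this script)
# shows that variant with the same slope formula.
def _deming_fit_known_errors(xi, yi, var_x_err, var_y_err):
    """Sketch: Deming regression with a known error-variance ratio."""
    Sxx = np.sum((xi - np.mean(xi)) ** 2)
    Syy = np.sum((yi - np.mean(yi)) ** 2)
    Sxy = np.sum((xi - np.mean(xi)) * (yi - np.mean(yi)))
    lam = var_y_err / var_x_err  # known ratio instead of a sample estimate
    slope = (Syy - lam * Sxx +
             np.sqrt((Syy - lam * Sxx) ** 2 +
                     4 * lam * Sxy ** 2)) / (2 * Sxy)
    intercept = np.mean(yi) - slope * np.mean(xi)
    return slope, intercept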
def odr_fit(xi, yi, dxi, dyi):
    """Perform weighted orthogonal distance regression.

    https://docs.scipy.org/doc/scipy/reference/odr.html (valid on 2019-04-16)

    Parameters:
        xi, yi      np.array, x and y values
        dxi, dyi    np.array, x and y errors

    Returns:
        slope       regression slope estimate
        intercept   regression intercept estimate
    """
    def f(B, x):
        """Define the linear function y = B[0] * x + B[1] for ODR.

        Parameters:
            B   [slope, intercept]
            x   x values
        """
        return B[0] * x + B[1]
    # define the model for ODR
    linear = scipy.odr.Model(f)
    # formalize the data
    data = scipy.odr.RealData(
        xi,
        yi,
        sx=dxi,
        sy=dyi)
    # make an OLS fit to get initial guesses for the slope and intercept
    slope_ols, intercept_ols = np.polyfit(xi, yi, 1)
    # instantiate ODR with the data, model and initial parameter estimate;
    # the OLS regression coefficients serve as the initial guess
    odr = scipy.odr.ODR(
        data,
        linear,
        beta0=[slope_ols, intercept_ols])
    # run the fit
    output = odr.run()
    slope, intercept = output.beta
    return slope, intercept
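

# scipy.odr also reports parameter standard errors alongside the estimates;
# the sketch below (hypothetical, not called by this script) shows how they
# could be read off the fit output via output.sd_beta.
def _odr_fit_with_uncertainties(xi, yi, dxi, dyi):
    """Sketch: weighted ODR returning the parameter standard errors too."""
    linear = scipy.odr.Model(lambda B, x: B[0] * x + B[1])
    data = scipy.odr.RealData(xi, yi, sx=dxi, sy=dyi)
    slope_ols, intercept_ols = np.polyfit(xi, yi, 1)
    output = scipy.odr.ODR(data, linear,
                           beta0=[slope_ols, intercept_ols]).run()
    slope, intercept = output.beta
    slope_err, intercept_err = output.sd_beta  # standard errors
    return slope, slope_err, intercept, intercept_err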
def pca_fit(xi, yi):
    """Estimate a principal component regression fit to the xi, yi data.

    See e.g. https://shankarmsy.github.io/posts/pca-vs-lr.html (valid on
    2019-04-17)

    Parameters:
        xi, yi      x and y data points

    Returns:
        slope       slope estimate, y = intercept + slope * x
        intercept   y-intercept estimate

    Example:
        slope, intercept = pca_fit(xi, yi)
    """
    xy = np.array([xi, yi]).T
    pca = PCA(n_components=1)
    # project the data onto the first principal axis and back; the
    # reconstructed points lie exactly on the fitted line
    xy_pca = pca.fit_transform(xy)
    xy_n = pca.inverse_transform(xy_pca)
    # recover the slope and intercept from two reconstructed points
    slope = (xy_n[0, 1] - xy_n[1, 1]) / (xy_n[0, 0] - xy_n[1, 0])
    intercept = xy_n[0, 1] - slope * xy_n[0, 0]
    return slope, intercept
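

# The slope can also be read directly from the first principal component
# instead of back-transforming two reconstructed points; a minimal sketch
# (hypothetical, not called by this script). The fitted line passes through
# the data mean, which gives the intercept.
def _pca_fit_from_components(xi, yi):
    """Sketch: slope from the first principal axis of the (x, y) cloud."""
    pca = PCA(n_components=1).fit(np.column_stack([xi, yi]))
    vx, vy = pca.components_[0]  # direction of maximum variance
    slope = vy / vx
    intercept = np.mean(yi) - slope * np.mean(xi)
    return slope, intercept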
def quantile_fit(xi, yi, q=0.5):
    """Perform quantile regression.

    See for instance:
    https://www.statsmodels.org/dev/examples/notebooks/generated/quantile_regression.html
    (valid on 2019-04-16)

    Parameters:
        xi, yi      np.array, x and y values
        q           float, the quantile to fit (0.5 = median)

    Returns:
        slope       regression slope estimate
        intercept   regression intercept estimate
        cov         covariance matrix of the estimated parameters
    """
    df = pd.DataFrame.from_dict(data={'xi': xi, 'yi': yi})
    mod = smf.quantreg('yi ~ xi', df)
    res = mod.fit(q=q)
    return res.params['xi'], res.params['Intercept'], res.cov_params().values
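

# Quantile regression fits one conditional quantile at a time; the sketch
# below (hypothetical, not called by this script) collects the slopes for
# several quantiles to show how the x-y relationship varies across the
# conditional distribution of y.
def _quantile_fit_sweep(xi, yi, quantiles=(0.1, 0.5, 0.9)):
    """Sketch: fit several conditional quantiles and collect the slopes."""
    return {q: quantile_fit(xi, yi, q=q)[0] for q in quantiles}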
if __name__ == "__main__": | |
"""Demonstrate statistical inference using linear regression line fitting. | |
Purpose: make linear regression with different estimators on x and y data | |
with uncertainties. The correct linear model is y = 1.5x - 2.0. Try and | |
answer: | |
- which of the estimators is the best and why? | |
- which of the methods consider x and y uncertainty and which ones don't? | |
Also, try to replace x, y, dx and dy with your own data. | |
Please note, that some methods, like OLS, have limitations that make it | |
unsuitable/not optimal for this particular task! | |
The script has been tested in a conda environment (see | |
https://www.anaconda.com/distribution/#download-section for more info on | |
that): | |
conda create --name ito python=3.7 numpy==1.16.2 matplotlib==3.0.3 pandas==0.24.2 ipython==7.4.0 pystan==2.17.1.0 scipy==1.2.1 scikit-learn==0.20.3 statsmodels==0.9.0 | |
On linux you can run this from the command line by calling: | |
python regression_estimators.py | |
You can run this in Ipython by calling: | |
run regression_estimators.py | |
""" | |
# define absolute and relative standard uncertainties for x and y data sets | |
sigma_x_abs = 0.1 | |
sigma_x_rel = 0.05 | |
sigma_y_abs = 0.1 | |
sigma_y_rel = 0.05 | |
# define the real regression parameters | |
slope_true=1.5 | |
intercept_true=-2 | |
# define test data set points. These were generated with | |
# 1. create normally distributed and independent random x values | |
# 2. y = slope_true * x + intercept_true | |
# 3. add normally distributed and independent random noise to x and y | |
    x = np.array([
        7.02490389, 5.84673882, 6.22362901, 5.89447501, 6.50522957,
        5.80298616, 6.17497626, 6.57451761, 6.47010046, 6.04077582,
        5.90118102, 6.77270208, 6.43396724, 6.41136767, 6.12493598,
        5.90716534, 6.32037763, 7.39491283, 6.36049059, 5.9670787 ,
        6.85141919, 6.26910599, 6.20254179, 6.9836126 , 6.63848388,
        6.21000692, 6.23215349, 6.2068118 , 6.39700798, 5.68460809,
        6.0957604 , 5.93433827, 6.92329796, 6.87485541, 6.64441035,
        6.5876272 , 6.21395565, 6.97018765, 5.8405509 , 6.68689768,
        6.55696236, 5.91300654, 5.77200607, 6.18620691, 6.46252992,
        5.84408498, 5.72175502, 6.28586177, 6.1426537 , 5.97624839,
        7.2909262 , 6.26629957, 6.35857082, 6.00486819, 5.96392117,
        6.79158893, 6.88007737, 5.79147038, 6.32788946, 5.89282374,
        5.246736  , 6.79574812, 6.57403906, 6.14307375, 7.00910025,
        5.7563269 , 6.351342  , 6.53075042, 5.71545834, 6.30847149,
        7.02490349, 6.40364356, 6.16509938, 6.4619477 , 6.70890128,
        6.51323415, 6.99526207, 5.98790113, 5.92062987, 6.07047262,
        7.05354862, 5.71384054, 6.60230794, 7.0169052 , 6.36480226,
        6.31785604, 5.61791288, 6.85937139, 5.75865116, 5.72959174,
        5.90952266, 6.42005849, 6.93056586, 6.01429019, 6.9796715 ,
        6.94304459, 6.75550702, 5.66799426, 6.98226771, 6.04554234])
    y = np.array([
        8.40544072, 7.23576875, 7.36491546, 6.0339046 , 7.97447602,
        7.15119055, 6.54041287, 7.17955333, 7.61995614, 7.45436687,
        6.51994675, 6.86866468, 7.93039991, 7.96141096, 6.74008807,
        6.24162408, 7.56592469, 8.90243085, 7.93080636, 7.69373893,
        8.1495254 , 7.31618462, 7.38623682, 8.27756635, 7.26490068,
        7.62419581, 8.09272363, 6.83289432, 7.00903454, 7.32198232,
        7.76544704, 7.86794507, 7.34049199, 7.16680021, 7.28097398,
        7.1300533 , 7.56470235, 8.53067913, 6.5722756 , 8.35793814,
        7.85134993, 6.28578289, 6.78504232, 7.46187614, 7.63509705,
        7.14787352, 7.76011323, 7.73277699, 6.61017633, 7.04707694,
        8.11976918, 7.57491045, 7.6502606 , 7.81891365, 7.5169907 ,
        7.10958076, 8.10664908, 6.41070742, 7.42201405, 7.1440822 ,
        6.71524939, 8.29542569, 7.40644049, 6.88359516, 8.10013957,
        6.17323241, 6.89164089, 8.18856187, 6.43704836, 7.1734189 ,
        7.33072932, 8.21214643, 7.73751715, 7.73084165, 8.5996884 ,
        8.08276146, 7.83624525, 7.24484867, 6.62742944, 5.95489133,
        8.05221471, 6.09695074, 8.934238  , 8.4620742 , 7.03271364,
        6.62512029, 7.76597935, 7.76624445, 6.84164444, 7.15060009,
        7.05616176, 7.62173155, 8.63441307, 6.77385575, 7.61571327,
        7.87055929, 8.07943385, 6.48806751, 7.88899205, 7.61359413])
    # estimate the total standard uncertainty of each data point
    dx = np.sqrt((x * sigma_x_rel)**2 + sigma_x_abs**2)
    dy = np.sqrt((y * sigma_y_rel)**2 + sigma_y_abs**2)
    # calculate the mean total relative uncertainty [%]
    u_x = np.mean(dx / x) * 100
    u_y = np.mean(dy / y) * 100
    # collect {label: (intercept, slope, S, cov)} for each estimator
    label = 'Truth: y={:1.2f}x{:+1.2f}'.format(slope_true, intercept_true)
    parameters = {label: (intercept_true, slope_true, np.nan, np.nan)}
    # bivariate fit, York et al. 2004
    slope, intercept, S, cov = bivariate_fit(x, y, dx, dy)
    label = 'York, 2004: y={:1.2f}x{:+1.2f}'.format(slope, intercept)
    parameters.update({label: (intercept, slope, S, cov)})
    # OLS
    slope, intercept = np.polyfit(x, y, 1)
    S = np.nan
    cov = np.nan
    label = 'OLS: y={:1.2f}x{:+1.2f}'.format(slope, intercept)
    parameters.update({label: (intercept, slope, S, cov)})
    # Bayes OLS
    slope, intercept = bayes_ols_fit(x, y)
    S = np.nan
    cov = np.nan
    label = 'Bayes OLS: y={:1.2f}x{:+1.2f}'.format(slope, intercept)
    parameters.update({label: (intercept, slope, S, cov)})
    # Deming
    slope, intercept = deming_fit(x, y)
    S = np.nan
    cov = np.nan
    label = 'Deming: y={:1.2f}x{:+1.2f}'.format(slope, intercept)
    parameters.update({label: (intercept, slope, S, cov)})
    # ODR, weighted orthogonal distance regression
    slope, intercept = odr_fit(x, y, dx, dy)
    S = np.nan
    cov = np.nan
    label = 'ODR: y={:1.2f}x{:+1.2f}'.format(slope, intercept)
    parameters.update({label: (intercept, slope, S, cov)})
    # quantile (median) regression
    slope, intercept, cov = quantile_fit(x, y)
    S = np.nan
    label = 'Quantile: y={:1.2f}x{:+1.2f}'.format(slope, intercept)
    parameters.update({label: (intercept, slope, S, cov)})
    # PCA
    slope, intercept = pca_fit(x, y)
    S = np.nan
    cov = np.nan
    label = 'PCA: y={:1.2f}x{:+1.2f}'.format(slope, intercept)
    parameters.update({label: (intercept, slope, S, cov)})
    fig = plt.figure(figsize=(9, 9))
    ax = fig.add_subplot(111)
    # plot the data with error bars (shown for every fifth point)
    ax.errorbar(
        x, y,
        xerr=dx, yerr=dy,
        fmt='o',
        errorevery=5,
        linestyle='None',
        marker='.',
        ecolor='k',
        elinewidth=0.5,
        barsabove=True,
        label=None)
    xlim = np.array(ax.get_xlim())
    ylim = np.array(ax.get_ylim())
    # plot each regression line over the full x range
    for label, (intercept, slope, S, cov) in parameters.items():
        ax.plot(xlim, slope*xlim + intercept, '-', label=label)
    plt.suptitle(
        'ITO Homework Fig. 1, synthetic data\n'
        'The correct relationship is y={:1.2f}x{:+1.2f}'.format(
            slope_true, intercept_true),
        fontsize=16)
    ax.set_xlabel('x, total uncertainty {:1.0f}%'.format(u_x))
    ax.set_ylabel('y, total uncertainty {:1.0f}%'.format(u_y))
    ax.set_xlim(xlim)
    ax.set_ylim(ylim)
    ax.grid(True)
    ax.legend(loc='lower right')
    plt.show()