@rehanguha
Created November 18, 2019 04:24
Orthogonal Distance Regression from Scratch
# Use python 3 consistent printing and unicode
from __future__ import print_function
from __future__ import unicode_literals
import inspect
import matplotlib
from matplotlib import pyplot
import numpy
import scipy
import scipy.odr, scipy.special, scipy.stats
# Decide if Monte Carlo CDF values are desired
Run_Monte_Carlo_CDF_Estimator = True
# The default is kept small so all the examples run in a few seconds.
# For serious work, this should be much larger (e.g. 1000 or more).
Number_of_MC_iterations = 77
# You can ignore x uncertainties if desired
x_uncertainties_ignored = False
## define some possible functions to be fitted
## You may, of course, define your own.
def Cauchy(p,x) :
    # Cauchy / Lorentzian / Breit-Wigner peak with constant background:
    B = p[0]        # Constant Background
    x1_peak = p[1]  # Height above/below background
    x1 = p[2]       # Central value
    width_x1 = p[3] # Half-Width at Half Maximum
    return ( B + x1_peak*(1/(1+((x-x1)/width_x1)**2)) )
def Gauss(p,x) :
    # A gaussian or Normal peak with constant background:
    B = p[0]        # Constant Background
    x1_peak = p[1]  # Height above/below background
    x1 = p[2]       # Central value
    width_x1 = p[3] # Standard Deviation
    return ( B + x1_peak*numpy.exp(-(x-x1)**2/(2*width_x1**2)) )
def Semicircle(p,x) :
    # The upper half of a circle:
    R = p[0]   # Radius of circle
    x0 = p[1]  # x coordinate of centre of circle
    y0 = p[2]  # y coordinate of centre of circle
    from numpy import array, sqrt
    y = []
    for i in range(len(x)) :
        y_squared = R**2-(x[i]-x0)**2
        if y_squared < 0 :
            y.append(y0)
        else :
            y.append(sqrt(y_squared)+y0)
    return array(y)
def Linear(p,x) :
    # A linear function with:
    #   Constant Background : p[0]
    #   Slope               : p[1]
    return p[0]+p[1]*x
def Quadratic(p,x) :
    # A quadratic function with:
    #   Constant Background : p[0]
    #   Slope               : p[1]
    #   Curvature           : p[2]
    return p[0]+p[1]*x+p[2]*x**2
def Cubic(p,x) :
    # A cubic function with:
    #   Constant Background   : p[0]
    #   Slope                 : p[1]
    #   Curvature             : p[2]
    #   3rd Order coefficient : p[3]
    return p[0]+p[1]*x+p[2]*x**2+p[3]*x**3
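## Example of a user-defined model (a sketch, not used below): any function
## with the signature f(p, x) that returns an array works with
## scipy.odr.Model, e.g. an exponential decay on a constant background.
#def ExpDecay(p,x) :
#    B = p[0]    # Constant Background
#    A = p[1]    # Amplitude at x = 0
#    tau = p[2]  # Decay constant
#    return B + A*numpy.exp(-x/tau)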
## Choose function and data to fit:
func = Semicircle
if func.__name__ == "Gauss" :
    # Initial guesses for fit parameters
    p_guess = (10.,400.,1173.9,0.4,)
    # Gaussian data with large x uncertainties.
    # This data shows how ODR can fit data that would give great trouble
    # to regular chi-squared fits (which ignore x errors).
    data_file = "gauss_xy_errors_ugly.txt"
    # Labels for plot axes
    x_label = "energy [keV]"
    y_label = "Counts"
elif func.__name__ == "Semicircle" :
    p_guess = (1,1,1)
    # This data is a semicircle, and shows ODR doing the best it can
    # with data that is a poor match to the fitting function.
    data_file = "semicircle.txt"
    x_label = "x"
    y_label = "y"
elif func.__name__ == "Cauchy" :
    p_guess = (0,1,2,1)
    # This data is actually generated from a semicircle distribution,
    # but the fit to a Cauchy is not bad, showing that the same data
    # can sometimes be well fit by different distributions.
    data_file = "semicircle.txt"
    x_label = "x"
    y_label = "y"
elif func.__name__ == "Quadratic" :
    p_guess = (0,1,2)
    # This data is a parabola.
    data_file = "parabola_xy_errors.txt"
    x_label = "x"
    y_label = "y"
elif func.__name__ == "Cubic" :
    p_guess = (0,1,0,0)
    # This data is a cubic.
    data_file = "cubic_data.txt"
    x_label = "x"
    y_label = "y"
else :
    # default is linear
    func = Linear
    p_guess = (2,2)
    # Linear data with large x uncertainties
    data_file = "linear_xy_errors.txt"
    x_label = "x"
    y_label = "y"
## Load data to fit
# Any lines in the data file starting with '#' are ignored
# "data" are values, "sigma" are uncertainties
x_data, x_sigma, y_data, y_sigma = numpy.loadtxt(data_file,
                                                 comments='#',
                                                 unpack=True)
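# The data file is expected to hold four whitespace-separated columns in
# the order x, x_sigma, y, y_sigma (matching the unpacking above).
# An illustrative sketch of the layout (not the contents of any real file):
#   1.00   0.25   2.13   0.30
#   2.00   0.25   3.87   0.30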
if x_uncertainties_ignored:
    # x uncertainties cannot be set exactly to zero without crashing the
    # fit, but a tiny value seems to do the job.
    x_sigma = len(x_data)*[1e-99]
# Load data for ODR fit
data = scipy.odr.RealData(x=x_data, y=y_data, sx=x_sigma, sy=y_sigma)
# Load model for ODR fit
model = scipy.odr.Model(func)
## Now fit model to data
# job=10 selects central finite differences instead of forward
# differences when doing numerical differentiation of function
# maxit is maximum number of iterations
# The scipy.odr documentation is a bit murky, so to be clear note
# that calling ODR automatically sets full_output=1 in the
# low level odr function.
fitted = scipy.odr.ODR(data, model, beta0=p_guess, maxit=5000,job=10)
output = fitted.run()
p = output.beta # 'beta' is an array of the parameter estimates
cov = output.cov_beta # parameter covariance matrix
# "Quasi-chi-squared" is defined to be the
# [total weighted sum of squares]/dof,
# i.e. the same as numpy.sum((residual/sigma_odr)**2)/dof or
#   numpy.sum( ((output.xplus-x_data)/x_sigma)**2
#             +((y_data-output.y)/y_sigma)**2 )/dof
# It converges to the conventional chi-squared for zero x uncertainties.
quasi_chisq = output.res_var
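# Optional sanity check of the definition above (a sketch, assuming
# non-zero x_sigma and y_sigma): recompute the quasi-chi-squared per
# degree of freedom directly from the fit output; it should agree with
# output.res_var up to rounding.
quasi_chisq_check = ( numpy.sum( ((output.xplus-x_data)/x_sigma)**2
                                +((y_data-output.y)/y_sigma)**2 )
                      / (len(x_data) - len(p)) )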
uncertainty = output.sd_beta # parameter standard uncertainties
# scipy.odr scales the parameter uncertainties by quasi_chisq.
# If the fit is poor, i.e. quasi_chisq is large, the uncertainties
# are scaled up, which makes sense. If the fit is too good,
# i.e. quasi_chisq << 1, it suggests that the uncertainties have
# been overestimated, but it seems risky to scale down the
# uncertainties; i.e. the uncertainties shouldn't be too sensitive
# to random fluctuations of the data, and if the uncertainties are
# overestimated, this should be understood and corrected.
if quasi_chisq < 1.0 :
    uncertainty = uncertainty/numpy.sqrt(quasi_chisq)
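# For comparison, unscaled parameter uncertainties can be read directly
# from the covariance-matrix diagonal (a sketch, assuming scipy.odr's
# convention that sd_beta**2 = res_var * diag(cov_beta)):
uncertainty_from_cov = numpy.sqrt(numpy.diag(cov))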
if False: # debugging print statements
    print("sd_beta",output.sd_beta)
    print("cov_beta",output.cov_beta)
    print("delta",output.delta)
    print("eps",output.eps)
    print("res_var",output.res_var)
    print("rel_error",output.rel_error)
######################## PRINT THE RESULTS ###########################
print("***********************************************************")
print(" ORTHOGONAL DISTANCE REGRESSION")
# print out version information for debugging
import sys
print("Python",sys.version)
print("Scipy:",scipy.__version__,", Numpy:",numpy.__version__,
      ", Matplotlib:",matplotlib.__version__, sep="")
print("***********************************************************\n")
print("ODR algorithm stop reason: " + output.stopreason[0])
print("\nFit {0} Data points from file: {1}"
      .format(len(x_data),data_file))
print("To Model :")
print(" ",inspect.getsource(func))
if x_uncertainties_ignored:
    print("** WARNING: x uncertainties set to zero in fit. **\n")
print("Estimated parameters and uncertainties")
for i in range(len(p)) :
    print((" p[{0}] = {1:10.5g} +/- {2:10.5g}"
           + " (Starting guess: {3:10.5g})").format(
               i, p[i], uncertainty[i], p_guess[i]))
# number of degrees of freedom for fit
dof = len(x_data) - len(p_guess)
# Note: odr, unlike curve_fit, returns the standard covariance matrix.
print("\nStandard Covariance Matrix : \n", cov, "\n")
print("\nCorrelation Matrix :")
for i,row in enumerate(cov):
    for j in range(len(p)) :
        print("{0:< 8.3g}".format(
            cov[i,j]/numpy.sqrt(cov[i,i]*cov[j,j])), end="")
        # Newbie Notes: "{0:< 8.3g}" left justifies output with
        # space in front of positive numbers, with 3 sig figs;
        # end="" suppresses new line.
    print()
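# The same correlation matrix in one vectorized step (a sketch,
# equivalent to the nested loops above):
cov_diag_sqrt = numpy.sqrt(numpy.diag(cov))
correlation_matrix = cov / numpy.outer(cov_diag_sqrt, cov_diag_sqrt)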
# Calculate initial residuals and adjusted error 'sigma_odr'
# for each data point
delta = output.delta # estimated x-component of the residuals
epsilon = output.eps # estimated y-component of the residuals
# (dx_star,dy_star) are the projections of x_sigma and y_sigma onto
# the residual line, i.e. the differences in x & y between the data
# point and the point where the orthogonal residual line intersects
# the 'ellipse' created by x_sigma & y_sigma.
dx_star = ( x_sigma*numpy.sqrt( ((y_sigma*delta)**2) /
                                ( (y_sigma*delta)**2 + (x_sigma*epsilon)**2 ) ) )
dy_star = ( y_sigma*numpy.sqrt( ((x_sigma*epsilon)**2) /
                                ( (y_sigma*delta)**2 + (x_sigma*epsilon)**2 ) ) )
sigma_odr = numpy.sqrt(dx_star**2 + dy_star**2)
# residual is positive if the point lies above the fitted curve,
# negative if below
residual = ( numpy.sign(y_data-func(p,x_data))
             * numpy.sqrt(delta**2 + epsilon**2) )
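# A hedged check of the "converges to conventional chi-square" note above:
# when x uncertainties are negligible, dx_star -> 0 and sigma_odr -> y_sigma,
# so residual/sigma_odr reduces to the usual pull (y_data-func(p,x_data))/y_sigma.
if x_uncertainties_ignored:
    print("Max |sigma_odr - y_sigma| =",
          numpy.max(numpy.abs(sigma_odr - y_sigma)))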
print(("\nQuasi Chi-Squared/dof = {0:10.5f}," +
       " Chi-Squared CDF = {1:10.5f}%").format(
           quasi_chisq,
           100.*float(scipy.special.chdtrc(dof,dof*quasi_chisq))))
print(" WARNING: The above CDF is not valid for large x uncertainties!")
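# The same tail probability via scipy.stats (a sketch):
# scipy.special.chdtrc(dof, x) is the chi-squared survival function,
# so chi2.sf should reproduce the printed CDF value.
cdf_from_stats = 100.*scipy.stats.chi2.sf(dof*quasi_chisq, dof)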
print("\nTo use Monte Carlo simulation to more accurately estimate")
print(' CDF for large x uncertainties, re-run program with ')
print(' "Run_Monte_Carlo_CDF_Estimator = True" and')
print(' "Number_of_MC_iterations >= 1000."', end="")
print(' This may take some time\n')
print("Run_Monte_Carlo_CDF_Estimator = {0}"
      .format(Run_Monte_Carlo_CDF_Estimator))
if Run_Monte_Carlo_CDF_Estimator :
    print("\n**** Running Monte Carlo CDF Estimator ****")
    print("Number_of_MC_iterations = {0}"
          .format(Number_of_MC_iterations), end="")
    # Initialize Monte Carlo output distributions
    x_dist = Number_of_MC_iterations*[None]
    y_dist = Number_of_MC_iterations*[None]
    p_dist = Number_of_MC_iterations*[None]
    quasi_chisq_dist = Number_of_MC_iterations*[None]
    # Initialize timing measurement
    # (time.clock() was removed in Python 3.8; perf_counter is the
    #  portable replacement.)
    import time
    start_time = time.perf_counter()
    for i in range(Number_of_MC_iterations) :
        # Starting with the x and x uncertainty (x_sigma) values from
        # the data, calculate Monte Carlo x values assuming a normal
        # gaussian distribution.
        x_dist[i] = numpy.random.normal(x_data,x_sigma)
        # Calculate y from the model at the original x values, then
        # smear by the y uncertainty.
        y_dist[i] = numpy.random.normal(func(p,x_data),y_sigma)
        # Fit the Monte Carlo x,y pseudo-data to the original model
        data_dist = scipy.odr.RealData( x=x_dist[i], y=y_dist[i],
                                        sx=x_sigma, sy=y_sigma)
        model_dist = scipy.odr.Model(func)
        fit_dist = scipy.odr.ODR(data_dist, model_dist, p, maxit=5000,
                                 job=10)
        output_dist = fit_dist.run()
        p_dist[i] = output_dist.beta
        quasi_chisq_dist[i] = output_dist.res_var
    end_time = time.perf_counter()
    print(" simulations in {0} seconds.".format(end_time-start_time))
    # Sort the simulated quasi-chi-squared values
    quasi_chisq_sort = numpy.sort(quasi_chisq_dist)
    # Find the index of the sorted simulated quasi-chi-squared value
    # nearest to the data quasi-chi-squared value, to estimate the CDF.
    print("\nFraction of quasi-chi-squared values larger" +
          " than observed value:")
    print(" Monte Carlo CDF = {0:6.1f}%".format(
        100.*(1.0-numpy.abs(quasi_chisq_sort-quasi_chisq).argmin()
              /float(Number_of_MC_iterations))))
    print(" Minimum, Mean, and Maximum Quasi Chi-Squared values:",
          end="")
    print("{0:6.2g} {1:6.2g} {2:6.2g}".format(
        numpy.min(quasi_chisq_dist),
        numpy.average(quasi_chisq_dist),
        numpy.max(quasi_chisq_dist)))
    print("\nAverage and Standard Deviation of MC Fit parameters")
    ave_p = numpy.average(numpy.array(p_dist),axis=0)
    std_p = numpy.std(numpy.array(p_dist),axis=0)
    min_p = numpy.min(numpy.array(p_dist),axis=0)
    max_p = numpy.max(numpy.array(p_dist),axis=0)
    for i in range(len(p)) :
        print((" p[{0}] = {1:12.6g} +/- {2:12.6g} ;"
               + " (Min = {3:12.6f}, Max = {4:12.6f})").format(
                   i, ave_p[i], std_p[i], min_p[i], max_p[i]))
    print("\nCheck for any Fit Biases in MC Fit parameters"
          + " (Significance and ratio)")
    # Check if the fit on Monte Carlo data tends to give an average output
    # value for the parameters different from the input parameters.
    for i in range(len(p)) :
        print(" p[{0}] = {1:< 6.2f} ({2:<12.7g}+/-{3:<12.7g})"
              .format(i, (ave_p[i]/p[i]-1)/
                      (std_p[i]/p[i]/numpy.sqrt(Number_of_MC_iterations-1)),
                      ave_p[i]/p[i],
                      std_p[i]/p[i]/numpy.sqrt(Number_of_MC_iterations-1)))
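    # A more direct estimate of the same Monte Carlo CDF (a sketch): the
    # fraction of simulated quasi-chi-squared values at least as large as
    # the observed one.
    mc_cdf_direct = 100.*numpy.mean(
        numpy.array(quasi_chisq_dist) >= quasi_chisq)
    print(" Direct Monte Carlo CDF estimate = {0:6.1f}%"
          .format(mc_cdf_direct))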
## Plot
# create figure with light gray background
fig = pyplot.figure(facecolor="0.98")
# 2 rows, 1 column, subplot 1 (the fit); the residuals go in subplot 2 below.
fit = fig.add_subplot(211)
# remove tick labels from upper plot (for clean look)
fit.set_xticklabels( () )
pyplot.ylabel(y_label)
pyplot.title("Orthogonal Distance Regression Fit to Data")
# Plot data as red circles, and fitted function as (default) line.
# For a smooth look, generate many x values for plotting the model.
stepsize = (max(x_data)-min(x_data))/1000.
margin = 50*stepsize
x_model = numpy.arange( min(x_data)-margin, max(x_data)+margin,
                        stepsize)
fit.plot(x_data,y_data,'ro', x_model, func(p,x_model),markersize=4)
# Add error bars on data as red crosses.
fit.errorbar(x_data, y_data, xerr=x_sigma, yerr=y_sigma, fmt='r+')
fit.set_yscale('linear')
# draw the starting guess as a dashed green line ('g-' with a dashed linestyle)
fit.plot(x_model, func(p_guess,x_model), 'g-', label="Start",
         linestyle="--")
# output.xplus = x + delta
a = numpy.array([output.xplus,x_data])
# output.y = f(p, xfit), or y + epsilon
b = numpy.array([output.y,y_data])
fit.plot( numpy.array([a[0][0],a[1][0]]),
          numpy.array([b[0][0],b[1][0]]),
          'k-', label='Residuals')
for i in range(1,len(y_data)):
    fit.plot( numpy.array([a[0][i],a[1][i]]),
              numpy.array([b[0][i],b[1][i]]),
              'k-')
fit.legend(loc='upper left')
fit.grid()
# separate plot to show residuals
# separate plot to show residuals: 2 rows, 1 column, subplot 2
residuals = fig.add_subplot(212)
residuals.errorbar(x=x_data, y=residual, yerr=sigma_odr,
                   fmt="r+", label="Residuals")
# make sure residual plot has same x axis as fit plot
residuals.set_xlim(fit.get_xlim())
# Draw a horizontal line at zero on residuals plot
pyplot.axhline(y=0, color='b')
# Label axes
pyplot.xlabel(x_label)
# These data look better in 'plain', not scientific, notation, and with
# tick labels that are not offset by a constant (as they are by default).
pyplot.ticklabel_format(style='plain', useOffset=False, axis='x')
pyplot.ylabel(y_label)
residuals.grid()
pyplot.show()
@shortydutchie commented:
Is it worth calculating the standard deviations in the parameters from the covariance matrix as

stddev_cov = numpy.sqrt(numpy.diag(cov))

to get round the chisq scaling?
