Created
January 27, 2012 12:48
-
-
Save dhoerl/1688626 to your computer and use it in GitHub Desktop.
BEST FIT NORMAL CURVE TO A HISTOGRAM
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
BEST FIT NORMAL CURVE TO A HISTOGRAM | |
I had a business need to provide the mean value of a process based on its bell-shaped histogram. Now, normally that would be an easy thing to do—there is a standard formula to get the mean and standard deviation. | |
However, in my case, the histogram has a skew. One side does not tail off to zero, but to some value midway between the peak value and zero. | |
After lots of googling, I found a paper written in 1993 by Brown and Hwang titled How To Approximate a Histogram by a Normal Density (http://www.jstor.org/pss/2685281). After purchasing a copy, I inquired if the author’s had software that implemented their algorithms. When the answer came back, I had to dig deep and turn their formulas into code, One integral had no series sum solution, so I had to really really dig deep and figure out how to do the conversion myself. | |
By intent I did all the conversion on my own time, and after I got everything working I sent the C Code to the authors in case anyone else ever asks. | |
In the process of trying to solve the integral, I found (but did not use) the Math Help Forum (http://www.mathhelpforum.com/), which looked like it would be of great help to people in need of assistance. | |
However, for sure, I could not have done this without Wikipedia. The GNU Scientific Library was of great help for its minimization routines. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
//--------------------------------------------------------------------------------------- | |
// Histogram.c created by David Hoerl on Mon 16-Jun-2008 | |
// | |
// Copyright (c) 2008 by David Hoerl. All rights reserved. | |
// | |
// Permission to use, copy, modify and distribute this software and its documentation | |
// is hereby granted, provided that both the copyright notice and this permission | |
// notice appear in all copies of the software, derivative works or modified versions, | |
// and any portions thereof, and that both notices appear in supporting documentation, | |
// and that credit is given to David Hoerl in all documents and publicity | |
// pertaining to direct or indirect use of this code or its derivatives. | |
// | |
// THIS IS EXPERIMENTAL SOFTWARE AND IT IS KNOWN TO HAVE BUGS, SOME OF WHICH MAY HAVE | |
// SERIOUS CONSEQUENCES. THE COPYRIGHT HOLDER ALLOWS FREE USE OF THIS SOFTWARE IN ITS | |
// "AS IS" CONDITION. THE COPYRIGHT HOLDER DISCLAIMS ANY LIABILITY OF ANY KIND FOR ANY | |
// DAMAGES WHATSOEVER RESULTING DIRECTLY OR INDIRECTLY FROM THE USE OF THIS SOFTWARE | |
// OR OF ANY DERIVATIVE WORK. | |
//--------------------------------------------------------------------------------------- | |
// Program Notes: | |
// The code contained in this file implements the functions and algorithms from | |
// "How to Approximate a Histogram by Normal density" by Lawrence Brown and Gene Hwang, | |
// published in the American Statistician, November 1993, Vol. 47, No. 4 | |
// | |
// Equations 3.8, 3.9, 4.1, and 4.2 are as presented in the paper. Equation 3.1, D(mu, sigma), | |
// has been converted to series sum. | |
// | |
// Note that equations 3.8 and 3.8 operate on a normalized histogram, while equations 4.1 and | |
// 4.2 operate on the original data that produced the histogram. Strictly as an exercise, | |
// two new functions, 4.1x and 4.2x are provided that operate as 4.1/4.2 but on the histogram. | |
// Essentially they assume that each histogram bar represents the appropriate number of data | |
// points having a "x" value centered in the bar. | |
// | |
// This code requires GNU's Scientific Library available from the GNU site. Parts of the "solver" | |
// routine are based on the example code presented in the library's documentation. | |
// | |
// To build this program: | |
// 1) obtain GNU's Scientific Library, configure, build and install it in /usr/local/lib (the default) | |
// 2) compile the program: gcc -o hist -std=c99 histogram.c -lc -lgsl | |
// 3) run it as "hist" | |
// | |
// Please forward errors or suggestions to dhoerl at mac dot com | |
// | |
// uses tabs set at 4 spaces per | |
#ifdef __OBJC__ | |
#import <Foundation/Foundation.h> | |
#endif | |
#import <stdio.h> | |
#import <stdint.h> | |
#import <stdbool.h> | |
#import <strings.h> | |
#import <gsl/gsl_errno.h> | |
#import <gsl/gsl_math.h> | |
#import <gsl/gsl_min.h> // minimization | |
#import <gsl/gsl_histogram.h> | |
#define MAX_ITER 100 // GSL suggestion | |
#define MAX_ERR 0.001 // GSL suggestion | |
#define DELTA 0.25 // 25% | |
#define MIN_BND(x) ((x) * (1.0 - DELTA)) | |
#define MAX_BND(x) ((x) * (1.0 + DELTA)) | |
struct _parm; // need this to get the typedefs below to compile | |
typedef double (*fFunc)(double mu, struct _parm *p); | |
typedef struct _parm { | |
double mu; | |
double muMin; | |
double muMax; | |
double sigma; | |
double sigmaMin; | |
double sigmaMax; | |
double *hist; | |
double M; // step width | |
double E_0; // left endpoint of first bar | |
double E_M; // right endpoint of last bar | |
size_t count; | |
fFunc f1Func; | |
fFunc f2Func; | |
bool dataset; // is this a histogram or a dataset | |
char debug; // set debug for subroutines | |
} parms, *parmsP; | |
typedef struct { | |
double val; | |
double minVal; | |
double maxVal; | |
bool valid; | |
} runParms; | |
// The following histogram was created by crudely drawing a bell curve on a piece of graph paper. | |
// Obtaining a good mu fit in the face of skewing was important to me, thus the option to run the | |
// program with the left tail. | |
// | |
#define SKEW_MUL 1 | |
#define SKEW_VAL 8 | |
#define NUM_HIST 25 | |
double hist[NUM_HIST] = { | |
#if 1 | |
6.1 + SKEW_MUL * SKEW_VAL, //0.1, | |
6.1, //0.6, | |
6.1, //1.2, | |
6.1, //2.0, | |
6.1, //3.1, | |
6.1, //4.4, | |
#else | |
0.1, | |
0.6, | |
1.2, | |
2.0, | |
3.1, | |
4.4, | |
#endif | |
6.1, | |
8.0, | |
12.0, | |
15.50, | |
17.75, | |
19.0, | |
19.35, | |
19.15, | |
18.0, | |
14.0, | |
10.0, | |
6.75, | |
5.3, | |
4.0, | |
3.0, | |
2.0, | |
1.35, | |
0.65, | |
0.15 | |
}; | |
// Performs one iteration on mu and then sigma | |
extern int solver(parmsP p); | |
// normalize the histogram | |
extern void scaleData(double *array, size_t arraySize); | |
// determine initial values as suggest in Chapter 6 | |
extern void initStats(parmsP p); | |
// Equation 3.1 | |
extern double D(double mu, double sigma, parmsP p); | |
extern double eq38(double mu, parmsP p); | |
extern double eq39(double mu, parmsP p); | |
extern double eq41(double mu, parmsP p); | |
extern double eq42(double mu, parmsP p); | |
// Similar to 4.1/4.2, but operates on a histogram, not the data set. | |
extern double eq41x(double mu, parmsP p); | |
extern double eq42x(double mu, parmsP p); | |
// Standard formulas on Normal Density and Cumulative Normal Density | |
extern double theta(double t, double mu, double sigma); | |
extern double cdf(double t, double mu, double sigma); | |
// Cumulative Density of theta squared (needed to compute D) | |
extern double cdfSqrd(double t, double mu, double sigma); | |
// routines as described in Chaper 6 | |
extern runParms suggestMu(double mu, parmsP p); | |
extern runParms suggestSigma(double mu, parmsP p); | |
// utility routine to see how D changes as sigma is varied | |
extern void dumper(parmsP p); | |
int main (int argc, const char * argv[]) | |
{ | |
gsl_histogram *origHist, *scaledHist; | |
parms p; | |
int debug; | |
debug = 0; // set it to progressively higher values to see more output | |
bzero(&p, sizeof(p)); | |
p.debug = debug; | |
srand48(0); // seed random number generator | |
#ifdef __OBJC__ | |
NSAutoreleasePool *pool = [NSAutoreleasePool new]; | |
#endif | |
origHist = gsl_histogram_alloc(NUM_HIST); | |
gsl_histogram_set_ranges_uniform(origHist, 0.0, (double)NUM_HIST); | |
bcopy(hist, origHist->bin, NUM_HIST*sizeof(double)); | |
scaledHist = gsl_histogram_alloc(NUM_HIST); | |
bcopy(hist, scaledHist->bin, NUM_HIST*sizeof(double)); | |
bcopy(origHist->range, scaledHist->range, (NUM_HIST+1)*sizeof(double)); | |
printf("Histogram: mu=%f sigma=%f\n", gsl_histogram_mean(origHist), gsl_histogram_sigma(origHist)); | |
scaleData(scaledHist->bin, NUM_HIST); | |
// First run uses equations 3.8 and 3.9 | |
p.dataset = false; | |
p.hist = scaledHist->bin; | |
p.count = NUM_HIST; | |
p.M = 1.0; // bar width | |
p.E_0 = 0.0; // first histogram bar's left side "x" | |
p.f1Func = eq38; | |
p.f2Func = eq39; | |
initStats(&p); | |
solver(&p); | |
if(debug) dumper(&p); | |
// Second run uses modified 4.1 and 4.2 equations | |
p.f1Func = eq41x; | |
p.f2Func = eq42x; | |
initStats(&p); | |
solver(&p); | |
if(debug) dumper(&p); | |
// Third run creates a data set that resembles the histogram, for use with eq41 and 42 | |
double *data, *dataP, sum; | |
size_t dataSetSize; | |
sum = gsl_histogram_sum(origHist); | |
dataSetSize = (size_t)sum+1; | |
data = calloc(sizeof(double), dataSetSize); | |
dataP = data; | |
for(size_t i=0; i<NUM_HIST; ++i) { | |
size_t j = (size_t)hist[i]; // round down | |
//printf("Data[o] = %d\n", j+1); | |
do { | |
*dataP++ = (double)i + 0.5; | |
} while(j--); | |
} | |
dataSetSize = dataP - data; | |
//for(size_t i=0; i<dataSetSize; ++i) { | |
// printf("Data[%d]=%.1lf\n", i, data[i]); | |
//} | |
p.dataset = true; | |
p.hist = data; | |
p.count = dataSetSize; | |
p.f1Func = eq41; | |
p.f2Func = eq42; | |
initStats(&p); | |
solver(&p); | |
if(debug) dumper(&p); | |
#ifdef __OBJC__ | |
[pool drain]; | |
#endif | |
return 0; | |
} | |
void scaleData(double *array, size_t arraySize) { | |
double total = 0.0; | |
for(size_t i=0; i<arraySize; ++i) { | |
total += array[i]; | |
} | |
for(size_t i=0; i<arraySize; ++i) { | |
array[i] /= total; | |
} | |
} | |
void initStats(parmsP p) | |
{ | |
double total, sum, sumSqrs; | |
int mean, firstQ, thirdQ; | |
double minX, maxX; | |
// normalized histograms have a total of 1.0; this allows non-normalized histograms and datasets | |
minX = DBL_MAX; | |
maxX = DBL_MIN; | |
sum = sumSqrs = 0.0; | |
for(uint32_t i=0; i<p->count; ++i) { | |
sum += p->hist[i]; | |
sumSqrs += p->hist[i] * p->hist[i]; | |
if(p->hist[i] < minX) minX = p->hist[i]; | |
if(p->hist[i] > maxX) maxX = p->hist[i]; | |
} | |
if(p->dataset == false) { | |
p->E_M = p->E_0 + p->M * (double)p->count; | |
total = 0.0; | |
mean = firstQ = thirdQ = 0; | |
for(uint32_t i=p->count; i>0; ) { | |
i--; | |
total += p->hist[i]/sum; | |
if(total >= 0.75 && firstQ == 0) { | |
firstQ = i; | |
} | |
if(total >= 0.50 && mean == 0) { | |
mean = i; | |
} | |
if(total >= 0.25 && thirdQ == 0) { | |
thirdQ = i; | |
} | |
} | |
p->sigma = ((double)(thirdQ - firstQ) * p->M)/2.0; | |
p->mu = p->E_0 + p->M*((double)mean + 0.5); | |
} else { | |
double var; | |
p->mu = sum/p->count; | |
var = (sumSqrs - sum*p->mu)/(double)p->count; | |
p->sigma = sqrt(var); | |
p->M = (maxX - minX)/(double)p->count; | |
p->E_0 = minX; | |
p->E_M = maxX; | |
} | |
p->sigmaMin = p->M; | |
p->sigmaMax = p->sigma * 2.0; | |
p->muMin = p->mu - p->sigma; | |
p->muMax = p->mu + p->sigma; | |
if(p->debug>1) { | |
printf("mu=%f muMin=%f muMax=%f ", p->mu, p->muMin, p->muMax); | |
printf("sigma=%f sigmaMin=%f sigmaMax=%f ", p->sigma, p->sigmaMin, p->sigmaMax); | |
printf("E_0=%f M=%f E_M=%f\n", p->E_0, p->M, p->E_M); | |
} | |
} | |
runParms suggestMu(double mu, parmsP p) | |
{ | |
double startD, startVal, interval; | |
double leftVal, leftD, rightVal, rightD; | |
runParms r; | |
bzero(&r, sizeof(r) ); // kch == keep compiler happy | |
for(int i=0; i<100; ++i) { | |
// first try to bound where we are with intervals | |
interval = p->M; | |
while(true) { | |
if((mu-interval) < p->muMin || (mu+interval) > p->muMax) { | |
break; | |
} | |
startVal = p->f1Func(mu, p); | |
startD = D(mu, p->sigma, p); | |
leftVal = p->f1Func(mu-interval, p); | |
leftD = D(mu-interval, p->sigma, p); | |
rightVal = p->f1Func(mu+interval, p); | |
rightD = D(mu+interval, p->sigma, p); | |
if( | |
(leftVal > startVal && rightVal > startVal) && | |
(leftD >= startD && rightD >= startD) | |
) { | |
r.valid = true; | |
r.val = mu; | |
r.minVal = mu-interval; | |
r.maxVal = mu+interval; | |
//printf("SUGGEST MU: %f %f %f\n", r.val, r.minVal, r.maxVal); | |
return r; | |
} | |
interval += p->M/4.0; | |
} | |
// OK, that didn't work - lets try some random values | |
if(p->debug>2) printf("RANDOM MU!\n"); | |
mu = drand48() * (p->muMax - p->muMin) + p->muMin; | |
} | |
r.valid = false; | |
return r; | |
} | |
runParms suggestSigma(double sigma, parmsP p) | |
{ | |
double startD, startVal, interval; | |
double leftVal, leftD, rightVal, rightD; | |
runParms r; | |
bzero(&r, sizeof(r) ); // kch == keep compiler happy | |
for(int i=0; i<100; ++i) { | |
// first try to bound where we are with intervals | |
interval = 0; | |
while(true) { | |
//printf("suggestSigma: LIMIT=%f min=%f cur=%f max=%f LIMIT=%f\n", p->sigmaMin, sigma-interval, sigma, sigma+interval, p->sigmaMax); | |
if((sigma-interval) < p->sigmaMin || (sigma+interval) > p->sigmaMax) { | |
break; | |
} | |
startVal = p->f2Func(sigma, p); | |
startD = D(p->mu, sigma, p); | |
leftVal = p->f2Func(sigma-interval, p); | |
leftD = D(p->mu, sigma-interval, p); | |
rightVal = p->f2Func(sigma+interval, p); | |
rightD = D(p->mu, sigma+interval, p); | |
//printf(" [val=%f D=%f] > [val=%f D=%f] < [val=%f D=%f]\n", leftVal, leftD, startVal, startD, rightVal, rightD); | |
if( | |
(leftVal > startVal && rightVal > startVal) && | |
(leftD >= startD && rightD >= startD) | |
) { | |
r.valid = true; | |
r.val = sigma; | |
r.minVal = sigma-interval; | |
r.maxVal = sigma+interval; | |
//printf("SUGGEST SIGMA: %f %f %f\n", r.val, r.minVal, r.maxVal); | |
return r; | |
} | |
interval += p->M/4.0; | |
} | |
// OK, that didn't work - lets try some random values | |
if(p->debug>2) printf("RANDOM SIGMA!\n"); | |
sigma = drand48() * (p->sigmaMax - p->sigmaMin) + p->sigmaMin; | |
} | |
r.valid = false; | |
return r; | |
} | |
int solver(parmsP p) { | |
int status; | |
int iter; | |
gsl_min_fminimizer *s1, *s2; | |
double a, b; | |
double lastMu, lastSigma; | |
gsl_function F1, F2; | |
runParms r; | |
F1.params = p; | |
F2.params = p; | |
F1.function = (void *)p->f1Func; | |
F2.function = (void *)p->f2Func; | |
// choices are gsl_min_fminimizer_brent or gsl_min_fminimizer_goldensection | |
s1 = gsl_min_fminimizer_alloc (gsl_min_fminimizer_brent); | |
s2 = gsl_min_fminimizer_alloc (gsl_min_fminimizer_brent); | |
for(int it=0; it<MAX_ITER; ++it) { | |
lastMu = p->mu; | |
lastSigma = p->sigma; | |
/* | |
* Calculate mu - this function appears to be very uniform | |
*/ | |
iter = 0; | |
r = suggestMu(p->mu, p); | |
if(r.valid != true) { | |
printf("ERROR GETTING MU!\n"); | |
exit(0); | |
} | |
if(p->debug) printf("\nITERATION: %d mu=%f sigma=%f D=%f\n", it, r.val, p->sigma, D(p->mu, p->sigma, p) ); | |
gsl_min_fminimizer_set(s1, &F1, r.val, r.minVal, r.maxVal); | |
a = gsl_min_fminimizer_x_lower(s1); | |
b = gsl_min_fminimizer_x_upper(s1); | |
//printf ("MU using %s method\n", gsl_min_fminimizer_name (s1)); | |
if(p->debug>1) printf ("%5s [%9s, %9s] %9s %9s\n", "iter", "lower", "upper", "min", "err(est)"); | |
if(p->debug>1) printf ("%5d [%.7f, %.7f] %.7f %.7f\n", iter, a, b, p->mu, b - a); | |
do | |
{ | |
iter++; | |
status = gsl_min_fminimizer_iterate(s1); | |
p->mu = gsl_min_fminimizer_x_minimum(s1); | |
a = gsl_min_fminimizer_x_lower(s1); | |
b = gsl_min_fminimizer_x_upper(s1); | |
status = gsl_min_test_interval(a, b, MAX_ERR, 0.0); | |
if (status == GSL_SUCCESS) if(p->debug>1) printf ("Converged:\n"); | |
if(p->debug>1) printf ("%5d [%.7f, %.7f] %.7f %.7f\n", iter, a, b, p->mu, b - a); | |
} | |
while (status == GSL_CONTINUE && iter < MAX_ITER); | |
if(status != GSL_SUCCESS) { | |
printf("Failed to converge on mu\n"); | |
return -1; | |
} | |
/* | |
* Discover a reasonable starting value for mu | |
* If its the first iteration, or the second iteration with noisy or skewed data, | |
* the seed may be far away from a minimum, and so the initial values for it | |
* need tweaking, or the gnu minimizer fails | |
*/ | |
r = suggestSigma(p->sigma, p); | |
if(r.valid != true) { | |
printf("ERROR GETTING SIGMA!\n"); | |
exit(0); | |
} | |
if(p->debug) printf("PHASE II: mu=%f sigma=%f D=%f\n", p->mu, r.val, D(p->mu, r.val, p) ); | |
/* | |
* Calculate sigma | |
*/ | |
iter = 0; | |
gsl_min_fminimizer_set(s2, &F2, r.val, r.minVal, r.maxVal); | |
a = gsl_min_fminimizer_x_lower(s2); | |
b = gsl_min_fminimizer_x_upper(s2); | |
if(p->debug>1) printf ("%5s [%9s, %9s] %9s %9s\n", "iter", "lower", "upper", "min", "err(est)"); | |
if(p->debug>1) printf ("%5d [%.7f, %.7f] %.7f %.7f\n", iter, a, b, p->sigma, b - a); | |
do | |
{ | |
iter++; | |
status = gsl_min_fminimizer_iterate(s2); | |
p->sigma = gsl_min_fminimizer_x_minimum(s2); | |
a = gsl_min_fminimizer_x_lower(s2); | |
b = gsl_min_fminimizer_x_upper(s2); | |
status = gsl_min_test_interval(a, b, MAX_ERR, 0.0); | |
if (status == GSL_SUCCESS) if(p->debug>1) printf ("Converged:\n"); | |
if(p->debug>1) printf ("%5d [%.7f, %.7f] %.7f %.7f\n", iter, a, b, p->sigma, b - a); | |
} | |
while (status == GSL_CONTINUE && iter < MAX_ITER); | |
if(status != GSL_SUCCESS) { | |
printf("Failed to converge on sigma\n"); | |
return -1; | |
} | |
if((fabs(lastMu - p->mu) + fabs(lastSigma - p->sigma)) < MAX_ERR*2) { | |
printf("Convergence: mu=%f sigma=%f D=%f\n", p->mu, p->sigma, D(p->mu, p->sigma, p)); | |
return 0; | |
} | |
lastSigma = p->sigma; | |
} | |
gsl_min_fminimizer_free (s1); | |
gsl_min_fminimizer_free (s2); | |
return 0; | |
} | |
double eq38(double mu, parmsP p) { | |
double total; | |
total=0.0; | |
for(size_t i=0; i<p->count; ++i) { | |
total += p->hist[i] * (theta((double)(i+1), mu, p->sigma) - theta((double)(i+0), mu, p->sigma)); | |
} | |
return fabs(total); | |
} | |
double eq39(double sigma, parmsP p) { | |
double total, frontPart; | |
//printf("eq39: mu=%f sigma=%f ", p->mu, sigma ); | |
#if 1 // simpler to understand | |
frontPart = 4.0 * sigma * sqrt(M_PI); | |
#else | |
frontPart = sigma * 8.0/M_2_SQRTPI; // 2/sqrt(M_PI) | |
#endif | |
total=0.0; | |
for(size_t i=0; i<p->count; ++i) { | |
double l = (double)(i+1); | |
double r = (double)i; | |
total += p->hist[i] * (l*theta(l, p->mu, sigma) -r* theta(r, p->mu, sigma)); | |
} | |
total *= frontPart; | |
total -= 1.0; | |
//printf("ret=%f\n", fabs(total) ); | |
return fabs(total); | |
} | |
double eq41(double mu, parmsP p) { | |
double total; | |
total=0.0; | |
for(size_t i=0; i<p->count; ++i) { | |
total += (p->hist[i] - mu) * theta(p->hist[i], mu, p->sigma); | |
} | |
//printf("eq41(%f,%f)=%f\n", mu, p->sigma, fabs(total) ); | |
return fabs(total); | |
} | |
double eq42(double sigma, parmsP p) { | |
double total, frontPart, val; | |
//printf("eq42: mu=%f sigma=%f ", p->mu, sigma ); | |
#if 0 // simpler to understand | |
frontPart = 4.0 * sigma * sqrt(M_PI); | |
#else | |
frontPart = sigma * 8.0/M_2_SQRTPI; // 2/sqrt(M_PI) | |
#endif | |
total=0.0; | |
for(size_t i=0; i<p->count; ++i) { | |
val = (p->hist[i] - p->mu); | |
val *= val; | |
val /= sigma * sigma; | |
val = 1.0 - val; | |
val *= theta(p->hist[i], p->mu, sigma); | |
total += val; | |
// printf(" val[%d]=%.3f total=%.3f\n", i, val*frontPart, total*frontPart); | |
} | |
total *= frontPart; | |
total -= (double)p->count; | |
// printf("eq(mu=%.2f sigma=%.2f)=%.3f\n", p->mu, sigma, fabs(total) ); | |
return fabs(total); | |
} | |
double eq41x(double mu, parmsP p) { | |
double total; | |
total=0.0; | |
for(size_t i=0; i<p->count; ++i) { | |
total += p->hist[i]*((double)i+0.5 - mu) * theta((double)i+0.5, mu, p->sigma); | |
} | |
//printf("ret=%f\n", fabs(total) ); | |
return fabs(total); | |
} | |
double eq42x(double sigma, parmsP p) { | |
double total, frontPart, val; | |
//printf("eq42x: mu=%f sigma=%f ", p->mu, sigma ); | |
#if 0 // simpler to understand | |
frontPart = 4.0 * sigma * sqrt(M_PI); | |
#else | |
frontPart = sigma * 8.0/M_2_SQRTPI; // 2/sqrt(M_PI) | |
#endif | |
total=0.0; | |
for(size_t i=0; i<p->count; ++i) { | |
val = ((double)i+0.5 - p->mu); | |
val *= val; | |
val /= sigma * sigma; | |
val = 1.0 - val; | |
val *= theta((double)i+0.5, p->mu, sigma); | |
total += p->hist[i]*val; | |
//printf(" val[%d]=%.3f total=%.3f\n", i, val*frontPart, total*frontPart); | |
} | |
total *= frontPart; | |
total -= 1.0; | |
//printf("eq(mu=%.2f sigma=%.2f)=%.3f\n", p->mu, sigma, fabs(total) ); | |
return fabs(total); | |
} | |
/* | |
* D(mu, sigma) = integral of difference of histogram to normal distribution curve. | |
* Integral can be converted to the sum for each bar i with left x=E[i], right= E[i+1], and value=hist[i]: | |
* 1/(sigma*sqrt(2.0*M_PI))( cdf(E[i+1], mu, sigma/sqrt(2)) - cdf(E[i], mu, sigma/sqrt(2)) | |
* - (cdf(E[i+1], mu, sigma) - cdf(E[i], mu, sigma))*2.0 *hist[i] | |
* + hist[i] * hist[i] * (E[i+1] - E[i]) | |
* | |
*/ | |
double D(double mu, double sigma, parmsP p) | |
{ | |
double total, val, barHi, barLo; | |
// this function is immaterial for datasets | |
if(p->dataset) return 0.0; | |
total=0.0; | |
for(size_t i=0; i<p->count; ++i) { | |
// using fraction of progress avoid rounding errors that would be introduced by just adding in M | |
barLo = p->E_0 + ((p->E_M - p->E_0)*(double)i)/(double)p->count; | |
barHi = barLo + p->M; | |
val = cdfSqrd(barHi, mu, sigma) - cdfSqrd(barLo, mu, sigma); // first term of equation | |
val += p->hist[i] * p->hist[i] * p->M; // last term of equation | |
val -= (cdf(barHi, mu, sigma) - cdf(barLo, mu, sigma))* 2.0 * p->hist[i]; | |
total += val; | |
} | |
return total; | |
} | |
double theta(double t, double mu, double sigma) { | |
double pwr, eval; | |
pwr = t - mu; | |
pwr *= pwr; | |
pwr /= 2.0*sigma*sigma; | |
pwr *= -1.0; | |
eval = exp(pwr); | |
#if 0 // simpler to understand | |
eval /= sigma * sqrt(2 * M_PI); | |
#else | |
eval /= sigma; | |
eval *= (M_2_SQRTPI * M_SQRT1_2)/2.0; | |
#endif | |
return eval; | |
} | |
double cdf(double t, double mu, double sigma) { | |
double val, erfval; | |
val = t - mu; | |
val /= sigma * M_SQRT2; | |
erfval = erf(val); | |
val = (1.0 + erfval)/2.0; | |
//printf("ERF(t=%f mu=%f sig=%f)=%f\n", t, mu, sigma, val); | |
return val; | |
} | |
double cdfSqrd(double t, double mu, double sigma) { | |
double val, erfval; | |
val = t - mu; | |
val /= sigma; | |
erfval = erf(val); | |
#if 0 // simpler to understand | |
val = (1.0 + erfval)/(sigma*2.0*M_SQRT2*sqrt(2.0*M_PI)); | |
#else | |
val = (1.0 + erfval)/(sigma*2.0*M_SQRT2*2.0/(M_2_SQRTPI * M_SQRT1_2)); | |
#endif | |
return val; | |
} | |
void dumper(parmsP p) | |
{ | |
double startD, startVal, interval; | |
interval = p->M/4.0; | |
for(double i=p->sigmaMin; i<p->sigmaMax; i += interval) { | |
// first try to bound where we are with intervals | |
startVal = p->f2Func(i, p); | |
startD = D(p->mu, i, p); | |
printf("sigma=%f val=%f D=%f\n", i, startVal, startD); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment