nir-krakauer/shapiro_wilk_test.m

## shapiro_wilk_test.m
## Copyright (C) 2017 Nir Krakauer
##
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 3 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with this program; If not, see <http://www.gnu.org/licenses/>.

## -*- texinfo -*-
## @deftypefn {Function File} {[@var{pval}, @var{W}] = } @
## shapiro_wilk_test (@var{x})
##
## Test the hypothesis that @var{x} is from a normal distribution
## using the Shapiro-Wilk test. If the returned @var{pval} is small, reject
## the hypothesis at the @var{pval}*100% level.
##
## The test statistic @var{W} is based on the correlation between the order statistics of the data
## with those expected from a normal distribution with the same mean.
##
## References: @*
## Patrick Royston (1992), "Approximating the Shapiro-Wilk W-test for non-normality", Statistics and Computing, 2(3): 117-119 @*
## Patrick Royston (1995), "AS R94: a remark on Algorithm AS 181: the W-test for normality", Journal of the Royal Statistical Society. Series C (Applied Statistics), 44(4): 547-551.
##
## @end deftypefn

## Author: Nir Krakauer <mail@nirkrakauer.net>


function [pval, W] = shapiro_wilk_test (x)

#the W statistic is sum(A .* x).^2 / sumsq(center(x))
#where A is a vector of expected values of order statistics from a standard normal distribution
#once x is ordered, A is an antisymmetric vector, with A(n+1-i) = -A(i), 1 <= i <= n, and A'*A = 1
#because of this, only half of A needs to be calculated; we consider a = A(n:-1:(n+1-floor(n/2)))

n = numel (x);
nh = floor (n/2);

if n < 3
  error ('The input vector x should have at least 3 elements')
endif

if n == 3
  a = sqrt(0.5); #exact
else
  #get an initial estimate of a
  m = -norminv(((1:nh)' - 0.375) / (n + 0.25));
  msq = 2*sumsq(m);
  mrms = sqrt (msq);
  #correction factors for initial elements of a (derived to be good approximations for 4 <= n <= 1000)
  rsn = 1 / sqrt(n);
  ac1 = m(1)/mrms + polyval ([-2.706056 4.434685 -2.07119 -0.147981 0.221157 0], rsn);
  if n <= 5
    phi = (msq - 2*m(1)^2) / (1 - 2*ac1^2);
    a = [ac1; m(2:nh)/sqrt(phi)];
  else
    ac2 = m(2)/mrms + polyval ([-3.582633 5.682633 -1.752461 -0.293762 0.042981 0], rsn);
    phi = (msq - 2*m(1)^2 - 2*m(2)^2) / (1 - 2*ac1^2 - 2*ac2^2);
    a = [ac1; ac2; m(3:nh)/sqrt(phi)];
  endif
endif

x = sort (x(:));
xm = mean (x);
W = sum(a .* (x(n:-1:n+1-nh) - x(1:nh))).^2 / sumsq(x - xm);

#p value for the W statistic being as small as it is for a normal distribution
if n == 3
  pval = (6/pi) * (asin(sqrt(W)) - asin(sqrt(0.75)));
elseif n <= 11
  gamma = 0.459*n - 2.273;
  w = -log(gamma - log(1 - W));
  mu = polyval ([-0.0006714 0.025054 -0.39978 0.5440], gamma);
  sigma = exp (polyval([-0.0020322 0.062767 -0.77857 1.3822], n));
  pval = normcdf ((mu - w) / sigma);
else
  nl = log (n);
  w = log (1 - W);
  mu = polyval ([0.0038915 -0.083751 -0.31082 -1.5861], nl);
  sigma = exp (polyval ([0.0030302 -0.082676 -0.4803], nl));
  pval = normcdf ((mu - w) / sigma);
endif

%!shared x, W, pval
%! x = [139 157 175 256 344 413 503 577 614 655 954 1392 1557 1648 1690 1994 2174 2206 3245 3510 3571 4354 4980 6084 8351]; #test data adapted from royston95
%! [pval, W] = shapiro_wilk_test (x);
%!assert (pval, 9.14E-4, 0.01E-4)
%!assert (W, 0.83467, 0.00001)
	## Copyright (C) 2017 Nir Krakauer
	##
	## This program is free software; you can redistribute it and/or modify
	## it under the terms of the GNU General Public License as published by
	## the Free Software Foundation; either version 3 of the License, or
	## (at your option) any later version.
	##
	## This program is distributed in the hope that it will be useful,
	## but WITHOUT ANY WARRANTY; without even the implied warranty of
	## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	## GNU General Public License for more details.
	##
	## You should have received a copy of the GNU General Public License
	## along with this program; If not, see <http://www.gnu.org/licenses/>.

	## -- texinfo --
	## @deftypefn {Function File} {[@var{pval}, @var{W}] = } @
	## shapiro_wilk_test (@var{x})
	##
	## Test the hypothesis that @var{x} is from a normal distribution
	## using the Shapiro-Wilk test. If the returned @var{pval} is small, reject
	## the hypothesis at the @var{pval}*100% level.
	##
	## The test statistic @var{W} is based on the correlation between the order statistics of the data
	## with those expected from a normal distribution with the same mean.
	##
	## References: @*
	## Patrick Royston (1992), "Approximating the Shapiro-Wilk W-test for non-normality", Statistics and Computing, 2(3): 117-119 @*
	## Patrick Royston (1995), "AS R94: a remark on Algorithm AS 181: the W-test for normality", Journal of the Royal Statistical Society. Series C (Applied Statistics), 44(4): 547-551.
	##
	## @end deftypefn

	## Author: Nir Krakauer <mail@nirkrakauer.net>


	function [pval, W] = shapiro_wilk_test (x)

	#the W statistic is sum(A .* x).^2 / sumsq(center(x))
	#where A is a vector of expected values of order statistics from a standard normal distribution
	#once x is ordered, A is an antisymmetric vector, with A(n+1-i) = -A(i), 1 <= i <= n, and A'*A = 1
	#because of this, only half of A needs to be calculated; we consider a = A(n:-1:(n+1-floor(n/2)))

	n = numel (x);
	nh = floor (n/2);

	if n < 3
	error ('The input vector x should have at least 3 elements')
	endif

	if n == 3
	a = sqrt(0.5); #exact
	else
	#get an initial estimate of a
	m = -norminv(((1:nh)' - 0.375) / (n + 0.25));
	msq = 2*sumsq(m);
	mrms = sqrt (msq);
	#correction factors for initial elements of a (derived to be good approximations for 4 <= n <= 1000)
	rsn = 1 / sqrt(n);
	ac1 = m(1)/mrms + polyval ([-2.706056 4.434685 -2.07119 -0.147981 0.221157 0], rsn);
	if n <= 5
	phi = (msq - 2m(1)^2) / (1 - 2ac1^2);
	a = [ac1; m(2:nh)/sqrt(phi)];
	else
	ac2 = m(2)/mrms + polyval ([-3.582633 5.682633 -1.752461 -0.293762 0.042981 0], rsn);
	phi = (msq - 2m(1)^2 - 2m(2)^2) / (1 - 2ac1^2 - 2ac2^2);
	a = [ac1; ac2; m(3:nh)/sqrt(phi)];
	endif
	endif

	x = sort (x(:));
	xm = mean (x);
	W = sum(a .* (x(n:-1:n+1-nh) - x(1:nh))).^2 / sumsq(x - xm);

	#p value for the W statistic being as small as it is for a normal distribution
	if n == 3
	pval = (6/pi) * (asin(sqrt(W)) - asin(sqrt(0.75)));
	elseif n <= 11
	gamma = 0.459*n - 2.273;
	w = -log(gamma - log(1 - W));
	mu = polyval ([-0.0006714 0.025054 -0.39978 0.5440], gamma);
	sigma = exp (polyval([-0.0020322 0.062767 -0.77857 1.3822], n));
	pval = normcdf ((mu - w) / sigma);
	else
	nl = log (n);
	w = log (1 - W);
	mu = polyval ([0.0038915 -0.083751 -0.31082 -1.5861], nl);
	sigma = exp (polyval ([0.0030302 -0.082676 -0.4803], nl));
	pval = normcdf ((mu - w) / sigma);
	endif

	%!shared x, W, pval
	%! x = [139 157 175 256 344 413 503 577 614 655 954 1392 1557 1648 1690 1994 2174 2206 3245 3510 3571 4354 4980 6084 8351]; #test data adapted from royston95
	%! [pval, W] = shapiro_wilk_test (x);
	%!assert (pval, 9.14E-4, 0.01E-4)
	%!assert (W, 0.83467, 0.00001)