# zorbax/normality_tests.py

## normality_tests.py
#!/usr/bin/env python3

from numpy.random import seed
from numpy.random import randn
from numpy import mean
from numpy import std

import matplotlib.pyplot as plt
from statsmodels.graphics.gofplots import qqplot
from scipy.stats import shapiro
from scipy.stats import normaltest
from scipy.stats import anderson


"""
Normality test: techniques that can be used to check if a
data sample deviates from a Gaussian distribution.
"""

# Seed numpy's global RNG so every run draws the same sample.
seed(1)

# Draw 100 standard-normal values, scale them by 5 and shift them by 50.
data = randn(100) * 5 + 50

# Report the sample mean and standard deviation (std() with its defaults).
sample_mean = mean(data)
sample_std = std(data)
print('mean=%.3f stdv=%.3f' % (sample_mean, sample_std))

# Visual check: histogram of the sample.
plt.hist(data)
plt.show()

# Q-Q plot
"""
This plot generates its own sample of the idealized distribution
that we are comparing with. The idealized samples are divided
into groups called quantiles. Each data point in the sample is
paired with a similar member from the idealized distribution
at the same cumulative distribution.

The resulting points are plotted as a scatter plot with the
idealized value on the x-axis and the data sample on the y-axis.
"""

# Draw the Q-Q plot of the sample; line='s' requests statsmodels'
# "standardized" reference line (see the qqplot documentation).
qqplot(data, line='s')
# Display the Q-Q figure.
plt.show()

"""
The QQ plot shows the scatter plot of points in a diagonal line,
closely fitting the expected diagonal pattern for a sample
from a Gaussian distribution.
"""

# Shapiro-Wilk Test
# The function returns both the W-statistic calculated
# by the test and the p-value.

# Run the Shapiro-Wilk test; shapiro() returns the W statistic and p-value.
stat, p = shapiro(data)

print('Statistics=%.3f, p=%.3f' % (stat, p))

# interpret: H0 (the sample is Gaussian) is kept when the p-value
# exceeds the chosen significance level.
alpha = 0.05
if p > alpha:
	print('Sample looks Gaussian (fail to reject H0)')
else:
	print('Sample does not look Gaussian (reject H0)')

# D’Agostino’s K^2 test

# Calculates summary statistics from the data
# (kurtosis and skewness), to determine if the data
# distribution departs from the normal distribution

"""
* Skew:  is a quantification of how much a distribution is
         pushed left or right, a measure of asymmetry
         in the distribution.

* Kurtosis: quantifies how much of the distribution is in the tail.
"""


# D'Agostino's K^2 normality test: normaltest() yields the test
# statistic together with its p-value.
stat, p = normaltest(data)

print('Statistics=%.3f, p=%.3f' % (stat, p))

# interpret: compare the p-value with the significance level and
# print the matching verdict.
alpha = 0.05

print(
	'Sample looks Gaussian (fail to reject H0)'
	if p > alpha
	else 'Sample does not look Gaussian (reject H0)'
)

# p-value is interpreted against an alpha of 5% and finds
# that the test dataset does not significantly deviate from normal.

# Anderson-Darling Test
"""
The Anderson-Darling test is a statistical test that can be used to evaluate
whether a data sample comes from one of several known probability
distributions. It is a modified version of the Kolmogorov-Smirnov test,
a nonparametric goodness-of-fit statistical test.

A feature of the Anderson-Darling test is that it returns a list of
critical values rather than a single p-value. Critical values in a
statistical test are a range of pre-defined significance boundaries:
H0 fails to be rejected at a given significance level if the calculated
statistic is less than the corresponding critical value.
"""


# normality test
# anderson() is called without a `dist` argument, so scipy applies its
# default reference distribution (the normal distribution per its docs).
result = anderson(data)

print('Statistic: %.3f' % result.statistic)

# Walk the significance levels and their critical values in lockstep:
# H0 is kept at a level when the statistic stays below its critical value.
for sl, cv in zip(result.significance_level, result.critical_values):
	if result.statistic < cv:
		print('%.3f: %.3f, data looks normal (fail to reject H0)' % (sl, cv))
	else:
		print('%.3f: %.3f, data does not look normal (reject H0)' % (sl, cv))

# We can interpret the results by failing to reject the null hypothesis
# that the data is normal if the calculated test statistic is less than
# the critical value at a chosen significance level.
	#!/usr/bin/env python3

	from numpy.random import seed
	from numpy.random import randn
	from numpy import mean
	from numpy import std

	import matplotlib.pyplot as plt
	from statsmodels.graphics.gofplots import qqplot
	from scipy.stats import shapiro
	from scipy.stats import normaltest
	from scipy.stats import anderson


	"""
	Normality test: techniques that can be used to check if a
	data sample deviates from a Gaussian distribution.
	"""

	# Seed numpy's global RNG so every run draws the same sample.
	seed(1)

	# Draw 100 standard-normal values, scale them by 5 and shift them by 50.
	data = randn(100) * 5 + 50

	# Report the sample mean and standard deviation (std() with its defaults).
	sample_mean = mean(data)
	sample_std = std(data)
	print('mean=%.3f stdv=%.3f' % (sample_mean, sample_std))

	# Visual check: histogram of the sample.
	plt.hist(data)
	plt.show()

	# Q-Q plot
	"""
	This plot generates its own sample of the idealized distribution
	that we are comparing with. The idealized samples are divided
	into groups called quantiles. Each data point in the sample is
	paired with a similar member from the idealized distribution
	at the same cumulative distribution.

	The resulting points are plotted as a scatter plot with the
	idealized value on the x-axis and the data sample on the y-axis.
	"""

	# Draw the Q-Q plot of the sample; line='s' requests statsmodels'
	# "standardized" reference line (see the qqplot documentation).
	qqplot(data, line='s')
	# Display the Q-Q figure.
	plt.show()

	"""
	The QQ plot shows the scatter plot of points in a diagonal line,
	closely fitting the expected diagonal pattern for a sample
	from a Gaussian distribution.
	"""

	# Shapiro-Wilk Test
	# The function returns both the W-statistic calculated
	# by the test and the p-value.

	# Run the Shapiro-Wilk test; shapiro() returns the W statistic and p-value.
	stat, p = shapiro(data)

	print('Statistics=%.3f, p=%.3f' % (stat, p))

	# interpret: H0 (the sample is Gaussian) is kept when the p-value
	# exceeds the chosen significance level.
	alpha = 0.05
	if p > alpha:
		print('Sample looks Gaussian (fail to reject H0)')
	else:
		print('Sample does not look Gaussian (reject H0)')

	# D’Agostino’s K^2 test

	# Calculates summary statistics from the data
	# (kurtosis and skewness), to determine if the data
	# distribution departs from the normal distribution

	"""
	* Skew: is a quantification of how much a distribution is
	pushed left or right, a measure of asymmetry
	in the distribution.

	* Kurtosis: quantifies how much of the distribution is in the tail.
	"""


	# D'Agostino's K^2 normality test: normaltest() yields the test
	# statistic together with its p-value.
	stat, p = normaltest(data)

	print('Statistics=%.3f, p=%.3f' % (stat, p))

	# interpret: compare the p-value with the significance level.
	alpha = 0.05

	if p > alpha:
		print('Sample looks Gaussian (fail to reject H0)')
	else:
		print('Sample does not look Gaussian (reject H0)')

	# p-value is interpreted against an alpha of 5% and finds
	# that the test dataset does not significantly deviate from normal.

	# Anderson-Darling Test
	"""
	The Anderson-Darling test is a statistical test that can be used to evaluate
	whether a data sample comes from one of several known probability
	distributions. It is a modified version of the Kolmogorov-Smirnov test,
	a nonparametric goodness-of-fit statistical test.

	A feature of the Anderson-Darling test is that it returns a list of
	critical values rather than a single p-value. Critical values in a
	statistical test are a range of pre-defined significance boundaries:
	H0 fails to be rejected at a given significance level if the calculated
	statistic is less than the corresponding critical value.
	"""


	# normality test
	# anderson() is called without a `dist` argument, so scipy applies its
	# default reference distribution (the normal distribution per its docs).
	result = anderson(data)

	print('Statistic: %.3f' % result.statistic)

	# Walk the significance levels and their critical values in lockstep:
	# H0 is kept at a level when the statistic stays below its critical value.
	for sl, cv in zip(result.significance_level, result.critical_values):
		if result.statistic < cv:
			print('%.3f: %.3f, data looks normal (fail to reject H0)' % (sl, cv))
		else:
			print('%.3f: %.3f, data does not look normal (reject H0)' % (sl, cv))

	# We can interpret the results by failing to reject the null hypothesis
	# that the data is normal if the calculated test statistic is less than
	# the critical value at a chosen significance level.