Skip to content

Instantly share code, notes, and snippets.

@marnunez
Last active February 14, 2020 08:19
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save marnunez/6c68248fee14236dab42ef3e46b94116 to your computer and use it in GitHub Desktop.
Save marnunez/6c68248fee14236dab42ef3e46b94116 to your computer and use it in GitHub Desktop.
Normality assessment tools
from scipy.stats import shapiro, normaltest, anderson, norm
from statsmodels.graphics.gofplots import qqplot
import seaborn as sns
from colorama import Fore
import matplotlib.pyplot as plt
import numpy as np
# Configure seaborn's styling once, when this module is imported (module-level
# side effect). NOTE(review): color_codes=True presumably remaps matplotlib's
# one-letter color codes to the seaborn palette — confirm against seaborn docs.
sns.set(color_codes=True)
def check_shapiro(data, alpha=0.05):
    """Run the Shapiro-Wilk normality test on *data*.

    H0 is that the sample comes from a normal distribution. When
    ``p > alpha`` the test FAILS to reject H0, so the sample is treated as
    Gaussian; when ``p <= alpha`` H0 is rejected.

    Args:
        data: Sample passed unchanged to ``scipy.stats.shapiro``.
        alpha: Significance level the p-value is compared against.

    Returns:
        True if ``p > alpha`` (normality not rejected), False otherwise.
    """
    _, p_value = shapiro(data)
    # bool() keeps the result a plain Python bool rather than a numpy scalar.
    return bool(p_value > alpha)
def check_dagostino(data, alpha=0.05):
    """Run D'Agostino's K^2 normality test on *data*.

    H0 is that the sample comes from a normal distribution. When
    ``p > alpha`` the test FAILS to reject H0, so the sample is treated as
    Gaussian; when ``p <= alpha`` H0 is rejected.

    Args:
        data: Sample passed unchanged to ``scipy.stats.normaltest``.
        alpha: Significance level the p-value is compared against.

    Returns:
        True if ``p > alpha`` (normality not rejected), False otherwise.
    """
    _, p_value = normaltest(data)
    # bool() keeps the result a plain Python bool rather than a numpy scalar.
    return bool(p_value > alpha)
def check_anderson(data):
    """Run the Anderson-Darling test of *data* against a normal distribution.

    H0 is that the sample is normal. The test statistic is compared with the
    first three critical values returned by ``scipy.stats.anderson``
    (documented by SciPy as the 15%, 10% and 5% significance levels for
    ``dist="norm"``). If the statistic is below at least one of them, H0 is
    not rejected at that level and the sample is treated as Gaussian.

    Args:
        data: Sample passed unchanged to ``scipy.stats.anderson``.

    Returns:
        True if the statistic is below any of the first three critical
        values, False otherwise.
    """
    result = anderson(data, dist="norm")
    # Only the critical values are needed; the matching significance levels
    # were zipped in by the previous implementation but never used.
    return any(
        result.statistic < critical_value
        for critical_value in result.critical_values[:3]
    )
def is_normal(data):
    """Decide normality by majority vote of the three statistical tests.

    Runs check_shapiro, check_dagostino and check_anderson (in that order,
    each with its default arguments) on *data*.

    Returns:
        True when at least two of the three checks report normality,
        False otherwise.
    """
    verdicts = [
        check(data)
        for check in (check_shapiro, check_dagostino, check_anderson)
    ]
    return verdicts.count(True) >= 2
def analyze_normality(df):
    """Plot and print a normality report for each numeric column of *df*.

    A column is considered numeric when its dtype kind is one of 'biufc'.
    For every such column, with missing values dropped first, this:
      - prints the column name and the number of remaining points,
      - draws a distribution plot with a normal fit next to a Q-Q plot,
      - prints the colored verdict of the Shapiro, D'Agostino and
        Anderson checks.

    Nothing is returned; the output is the figures and the printed text.

    NOTE(review): sns.distplot is reported as deprecated in recent seaborn
    releases — confirm against the installed version before upgrading.
    """
    passed_label = Fore.GREEN + "NORMAL" + Fore.RESET
    failed_label = Fore.RED + "NO NORMAL" + Fore.RESET

    def verdict(passed):
        # Map a boolean test outcome to its colored label.
        if passed:
            return passed_label
        return failed_label

    numeric_columns = [
        name for name in df.columns if df[name].dtype.kind in 'biufc'
    ]
    for name in numeric_columns:
        values = df[name].dropna()
        print(f"{name}: {len(values)} puntos")

        # Left panel: histogram/KDE with normal fit; right panel: Q-Q plot.
        _, (dist_ax, qq_ax) = plt.subplots(1, 2, figsize=(7, 5))
        sns.distplot(values, ax=dist_ax, fit=norm)
        qqplot(values, line="s", ax=qq_ax)
        plt.show()

        # Each test runs only when its line is printed, after the figure.
        print(f"\tSHAPIRO:\t{verdict(check_shapiro(values, 0.05))}")
        print(f"\tD'AGOSTINO:\t{verdict(check_dagostino(values, 0.05))}")
        print(f"\tANDERSON:\t{verdict(check_anderson(values))}")
        print("\n")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment