Last active
January 15, 2023 01:20
-
-
Save lundquist-ecology-lab/a5c65393e0c49beeb23c9ce572646a93 to your computer and use it in GitHub Desktop.
Analysis of variance (ANOVA) using the Iris data set in Python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import scipy.stats as stats | |
import matplotlib.pyplot as plt | |
import seaborn as sns | |
import statsmodels.api as sm | |
from sklearn.datasets import load_iris | |
from bioinfokit.analys import stat | |
from statsmodels.formula.api import ols | |
# Y needs to be continuous and needs to be organized into > two X groups. | |
iris = load_iris() | |
df = pd.DataFrame(data=iris.data, columns=iris.feature_names) | |
df['target'] = iris.target | |
df['species'] = df['target'].apply(lambda x: iris.target_names[x]) | |
print(df) | |
# Rename the column of interest so it does not have spaces | |
df = df.rename(columns={'petal width (cm)': 'petal_width'}) | |
ax = sns.boxplot(x='species', y='petal_width', data=df, color="#99c2a2") | |
ax = sns.stripplot(x='species', y='petal_width', data=df, color="#7d0013") | |
plt.show() | |
## Interested in petal width | |
setosa = df.loc[df['species'] == "setosa"] | |
versicolor = df.loc[df['species'] == "versicolor"] | |
virginica = df.loc[df['species'] == "virginica"] | |
# For just F and P value | |
fvalue, pvalue = stats.f_oneway(setosa['petal_width'], versicolor['petal_width'], virginica['petal_width']) | |
print(fvalue, pvalue) | |
## ANOVA table | |
# Ordinary Least Squares model | |
model = ols('petal_width ~ species', data=df).fit() | |
anova_table = sm.stats.anova_lm(model, typ=2) | |
anova_table | |
# Tukey HSD | |
res = stat() | |
res.tukey_hsd(df=df, res_var='petal_width', xfac_var='species', anova_model='petal_width ~ C(species)') | |
res.tukey_summary | |
# Check assumptions | |
# QQ-plot | |
import statsmodels.api as sm | |
import matplotlib.pyplot as plt | |
sm.qqplot(res.anova_std_residuals, line='45') | |
plt.xlabel("Theoretical Quantiles") | |
plt.ylabel("Standardized Residuals") | |
plt.show() | |
# histogram | |
plt.hist(res.anova_model_out.resid, bins='auto', histtype='bar', ec='k') | |
plt.xlabel("Residuals") | |
plt.ylabel('Frequency') | |
plt.show() | |
# Shapiro-Wilk test for normality | |
w, pvalue = stats.shapiro(model.resid) | |
print(w, pvalue) | |
# Bartlett test for homogeneity of variances | |
w, pvalue = stats.bartlett(setosa['petal_width'], versicolor['petal_width'], virginica['petal_width']) | |
print(w, pvalue) | |
## Levene's test for homogeneity of variances | |
res = stat() | |
res.levene(df=df, res_var='petal_width', xfac_var='species') | |
res.levene_summary | |
# It seems that we should be using the non-parametric ANOVA (Kruskal-Wallace) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment