Skip to content

Instantly share code, notes, and snippets.

@lundquist-ecology-lab
Last active January 15, 2023 01:20
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save lundquist-ecology-lab/a5c65393e0c49beeb23c9ce572646a93 to your computer and use it in GitHub Desktop.
Save lundquist-ecology-lab/a5c65393e0c49beeb23c9ce572646a93 to your computer and use it in GitHub Desktop.
Analysis of variance (ANOVA) using the Iris data set in Python
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from sklearn.datasets import load_iris
from bioinfokit.analys import stat
from statsmodels.formula.api import ols
# Y needs to be continuous and needs to be organized into > two X groups.
iris = load_iris()
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
df['target'] = iris.target
df['species'] = df['target'].apply(lambda x: iris.target_names[x])
print(df)
# Rename the column of interest so it does not have spaces
df = df.rename(columns={'petal width (cm)': 'petal_width'})
ax = sns.boxplot(x='species', y='petal_width', data=df, color="#99c2a2")
ax = sns.stripplot(x='species', y='petal_width', data=df, color="#7d0013")
plt.show()
## Interested in petal width
setosa = df.loc[df['species'] == "setosa"]
versicolor = df.loc[df['species'] == "versicolor"]
virginica = df.loc[df['species'] == "virginica"]
# For just F and P value
fvalue, pvalue = stats.f_oneway(setosa['petal_width'], versicolor['petal_width'], virginica['petal_width'])
print(fvalue, pvalue)
## ANOVA table
# Ordinary Least Squares model
model = ols('petal_width ~ species', data=df).fit()
anova_table = sm.stats.anova_lm(model, typ=2)
anova_table
# Tukey HSD
res = stat()
res.tukey_hsd(df=df, res_var='petal_width', xfac_var='species', anova_model='petal_width ~ C(species)')
res.tukey_summary
# Check assumptions
# QQ-plot
import statsmodels.api as sm
import matplotlib.pyplot as plt
sm.qqplot(res.anova_std_residuals, line='45')
plt.xlabel("Theoretical Quantiles")
plt.ylabel("Standardized Residuals")
plt.show()
# histogram
plt.hist(res.anova_model_out.resid, bins='auto', histtype='bar', ec='k')
plt.xlabel("Residuals")
plt.ylabel('Frequency')
plt.show()
# Shapiro-Wilk test for normality
w, pvalue = stats.shapiro(model.resid)
print(w, pvalue)
# Bartlett test for homogeneity of variances
w, pvalue = stats.bartlett(setosa['petal_width'], versicolor['petal_width'], virginica['petal_width'])
print(w, pvalue)
## Levene's test for homogeneity of variances
res = stat()
res.levene(df=df, res_var='petal_width', xfac_var='species')
res.levene_summary
# It seems that we should be using the non-parametric ANOVA (Kruskal-Wallace)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment