lundquist-ecology-lab/anova.py

## anova.py
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm

from sklearn.datasets import load_iris
from bioinfokit.analys import stat
from statsmodels.formula.api import ols

# Y needs to be continuous and needs to be organized into > two X groups.
iris = load_iris()

df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
df['target'] = iris.target
df['species'] = df['target'].apply(lambda x: iris.target_names[x])

print(df)

# Rename the column of interest so it does not have spaces
df = df.rename(columns={'petal width (cm)': 'petal_width'})


ax = sns.boxplot(x='species', y='petal_width', data=df, color="#99c2a2")
ax = sns.stripplot(x='species', y='petal_width', data=df, color="#7d0013")
plt.show()

## Interested in petal width

setosa = df.loc[df['species'] == "setosa"]
versicolor = df.loc[df['species'] == "versicolor"]
virginica = df.loc[df['species'] == "virginica"]


# For just F and P value
fvalue, pvalue = stats.f_oneway(setosa['petal_width'], versicolor['petal_width'], virginica['petal_width'])
print(fvalue, pvalue)


## ANOVA table
# Ordinary Least Squares model
model = ols('petal_width ~ species', data=df).fit()
anova_table = sm.stats.anova_lm(model, typ=2)
anova_table

# Tukey HSD

res = stat()
res.tukey_hsd(df=df, res_var='petal_width', xfac_var='species', anova_model='petal_width ~ C(species)')
res.tukey_summary

# Check assumptions
# QQ-plot
import statsmodels.api as sm
import matplotlib.pyplot as plt

sm.qqplot(res.anova_std_residuals, line='45')
plt.xlabel("Theoretical Quantiles")
plt.ylabel("Standardized Residuals")
plt.show()

# histogram
plt.hist(res.anova_model_out.resid, bins='auto', histtype='bar', ec='k')
plt.xlabel("Residuals")
plt.ylabel('Frequency')
plt.show()

# Shapiro-Wilk test for normality
w, pvalue = stats.shapiro(model.resid)
print(w, pvalue)

# Bartlett test for homogeneity of variances
w, pvalue = stats.bartlett(setosa['petal_width'], versicolor['petal_width'], virginica['petal_width'])
print(w, pvalue)

## Levene's test for homogeneity of variances
res = stat()
res.levene(df=df, res_var='petal_width', xfac_var='species')
res.levene_summary

# It seems that we should be using the non-parametric ANOVA (Kruskal-Wallace)
	import pandas as pd
	import scipy.stats as stats
	import matplotlib.pyplot as plt
	import seaborn as sns
	import statsmodels.api as sm

	from sklearn.datasets import load_iris
	from bioinfokit.analys import stat
	from statsmodels.formula.api import ols

	# Y needs to be continuous and needs to be organized into > two X groups.
	iris = load_iris()

	df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
	df['target'] = iris.target
	df['species'] = df['target'].apply(lambda x: iris.target_names[x])

	print(df)

	# Rename the column of interest so it does not have spaces
	df = df.rename(columns={'petal width (cm)': 'petal_width'})


	ax = sns.boxplot(x='species', y='petal_width', data=df, color="#99c2a2")
	ax = sns.stripplot(x='species', y='petal_width', data=df, color="#7d0013")
	plt.show()

	## Interested in petal width

	setosa = df.loc[df['species'] == "setosa"]
	versicolor = df.loc[df['species'] == "versicolor"]
	virginica = df.loc[df['species'] == "virginica"]


	# For just F and P value
	fvalue, pvalue = stats.f_oneway(setosa['petal_width'], versicolor['petal_width'], virginica['petal_width'])
	print(fvalue, pvalue)


	## ANOVA table
	# Ordinary Least Squares model
	model = ols('petal_width ~ species', data=df).fit()
	anova_table = sm.stats.anova_lm(model, typ=2)
	anova_table

	# Tukey HSD

	res = stat()
	res.tukey_hsd(df=df, res_var='petal_width', xfac_var='species', anova_model='petal_width ~ C(species)')
	res.tukey_summary

	# Check assumptions
	# QQ-plot
	import statsmodels.api as sm
	import matplotlib.pyplot as plt

	sm.qqplot(res.anova_std_residuals, line='45')
	plt.xlabel("Theoretical Quantiles")
	plt.ylabel("Standardized Residuals")
	plt.show()

	# histogram
	plt.hist(res.anova_model_out.resid, bins='auto', histtype='bar', ec='k')
	plt.xlabel("Residuals")
	plt.ylabel('Frequency')
	plt.show()

	# Shapiro-Wilk test for normality
	w, pvalue = stats.shapiro(model.resid)
	print(w, pvalue)

	# Bartlett test for homogeneity of variances
	w, pvalue = stats.bartlett(setosa['petal_width'], versicolor['petal_width'], virginica['petal_width'])
	print(w, pvalue)

	## Levene's test for homogeneity of variances
	res = stat()
	res.levene(df=df, res_var='petal_width', xfac_var='species')
	res.levene_summary

	# It seems that we should be using the non-parametric ANOVA (Kruskal-Wallace)