Skip to content

Instantly share code, notes, and snippets.

@doraneko94
Created July 5, 2023 07:50
Show Gist options
  • Save doraneko94/04653c0e75cffd93a2d0b21bbe34a548 to your computer and use it in GitHub Desktop.
Save doraneko94/04653c0e75cffd93a2d0b21bbe34a548 to your computer and use it in GitHub Desktop.
Python Class for repeated measures two-way ANOVA.
import numpy as np
import pandas as pd
from scipy import stats
class TwoWayAnovaRM:
def __init__(self, data: pd.DataFrame, subject, samples: list):
self.data = data
self.subject = subject
self.samples = samples
self.subject_list = data[subject].unique()
self.n_subject = len(self.subject_list)
self.n_samples = len(samples)
self.n_col = len(data)
self.results_variance = None
self.results_all = None
self.results_double = None
def fit(self):
# Variance for All
count_all = self.data[self.samples].count().values.sum()
sum_all = self.data[self.samples].values.sum()
mean_all = sum_all / count_all
var_all = np.square(self.data[self.samples] - mean_all).values.sum() / (count_all - 1)
std_all = np.sqrt(var_all)
se_all = std_all / np.sqrt(count_all)
# Variance for Subject
count_sub = self.data.groupby(self.subject).count()[self.samples].sum(axis=1).values
sum_sub = self.data.groupby(self.subject).sum()[self.samples].sum(axis=1).values
mean_sub = sum_sub / count_sub
var_sub = np.array([np.square(self.data[self.data[self.subject] == sub][self.samples].values - mean_sub[i]).sum() / (count_sub[i] - 1) for i, sub in enumerate(self.subject_list)])
std_sub = np.sqrt(var_sub)
se_sub = std_sub / np.sqrt(count_sub)
# Variance for Samples
count_sam = self.data[self.samples].count().values
sum_sam = self.data[self.samples].sum().values
mean_sam = sum_sam / count_sam
var_sam = self.data[self.samples].var().values
std_sam = np.sqrt(var_sam)
se_sam = std_sam / np.sqrt(count_sam)
# Variance for Subject x Samples
count_cross = np.hstack(self.data.groupby(self.subject).count().values)
sum_cross = np.hstack(self.data.groupby(self.subject).sum().values)
mean_cross = sum_cross / count_cross
var_cross = np.hstack(self.data.groupby(self.subject).var().values)
std_cross = np.sqrt(var_cross)
se_cross = std_cross / np.sqrt(count_cross)
# Columns
count_col = self.data[self.samples].count(axis=1)
mean_col = self.data[self.samples].mean(axis=1)
# Table of results
ssd = [np.nan for _ in range(6)]
free = [np.nan for _ in range(6)]
fval = [np.nan for _ in range(6)]
pval = [np.nan for _ in range(6)]
p095 = [np.nan for _ in range(6)]
# Calculate results
ssd[0] = np.square((self.data[self.samples] - mean_all).values).sum()
free[0] = count_all - 1
ssd[1] = (mean_sub * sum_sub).sum() - mean_all * sum_all
free[1] = self.n_subject - 1
ssd[2] = np.sum([(count_col[self.data[self.subject]==s] * np.square(mean_col[self.data[self.subject]==s] - mean_sub[i])).sum() for i, s in enumerate(self.subject_list)])
free[2] = (self.data.groupby(self.subject).count().max(axis=1).values - 1).sum()
ssd[3] = (mean_sam * sum_sam).sum() - mean_all * sum_all
free[3] = self.n_samples - 1
S = np.sum([np.square(self.data[self.data[self.subject]==sub][sam] - mean_cross[i*self.n_samples+j]).sum() for i, sub in enumerate(self.subject_list) for j, sam in enumerate(self.samples)])
ssd[4] = ssd[0] - ssd[1] - ssd[3] - S
ssd[5] = S - ssd[2]
free[4] = (self.n_subject - 1) * (self.n_samples - 1)
free[5] = count_all - self.n_subject * self.n_samples - free[2]
ms = [ssd[i] / free[i] for i in range(6)]
ms[0] = np.nan
fval[1] = ms[1] / ms[2]
fval[3] = ms[3] / ms[5]
fval[4] = ms[4] / ms[5]
pval[1] = 1 - stats.f.cdf(fval[1], free[1], free[2])
pval[3] = 1 - stats.f.cdf(fval[3], free[1], free[5])
pval[4] = 1 - stats.f.cdf(fval[4], free[1], free[5])
p095[1] = stats.f.ppf(0.95, free[1], free[2])
p095[3] = stats.f.ppf(0.95, free[3], free[5])
p095[4] = stats.f.ppf(0.95, free[4], free[5])
# Save results
self.results_variance = pd.DataFrame({
"Factor": ["All", "INTER_SUB", "INTER_EXP", "INTRA_SUB", "INTERACTION", "ERROR"],
"SUM_SQU_DEV": ssd, "DEG_FREE": free, "MEAN_SQU": ms,
"F_VALUE": fval, "P_VALUE": pval, "F(0.95)": p095
})
self.results_all = pd.DataFrame(
[[count_all, mean_all, var_all, std_all, se_all]],
index=["ALL_DATA"], columns=["DATA_NUM", "MEAN", "UNBIASED_VAR", "STD_DEV", "STD_ERR"],
)
self.results_double = pd.DataFrame()
self.results_double["SUBJECT"] = np.hstack(np.array([self.subject_list for _ in range(self.n_samples)]).T)
self.results_double["SAMPLE"] = np.hstack([self.samples for _ in range(self.n_subject)])
self.results_double["DATA_NUM"] = count_cross
self.results_double["MEAN"] = mean_cross
self.results_double["UNBIASED_VAR"] = var_cross
self.results_double["STD_DEV"] = std_cross
self.results_double["STD_ERR"] = se_cross
if __name__ == "__main__":
# Example from Japanese book - 4Steps エクセル統計【第4版】
df = pd.DataFrame()
df["Rat"] = ["A", "A", "B", "B", "A", "B", "A", "A", "B"]
df["1w"] = [1.0, 2.0, 2.0, 1.0, 1.0, 1.8, 1.5, 1.5, 1.0]
df["2w"] = [2.0, 5.0, 3.0, 1.0, 1.5, 1.5, 3.0, 1.2, 1.6]
df["3w"] = [3.0, 8.0, 2.0, 0.8, 8.0, 1.4, 5.0, 1.8, 3.0]
model = TwoWayAnovaRM(data=df, subject="Rat", samples=["1w", "2w", "3w"])
model.fit()
print(model.results_variance)
# Factor SUM_SQU_DEV DEG_FREE MEAN_SQU F_VALUE P_VALUE F(0.95)
# 0 All 97.696296 26 NaN NaN NaN NaN
# 1 INTER_SUB 12.300463 1 12.300463 3.355216 0.109668 5.591448
# 2 INTER_EXP 25.662500 7 3.666071 NaN NaN NaN
# 3 INTRA_SUB 23.380741 2 11.690370 7.352765 0.016869 3.738892
# 4 INTERACTION 14.093593 2 7.046796 4.432146 0.053806 3.738892
# 5 ERROR 22.259000 14 1.589929 NaN NaN NaN
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment