Created
July 5, 2023 07:50
-
-
Save doraneko94/04653c0e75cffd93a2d0b21bbe34a548 to your computer and use it in GitHub Desktop.
Python Class for repeated measures two-way ANOVA.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import pandas as pd | |
from scipy import stats | |
class TwoWayAnovaRM: | |
def __init__(self, data: pd.DataFrame, subject, samples: list): | |
self.data = data | |
self.subject = subject | |
self.samples = samples | |
self.subject_list = data[subject].unique() | |
self.n_subject = len(self.subject_list) | |
self.n_samples = len(samples) | |
self.n_col = len(data) | |
self.results_variance = None | |
self.results_all = None | |
self.results_double = None | |
def fit(self): | |
# Variance for All | |
count_all = self.data[self.samples].count().values.sum() | |
sum_all = self.data[self.samples].values.sum() | |
mean_all = sum_all / count_all | |
var_all = np.square(self.data[self.samples] - mean_all).values.sum() / (count_all - 1) | |
std_all = np.sqrt(var_all) | |
se_all = std_all / np.sqrt(count_all) | |
# Variance for Subject | |
count_sub = self.data.groupby(self.subject).count()[self.samples].sum(axis=1).values | |
sum_sub = self.data.groupby(self.subject).sum()[self.samples].sum(axis=1).values | |
mean_sub = sum_sub / count_sub | |
var_sub = np.array([np.square(self.data[self.data[self.subject] == sub][self.samples].values - mean_sub[i]).sum() / (count_sub[i] - 1) for i, sub in enumerate(self.subject_list)]) | |
std_sub = np.sqrt(var_sub) | |
se_sub = std_sub / np.sqrt(count_sub) | |
# Variance for Samples | |
count_sam = self.data[self.samples].count().values | |
sum_sam = self.data[self.samples].sum().values | |
mean_sam = sum_sam / count_sam | |
var_sam = self.data[self.samples].var().values | |
std_sam = np.sqrt(var_sam) | |
se_sam = std_sam / np.sqrt(count_sam) | |
# Variance for Subject x Samples | |
count_cross = np.hstack(self.data.groupby(self.subject).count().values) | |
sum_cross = np.hstack(self.data.groupby(self.subject).sum().values) | |
mean_cross = sum_cross / count_cross | |
var_cross = np.hstack(self.data.groupby(self.subject).var().values) | |
std_cross = np.sqrt(var_cross) | |
se_cross = std_cross / np.sqrt(count_cross) | |
# Columns | |
count_col = self.data[self.samples].count(axis=1) | |
mean_col = self.data[self.samples].mean(axis=1) | |
# Table of results | |
ssd = [np.nan for _ in range(6)] | |
free = [np.nan for _ in range(6)] | |
fval = [np.nan for _ in range(6)] | |
pval = [np.nan for _ in range(6)] | |
p095 = [np.nan for _ in range(6)] | |
# Calculate results | |
ssd[0] = np.square((self.data[self.samples] - mean_all).values).sum() | |
free[0] = count_all - 1 | |
ssd[1] = (mean_sub * sum_sub).sum() - mean_all * sum_all | |
free[1] = self.n_subject - 1 | |
ssd[2] = np.sum([(count_col[self.data[self.subject]==s] * np.square(mean_col[self.data[self.subject]==s] - mean_sub[i])).sum() for i, s in enumerate(self.subject_list)]) | |
free[2] = (self.data.groupby(self.subject).count().max(axis=1).values - 1).sum() | |
ssd[3] = (mean_sam * sum_sam).sum() - mean_all * sum_all | |
free[3] = self.n_samples - 1 | |
S = np.sum([np.square(self.data[self.data[self.subject]==sub][sam] - mean_cross[i*self.n_samples+j]).sum() for i, sub in enumerate(self.subject_list) for j, sam in enumerate(self.samples)]) | |
ssd[4] = ssd[0] - ssd[1] - ssd[3] - S | |
ssd[5] = S - ssd[2] | |
free[4] = (self.n_subject - 1) * (self.n_samples - 1) | |
free[5] = count_all - self.n_subject * self.n_samples - free[2] | |
ms = [ssd[i] / free[i] for i in range(6)] | |
ms[0] = np.nan | |
fval[1] = ms[1] / ms[2] | |
fval[3] = ms[3] / ms[5] | |
fval[4] = ms[4] / ms[5] | |
pval[1] = 1 - stats.f.cdf(fval[1], free[1], free[2]) | |
pval[3] = 1 - stats.f.cdf(fval[3], free[1], free[5]) | |
pval[4] = 1 - stats.f.cdf(fval[4], free[1], free[5]) | |
p095[1] = stats.f.ppf(0.95, free[1], free[2]) | |
p095[3] = stats.f.ppf(0.95, free[3], free[5]) | |
p095[4] = stats.f.ppf(0.95, free[4], free[5]) | |
# Save results | |
self.results_variance = pd.DataFrame({ | |
"Factor": ["All", "INTER_SUB", "INTER_EXP", "INTRA_SUB", "INTERACTION", "ERROR"], | |
"SUM_SQU_DEV": ssd, "DEG_FREE": free, "MEAN_SQU": ms, | |
"F_VALUE": fval, "P_VALUE": pval, "F(0.95)": p095 | |
}) | |
self.results_all = pd.DataFrame( | |
[[count_all, mean_all, var_all, std_all, se_all]], | |
index=["ALL_DATA"], columns=["DATA_NUM", "MEAN", "UNBIASED_VAR", "STD_DEV", "STD_ERR"], | |
) | |
self.results_double = pd.DataFrame() | |
self.results_double["SUBJECT"] = np.hstack(np.array([self.subject_list for _ in range(self.n_samples)]).T) | |
self.results_double["SAMPLE"] = np.hstack([self.samples for _ in range(self.n_subject)]) | |
self.results_double["DATA_NUM"] = count_cross | |
self.results_double["MEAN"] = mean_cross | |
self.results_double["UNBIASED_VAR"] = var_cross | |
self.results_double["STD_DEV"] = std_cross | |
self.results_double["STD_ERR"] = se_cross | |
if __name__ == "__main__": | |
# Example from Japanese book - 4Steps エクセル統計【第4版】 | |
df = pd.DataFrame() | |
df["Rat"] = ["A", "A", "B", "B", "A", "B", "A", "A", "B"] | |
df["1w"] = [1.0, 2.0, 2.0, 1.0, 1.0, 1.8, 1.5, 1.5, 1.0] | |
df["2w"] = [2.0, 5.0, 3.0, 1.0, 1.5, 1.5, 3.0, 1.2, 1.6] | |
df["3w"] = [3.0, 8.0, 2.0, 0.8, 8.0, 1.4, 5.0, 1.8, 3.0] | |
model = TwoWayAnovaRM(data=df, subject="Rat", samples=["1w", "2w", "3w"]) | |
model.fit() | |
print(model.results_variance) | |
# Factor SUM_SQU_DEV DEG_FREE MEAN_SQU F_VALUE P_VALUE F(0.95) | |
# 0 All 97.696296 26 NaN NaN NaN NaN | |
# 1 INTER_SUB 12.300463 1 12.300463 3.355216 0.109668 5.591448 | |
# 2 INTER_EXP 25.662500 7 3.666071 NaN NaN NaN | |
# 3 INTRA_SUB 23.380741 2 11.690370 7.352765 0.016869 3.738892 | |
# 4 INTERACTION 14.093593 2 7.046796 4.432146 0.053806 3.738892 | |
# 5 ERROR 22.259000 14 1.589929 NaN NaN NaN |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment