Skip to content

Instantly share code, notes, and snippets.

@j-min
Last active June 30, 2016 07:52
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save j-min/406a4209edec057b9253bfa936457075 to your computer and use it in GitHub Desktop.
Save j-min/406a4209edec057b9253bfa936457075 to your computer and use it in GitHub Desktop.
R_custom_ANOVA&Summary
ANOVA_SUMMARY = function(formula, data)
{
# 1. 데이터 전처리
# 종속변수를 항상 첫번째 input으로 "y~." 형식으로 받음
mf = model.frame(formula, data)
y = mf[,1]
Response_name = colnames(mf)[1] # 종속변수의 이름
Variable_name = c("(intercept)", colnames(mf)[-1]) # 독립변수의 이름
n = nrow(mf)
p = ncol(mf)-1
cat("size of data (n):\t\t",n,"\nnumber of variables (p): \t",p,"\n\n")
# 종속변수 자리를 맨 왼쪽열로 데이터 재배치, 1열은 intercept(=1) 으로 변경
X = as.matrix(data.frame("(Intercept)"=1, mf[,-1]))
# 2. 회귀계수 추정
X_T_X_inv = solve(t(X) %*% X) # (X_T * X)^-1
beta_hat = X_T_X_inv %*% t(X) %*% y # (X_T * X)^-1 * X_T * Y ; (X_T: X transpose)
y_hat = X %*% beta_hat # y의 추정값, y_hat 은 matrix type으로 출력
y_bar = mean(y)
error = y - y_hat
regression = y_hat - y_bar
SSE = sum((error)^2)
SSR = sum((regression)^2)
SST = SSE + SSR
MSE = SSE / (n-p-1)
MSR = SSR / p
R_squared = SSR/SST
Adj_R_squared = 1-( (SSE/(n-p-1)) / (SST/(n-1)) )
sigma_hat = SSE / (n-p-1) # Gauss-Markov Theorem
# 3. ANOVA
cat("1. ANOVA", "\n\n")
#아래와 같은 행/열을 가진 ANOVAtable data.frame 생성
A_row_names = c("Regression", "Residuals", "Total")
A_col_names = c("Df", "Sum Sq", "Mean Sq", "F value", "Pr(>F)")
# SSR 자유도 = p / SSE 자유도 = n-p-1 / SST 자유도 = n-1
Df = c(p, n-p-1, n-1)
Sum_Sq = round(c(SSR, SSE, SST))
Mean_Sq = c(round(MSR), round(MSE), "")
F_value = c(round(MSR/MSE,3), "", "")
P_value_F = c(round(1-pf(MSR/MSE, df1 = p, df2 = n-p-1),6), "", "")
ANOVAtable = data.frame(Df, Sum_Sq, Mean_Sq, F_value, P_value_F)
rownames(ANOVAtable) = A_row_names
colnames(ANOVAtable) = A_col_names
cat("Analysis of Varinace Table", "\n\n")
cat("Response: ", Response_name, "\n")
print(ANOVAtable)
# 4. Summary (Residuals + Coefficients)
cat("\n\n2. Summary\n\n")
# 4-1. Residuals:
cat("Residuals:\n\n")
# summary 함수 구현 (기술통계량)
Min = min(error)
first_Q = quantile(error, probs = 0.25)
Median = median(error)
third_Q = quantile(error, probs = 0.75)
Max = max(error)
Residuals = round(data.frame(Min, first_Q, Median, third_Q, Max),3)
colnames(Residuals) = c("Min", "1Q", "Median", "3Q", "Max")
# n(표본의 개수) 가 10개 이하로 작으면 각 표본의 Residual도 그대로 출력 / 아니면 기술통계량만 출력
if (n<10) {
Raw_Residual = round(as.data.frame(t(error)), 3)
#colnames(Raw_Residual) = seq(1, n)
print(Raw_Residual, row.names=FALSE)
cat("\n")
}
print(Residuals, row.names = FALSE)
#4-2. Coefficients
cat("\nCoefficients:\n\n")
#아래와 같은 행/열을 가진 Coefficient data.frame 생성
C_row_names = c(Variable_name)
C_col_names = c("Estimate", "Std. Error", "t value", "Pr(>|t|)")
Estimate = round(beta_hat, 5)
Std_Error = round(sqrt(sigma_hat * diag(X_T_X_inv)), 5) # Gauss-Markov Thorem에 의해 beta_hat의 공분산 행렬 계산
t_value = round(beta_hat / Std_Error ,3) # 검정통계량에서 beta_zero = 0으로 놓고 계산
p_value = round(2 * (1 - pt(abs(t_value), df = n-p-1)) , 6)
Coefficients = data.frame(Estimate, Std_Error, t_value, p_value)
rownames(Coefficients) = C_row_names
colnames(Coefficients) = C_col_names
print(Coefficients)
cat("\nResidual standard error: ", round(sqrt(MSE),2), "on", n-p-1, "degrees of freedom\n")
cat("Multiple R-squared: ",round(R_squared,4),",\t ",
"Adjusted R-squared: ",round(Adj_R_squared,4),"\n")
cat("F-statistic: ",F_value[1], "on", p, "and", n-p-1, "DF,",
"p-value: \t", P_value_F[1])
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment