Last active
June 30, 2016 07:52
-
-
Save j-min/406a4209edec057b9253bfa936457075 to your computer and use it in GitHub Desktop.
R_custom_ANOVA&Summary
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
ANOVA_SUMMARY = function(formula, data) | |
{ | |
# 1. 데이터 전처리 | |
# 종속변수를 항상 첫번째 input으로 "y~." 형식으로 받음 | |
mf = model.frame(formula, data) | |
y = mf[,1] | |
Response_name = colnames(mf)[1] # 종속변수의 이름 | |
Variable_name = c("(intercept)", colnames(mf)[-1]) # 독립변수의 이름 | |
n = nrow(mf) | |
p = ncol(mf)-1 | |
cat("size of data (n):\t\t",n,"\nnumber of variables (p): \t",p,"\n\n") | |
# 종속변수 자리를 맨 왼쪽열로 데이터 재배치, 1열은 intercept(=1) 으로 변경 | |
X = as.matrix(data.frame("(Intercept)"=1, mf[,-1])) | |
# 2. 회귀계수 추정 | |
X_T_X_inv = solve(t(X) %*% X) # (X_T * X)^-1 | |
beta_hat = X_T_X_inv %*% t(X) %*% y # (X_T * X)^-1 * X_T * Y ; (X_T: X transpose) | |
y_hat = X %*% beta_hat # y의 추정값, y_hat 은 matrix type으로 출력 | |
y_bar = mean(y) | |
error = y - y_hat | |
regression = y_hat - y_bar | |
SSE = sum((error)^2) | |
SSR = sum((regression)^2) | |
SST = SSE + SSR | |
MSE = SSE / (n-p-1) | |
MSR = SSR / p | |
R_squared = SSR/SST | |
Adj_R_squared = 1-( (SSE/(n-p-1)) / (SST/(n-1)) ) | |
sigma_hat = SSE / (n-p-1) # Gauss-Markov Theorem | |
# 3. ANOVA | |
cat("1. ANOVA", "\n\n") | |
#아래와 같은 행/열을 가진 ANOVAtable data.frame 생성 | |
A_row_names = c("Regression", "Residuals", "Total") | |
A_col_names = c("Df", "Sum Sq", "Mean Sq", "F value", "Pr(>F)") | |
# SSR 자유도 = p / SSE 자유도 = n-p-1 / SST 자유도 = n-1 | |
Df = c(p, n-p-1, n-1) | |
Sum_Sq = round(c(SSR, SSE, SST)) | |
Mean_Sq = c(round(MSR), round(MSE), "") | |
F_value = c(round(MSR/MSE,3), "", "") | |
P_value_F = c(round(1-pf(MSR/MSE, df1 = p, df2 = n-p-1),6), "", "") | |
ANOVAtable = data.frame(Df, Sum_Sq, Mean_Sq, F_value, P_value_F) | |
rownames(ANOVAtable) = A_row_names | |
colnames(ANOVAtable) = A_col_names | |
cat("Analysis of Varinace Table", "\n\n") | |
cat("Response: ", Response_name, "\n") | |
print(ANOVAtable) | |
# 4. Summary (Residuals + Coefficients) | |
cat("\n\n2. Summary\n\n") | |
# 4-1. Residuals: | |
cat("Residuals:\n\n") | |
# summary 함수 구현 (기술통계량) | |
Min = min(error) | |
first_Q = quantile(error, probs = 0.25) | |
Median = median(error) | |
third_Q = quantile(error, probs = 0.75) | |
Max = max(error) | |
Residuals = round(data.frame(Min, first_Q, Median, third_Q, Max),3) | |
colnames(Residuals) = c("Min", "1Q", "Median", "3Q", "Max") | |
# n(표본의 개수) 가 10개 이하로 작으면 각 표본의 Residual도 그대로 출력 / 아니면 기술통계량만 출력 | |
if (n<10) { | |
Raw_Residual = round(as.data.frame(t(error)), 3) | |
#colnames(Raw_Residual) = seq(1, n) | |
print(Raw_Residual, row.names=FALSE) | |
cat("\n") | |
} | |
print(Residuals, row.names = FALSE) | |
#4-2. Coefficients | |
cat("\nCoefficients:\n\n") | |
#아래와 같은 행/열을 가진 Coefficient data.frame 생성 | |
C_row_names = c(Variable_name) | |
C_col_names = c("Estimate", "Std. Error", "t value", "Pr(>|t|)") | |
Estimate = round(beta_hat, 5) | |
Std_Error = round(sqrt(sigma_hat * diag(X_T_X_inv)), 5) # Gauss-Markov Thorem에 의해 beta_hat의 공분산 행렬 계산 | |
t_value = round(beta_hat / Std_Error ,3) # 검정통계량에서 beta_zero = 0으로 놓고 계산 | |
p_value = round(2 * (1 - pt(abs(t_value), df = n-p-1)) , 6) | |
Coefficients = data.frame(Estimate, Std_Error, t_value, p_value) | |
rownames(Coefficients) = C_row_names | |
colnames(Coefficients) = C_col_names | |
print(Coefficients) | |
cat("\nResidual standard error: ", round(sqrt(MSE),2), "on", n-p-1, "degrees of freedom\n") | |
cat("Multiple R-squared: ",round(R_squared,4),",\t ", | |
"Adjusted R-squared: ",round(Adj_R_squared,4),"\n") | |
cat("F-statistic: ",F_value[1], "on", p, "and", n-p-1, "DF,", | |
"p-value: \t", P_value_F[1]) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment