Linear regression implementation using linear algebra in R
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
### Linear Regression Using lm() ---------------------------------------- | |
data("swiss") | |
dat <- swiss | |
linear_model <- lm(Fertility ~ ., data = dat) | |
summary(linear_model) | |
# Call: | |
# lm(formula = Fertility ~ ., data = dat) | |
# | |
# Residuals: | |
# Min 1Q Median 3Q Max | |
# -15.2743 -5.2617 0.5032 4.1198 15.3213 | |
# | |
# Coefficients: | |
# Estimate Std. Error t value Pr(>|t|) | |
# (Intercept) 66.91518 10.70604 6.250 1.91e-07 *** | |
# Agriculture -0.17211 0.07030 -2.448 0.01873 * | |
# Examination -0.25801 0.25388 -1.016 0.31546 | |
# Education -0.87094 0.18303 -4.758 2.43e-05 *** | |
# Catholic 0.10412 0.03526 2.953 0.00519 ** | |
# Infant.Mortality 1.07705 0.38172 2.822 0.00734 ** | |
# --- | |
# Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 | |
# | |
# Residual standard error: 7.165 on 41 degrees of freedom | |
# Multiple R-squared: 0.7067, Adjusted R-squared: 0.671 | |
# F-statistic: 19.76 on 5 and 41 DF, p-value: 5.594e-10 | |
### Using Linear Algebra ------------------------------------------------ | |
y <- matrix(dat$Fertility, nrow = nrow(dat)) | |
X <- cbind(1, as.matrix(x = dat[,-1])) | |
colnames(X)[1] <- "(Intercept)" | |
# N x k matrix | |
N <- nrow(X) | |
k <- ncol(X) - 1 # number of predictor variables (ergo, excluding Intercept column) | |
# Estimated Regression Coefficients | |
beta_hat <- solve(t(X)%*%X)%*%(t(X)%*%y) | |
# Variance of outcome variable = Variance of residuals | |
sigma_sq <- residual_variance <- (N-k-1)^-1 * sum((y - X %*% beta_hat)^2) | |
residual_std_error <- sqrt(residual_variance) | |
# Variance and Std. Error of estimated coefficients of the linear model | |
var_betaHat <- sigma_sq * solve(t(X) %*% X) | |
coeff_std_errors <- sqrt(diag(var_betaHat)) | |
# t values of estimates are ratio of estimated coefficients to std. errors | |
t_values <- beta_hat / coeff_std_errors | |
# p-values of t-statistics of estimated coefficeints | |
p_values_tstat <- 2 * pt(abs(t_values), N-k, lower.tail = FALSE) | |
# assigning R's significance codes to obtained p-values | |
signif_codes_match <- function(x){ | |
ifelse(x <= 0.001,"***", | |
ifelse(x <= 0.01,"**", | |
ifelse(x < 0.05,"*", | |
ifelse(x < 0.1,"."," ")))) | |
} | |
signif_codes <- sapply(p_values_tstat, signif_codes_match) | |
# R-squared and Adjusted R-squared (refer any econometrics / statistics textbook) | |
R_sq <- 1 - (N-k-1)*residual_variance / (N*mean((y - mean(y))^2)) | |
R_sq_adj <- 1 - residual_variance / ((N/(N-1))*mean((y - mean(y))^2)) | |
# Residual sum of squares (RSS) for the full model | |
RSS <- (N-k-1)*residual_variance | |
# RSS for the partial model with only intercept (equal to mean), ergo, TSS | |
RSS0 <- TSS <- sum((y - mean(y))^2) | |
# F statistic based on RSS for full and partial models | |
# k = degress of freedom of partial model | |
# N - k - 1 = degress of freedom of full model | |
F_stat <- ((RSS0 - RSS)/k) / (RSS/(N-k-1)) | |
# p-values of the F statistic | |
p_value_F_stat <- pf(F_stat, df1 = k, df2 = N-k-1, lower.tail = FALSE) | |
# stitch the main results toghether | |
lm_results <- as.data.frame(cbind(beta_hat, coeff_std_errors, | |
t_values, p_values_tstat, signif_codes)) | |
colnames(lm_results) <- c("Estimate","Std. Error","t value","Pr(>|t|)","") | |
### Print out results of all relevant calcualtions ----------------------- | |
print(lm_results) | |
cat("Residual standard error: ", | |
round(residual_std_error, digits = 3), | |
" on ",N-k-1," degrees of freedom", | |
"\nMultiple R-squared: ",R_sq," Adjusted R-squared: ",R_sq_adj, | |
"\nF-statistic: ",F_stat, " on ",k-1," and ",N-k-1, | |
" DF, p-value: ", p_value_F_stat,"\n") | |
# Estimate Std. Error t value Pr(>|t|) | |
# (Intercept) 66.9151816789654 10.7060375853301 6.25022854119771 1.73336561301153e-07 *** | |
# Agriculture -0.172113970941457 0.0703039231786469 -2.44814177018405 0.0186186100433133 * | |
# Examination -0.258008239834722 0.253878200892098 -1.01626779663678 0.315320687313066 | |
# Education -0.870940062939429 0.183028601571259 -4.75849159892283 2.3228265226988e-05 *** | |
# Catholic 0.104115330743766 0.035257852536169 2.95296858017545 0.00513556154915653 ** | |
# Infant.Mortality 1.07704814069103 0.381719650858061 2.82156849475775 0.00726899472564356 ** | |
# Residual standard error: 7.165 on 41 degrees of freedom | |
# Multiple R-squared: 0.706735 Adjusted R-squared: 0.670971 | |
# F-statistic: 19.76106 on 4 and 41 DF, p-value: 5.593799e-10 | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment