anirudhjayaraman/lm_linear_algebra.R

## lm_linear_algebra.R
### Linear Regression Using lm() ----------------------------------------
data("swiss")
dat <- swiss
linear_model <- lm(Fertility ~ ., data = dat)
summary(linear_model)


# Call:
# lm(formula = Fertility ~ ., data = dat)
#
# Residuals:
#      Min       1Q   Median       3Q      Max
# -15.2743  -5.2617   0.5032   4.1198  15.3213
#
# Coefficients:
#                  Estimate Std. Error t value Pr(>|t|)
# (Intercept)      66.91518   10.70604   6.250 1.91e-07 ***
# Agriculture      -0.17211    0.07030  -2.448  0.01873 *
# Examination      -0.25801    0.25388  -1.016  0.31546
# Education        -0.87094    0.18303  -4.758 2.43e-05 ***
# Catholic          0.10412    0.03526   2.953  0.00519 **
# Infant.Mortality  1.07705    0.38172   2.822  0.00734 **
# ---
# Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#
# Residual standard error: 7.165 on 41 degrees of freedom
# Multiple R-squared:  0.7067,	Adjusted R-squared:  0.671
# F-statistic: 19.76 on 5 and 41 DF,  p-value: 5.594e-10


### Using Linear Algebra ------------------------------------------------

y <- matrix(dat$Fertility, nrow = nrow(dat))
X <- cbind(1, as.matrix(x = dat[,-1]))
colnames(X)[1] <- "(Intercept)"

# N x k matrix
N <- nrow(X)
k <- ncol(X) - 1 # number of predictor variables (ergo, excluding Intercept column)

# Estimated Regression Coefficients
beta_hat <- solve(t(X)%*%X)%*%(t(X)%*%y)

# Variance of outcome variable = Variance of residuals
sigma_sq <- residual_variance <- (N-k-1)^-1 * sum((y - X %*% beta_hat)^2)
residual_std_error <- sqrt(residual_variance)

# Variance and Std. Error of estimated coefficients of the linear model
var_betaHat <- sigma_sq * solve(t(X) %*% X)
coeff_std_errors <- sqrt(diag(var_betaHat))

# t values of estimates are ratio of estimated coefficients to std. errors
t_values <- beta_hat / coeff_std_errors

# p-values of t-statistics of estimated coefficeints
p_values_tstat <- 2 * pt(abs(t_values), N-k, lower.tail = FALSE)

# assigning R's significance codes to obtained p-values
signif_codes_match <- function(x){
  ifelse(x <= 0.001,"***",
         ifelse(x <= 0.01,"**",
                ifelse(x < 0.05,"*",
                       ifelse(x < 0.1,"."," "))))

}
signif_codes <- sapply(p_values_tstat, signif_codes_match)

# R-squared and Adjusted R-squared (refer any econometrics / statistics textbook)
R_sq <- 1 - (N-k-1)*residual_variance / (N*mean((y - mean(y))^2))
R_sq_adj <- 1 - residual_variance / ((N/(N-1))*mean((y - mean(y))^2))

# Residual sum of squares (RSS) for the full model
RSS <- (N-k-1)*residual_variance
# RSS for the partial model with only intercept (equal to mean), ergo, TSS
RSS0 <- TSS <- sum((y - mean(y))^2)

# F statistic based on RSS for full and partial models
# k = degress of freedom of partial model
# N - k - 1 = degress of freedom of full model
F_stat <- ((RSS0 - RSS)/k) / (RSS/(N-k-1))

# p-values of the F statistic
p_value_F_stat <- pf(F_stat, df1 = k, df2 = N-k-1, lower.tail = FALSE)

# stitch the main results toghether
lm_results <- as.data.frame(cbind(beta_hat, coeff_std_errors,
                                  t_values, p_values_tstat, signif_codes))
colnames(lm_results) <- c("Estimate","Std. Error","t value","Pr(>|t|)","")


### Print out results of all relevant calcualtions -----------------------


print(lm_results)
cat("Residual standard error: ",
          round(residual_std_error, digits = 3),
          " on ",N-k-1," degrees of freedom",
          "\nMultiple R-squared: ",R_sq," Adjusted R-squared: ",R_sq_adj,
          "\nF-statistic: ",F_stat, " on ",k-1," and ",N-k-1,
          " DF,  p-value: ", p_value_F_stat,"\n")

#                            Estimate         Std. Error           t value             Pr(>|t|)
# (Intercept)        66.9151816789654   10.7060375853301  6.25022854119771 1.73336561301153e-07 ***
# Agriculture      -0.172113970941457 0.0703039231786469 -2.44814177018405   0.0186186100433133   *
# Examination      -0.258008239834722  0.253878200892098 -1.01626779663678    0.315320687313066
# Education        -0.870940062939429  0.183028601571259 -4.75849159892283  2.3228265226988e-05 ***
# Catholic          0.104115330743766  0.035257852536169  2.95296858017545  0.00513556154915653  **
# Infant.Mortality   1.07704814069103  0.381719650858061  2.82156849475775  0.00726899472564356  **

# Residual standard error:  7.165  on  41  degrees of freedom
# Multiple R-squared:  0.706735  Adjusted R-squared:  0.670971
# F-statistic:  19.76106  on  4  and  41  DF,  p-value:  5.593799e-10
	### Linear Regression Using lm() ----------------------------------------
	data("swiss")
	dat <- swiss
	linear_model <- lm(Fertility ~ ., data = dat)
	summary(linear_model)


	# Call:
	# lm(formula = Fertility ~ ., data = dat)
	#
	# Residuals:
	# Min 1Q Median 3Q Max
	# -15.2743 -5.2617 0.5032 4.1198 15.3213
	#
	# Coefficients:
	# Estimate Std. Error t value Pr(>\|t\|)
	# (Intercept) 66.91518 10.70604 6.250 1.91e-07 ***
	# Agriculture -0.17211 0.07030 -2.448 0.01873 *
	# Examination -0.25801 0.25388 -1.016 0.31546
	# Education -0.87094 0.18303 -4.758 2.43e-05 ***
	# Catholic 0.10412 0.03526 2.953 0.00519 **
	# Infant.Mortality 1.07705 0.38172 2.822 0.00734 **
	# ---
	# Signif. codes: 0 ‘*’ 0.001 ‘’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
	#
	# Residual standard error: 7.165 on 41 degrees of freedom
	# Multiple R-squared: 0.7067, Adjusted R-squared: 0.671
	# F-statistic: 19.76 on 5 and 41 DF, p-value: 5.594e-10


	### Using Linear Algebra ------------------------------------------------

	y <- matrix(dat$Fertility, nrow = nrow(dat))
	X <- cbind(1, as.matrix(x = dat[,-1]))
	colnames(X)[1] <- "(Intercept)"

	# N x k matrix
	N <- nrow(X)
	k <- ncol(X) - 1 # number of predictor variables (ergo, excluding Intercept column)

	# Estimated Regression Coefficients
	beta_hat <- solve(t(X)%%X)%%(t(X)%*%y)

	# Variance of outcome variable = Variance of residuals
	sigma_sq <- residual_variance <- (N-k-1)^-1 * sum((y - X %*% beta_hat)^2)
	residual_std_error <- sqrt(residual_variance)

	# Variance and Std. Error of estimated coefficients of the linear model
	var_betaHat <- sigma_sq * solve(t(X) %*% X)
	coeff_std_errors <- sqrt(diag(var_betaHat))

	# t values of estimates are ratio of estimated coefficients to std. errors
	t_values <- beta_hat / coeff_std_errors

	# p-values of t-statistics of estimated coefficeints
	p_values_tstat <- 2 * pt(abs(t_values), N-k, lower.tail = FALSE)

	# assigning R's significance codes to obtained p-values
	signif_codes_match <- function(x){
	ifelse(x <= 0.001,"***",
	ifelse(x <= 0.01,"**",
	ifelse(x < 0.05,"*",
	ifelse(x < 0.1,"."," "))))

	}
	signif_codes <- sapply(p_values_tstat, signif_codes_match)

	# R-squared and Adjusted R-squared (refer any econometrics / statistics textbook)
	R_sq <- 1 - (N-k-1)residual_variance / (Nmean((y - mean(y))^2))
	R_sq_adj <- 1 - residual_variance / ((N/(N-1))*mean((y - mean(y))^2))

	# Residual sum of squares (RSS) for the full model
	RSS <- (N-k-1)*residual_variance
	# RSS for the partial model with only intercept (equal to mean), ergo, TSS
	RSS0 <- TSS <- sum((y - mean(y))^2)

	# F statistic based on RSS for full and partial models
	# k = degress of freedom of partial model
	# N - k - 1 = degress of freedom of full model
	F_stat <- ((RSS0 - RSS)/k) / (RSS/(N-k-1))

	# p-values of the F statistic
	p_value_F_stat <- pf(F_stat, df1 = k, df2 = N-k-1, lower.tail = FALSE)

	# stitch the main results toghether
	lm_results <- as.data.frame(cbind(beta_hat, coeff_std_errors,
	t_values, p_values_tstat, signif_codes))
	colnames(lm_results) <- c("Estimate","Std. Error","t value","Pr(>\|t\|)","")


	### Print out results of all relevant calcualtions -----------------------


	print(lm_results)
	cat("Residual standard error: ",
	round(residual_std_error, digits = 3),
	" on ",N-k-1," degrees of freedom",
	"\nMultiple R-squared: ",R_sq," Adjusted R-squared: ",R_sq_adj,
	"\nF-statistic: ",F_stat, " on ",k-1," and ",N-k-1,
	" DF, p-value: ", p_value_F_stat,"\n")

	# Estimate Std. Error t value Pr(>\|t\|)
	# (Intercept) 66.9151816789654 10.7060375853301 6.25022854119771 1.73336561301153e-07 ***
	# Agriculture -0.172113970941457 0.0703039231786469 -2.44814177018405 0.0186186100433133 *
	# Examination -0.258008239834722 0.253878200892098 -1.01626779663678 0.315320687313066
	# Education -0.870940062939429 0.183028601571259 -4.75849159892283 2.3228265226988e-05 ***
	# Catholic 0.104115330743766 0.035257852536169 2.95296858017545 0.00513556154915653 **
	# Infant.Mortality 1.07704814069103 0.381719650858061 2.82156849475775 0.00726899472564356 **

	# Residual standard error: 7.165 on 41 degrees of freedom
	# Multiple R-squared: 0.706735 Adjusted R-squared: 0.670971
	# F-statistic: 19.76106 on 4 and 41 DF, p-value: 5.593799e-10