aschleg/multiple_regression_two_predictors.R

## multiple_regression_two_predictors.R
# Simple function for fitting a regression line with two predictor variables.
# Function takes two arguments:
#   x must be a matrix or dataframe with only two columns (variables)
#   y must be a vector containing the response values of the same length
# Output is similar to the lm() function
# Function used as a demonstration in post on multiple regression here: http://wp.me/p4aZEo-5Lh

two.predictor.regression <- function(x, y) {
  y <- as.matrix(y)
  if (ncol(y) > 1) {
    stop('y must be a vector')
  }

  x <- data.frame(x)
  if (ncol(x) != 2) {
    stop('x must have only two predictor variables')
  }

  x$intercept <- 1 # Add the intercept term to the X matrix
  col <- grep('intercept', names(x))
  x <- as.matrix(x[, c(col, (1:ncol(x))[-col])]) # Put the intercept term first in the X matrix
  n <- length(y)
  if (n != length(x[,1])) {
    stop('x and y must be the same length')
  }

  # Degrees of freedom. As only two predictor variables and one response variable are considered, the degrees of freedom will always be 3
  df <- 3
  x.crossprod <- crossprod(x) # Calculate crossproduct X'X
  y.crossprod <- crossprod(y) # Calculate crossproduct Y'Y
  xy.crossprod <- crossprod(x, y) # crossproduct X'Y

  x.inv <- solve(x.crossprod) # Find inverse of X'X
  b <- x.inv %*% xy.crossprod # Get b parameters by multiplying the inverse of X'X and X'Y

  fitted.y <- x %*% b # Find the fitted response values
  resid.mat <- y - fitted.y # Find the residual values

  min.resid <- min(resid.mat)
  max.resid <- max(resid.mat)
  med.resid <- median(resid.mat)
  Q1.resid <- quantile(resid.mat, .25)
  Q3.resid <- quantile(resid.mat, .75)

  resid.df <- data.frame(round(cbind(min.resid, Q1.resid, med.resid, Q3.resid, max.resid), 3), row.names = '')
  colnames(resid.df) <- c('Min', '1Q', 'Median', '3Q', 'Max')

  sse <- y.crossprod - crossprod(b, xy.crossprod) # SSE
  mse <- sse / (n - df) # MSE

  sst <- y.crossprod - sum(y)^2 / n # SST

  ssr <- sst - sse # SSR
  msr <- ssr / 2 # MSR

  f.stat <- msr / mse # Critical F-statistic
  p.val <- pf(f.stat, df1 = 2, df2 = n - df, lower.tail = FALSE) # Computed p-value
  R2 <- 1 - sse / sst # R-squared
  R2.adj <- 1 - ((n - 1) / (n - df)) * sse / sst # adjusted R-squared

  s2b <- mse[1] * x.inv # Estimate variance-covariance matrix of regression parameters

  # Get the parameter standard errors
  std.err <- suppressWarnings(matrix(sqrt(s2b)[!is.na(sqrt(s2b))], nrow = 3, ncol=1))
  t.values <- b / std.err # Critical t-values of regression parameters
  p.values <- 2 * pt(t.values, df = n - df, lower.tail = FALSE)
  res.std.err <- sqrt(mse)

  # The following collects and returns all the results found above into a list
  coef <- data.frame(cbind(round(b,3), round(std.err,3), round(t.values,3), format(p.values, digits = 3)))
  colnames(coef) <- c('Estimate', 'Std. Error', 't value', 'Pr(>|t|)')

  resids <- paste('Residual standard error:', round(res.std.err, 3), 'on', length(delivery$n.prod - df), ' degrees of freedom')
  R.squared <- paste('Multiple R-squared: ', round(R2,4), 'Adjusted R-squared: ', round(R2.adj,4))
  f.stat <- paste('F-statistic: ', round(f.stat, 1), 'on 2 and', length(delivery$n.prod - df), ' DF')
  p <- paste('p-value: ', format(p.val, digits = 4, scientific = TRUE))

  res <- list('Residuals'=resid.df, 'Coefficients'=coef, resids, R.squared, f.stat, p)
  return(res)
}
	# Simple function for fitting a regression line with two predictor variables.
	# Function takes two arguments:
	# x must be a matrix or dataframe with only two columns (variables)
	# y must be a vector containing the response values of the same length
	# Output is similar to the lm() function
	# Function used as a demonstration in post on multiple regression here: http://wp.me/p4aZEo-5Lh

	two.predictor.regression <- function(x, y) {
	y <- as.matrix(y)
	if (ncol(y) > 1) {
	stop('y must be a vector')
	}

	x <- data.frame(x)
	if (ncol(x) != 2) {
	stop('x must have only two predictor variables')
	}

	x$intercept <- 1 # Add the intercept term to the X matrix
	col <- grep('intercept', names(x))
	x <- as.matrix(x[, c(col, (1:ncol(x))[-col])]) # Put the intercept term first in the X matrix
	n <- length(y)
	if (n != length(x[,1])) {
	stop('x and y must be the same length')
	}

	# Degrees of freedom. As only two predictor variables and one response variable are considered, the degrees of freedom will always be 3
	df <- 3
	x.crossprod <- crossprod(x) # Calculate crossproduct X'X
	y.crossprod <- crossprod(y) # Calculate crossproduct Y'Y
	xy.crossprod <- crossprod(x, y) # crossproduct X'Y

	x.inv <- solve(x.crossprod) # Find inverse of X'X
	b <- x.inv %*% xy.crossprod # Get b parameters by multiplying the inverse of X'X and X'Y

	fitted.y <- x %*% b # Find the fitted response values
	resid.mat <- y - fitted.y # Find the residual values

	min.resid <- min(resid.mat)
	max.resid <- max(resid.mat)
	med.resid <- median(resid.mat)
	Q1.resid <- quantile(resid.mat, .25)
	Q3.resid <- quantile(resid.mat, .75)

	resid.df <- data.frame(round(cbind(min.resid, Q1.resid, med.resid, Q3.resid, max.resid), 3), row.names = '')
	colnames(resid.df) <- c('Min', '1Q', 'Median', '3Q', 'Max')

	sse <- y.crossprod - crossprod(b, xy.crossprod) # SSE
	mse <- sse / (n - df) # MSE

	sst <- y.crossprod - sum(y)^2 / n # SST

	ssr <- sst - sse # SSR
	msr <- ssr / 2 # MSR

	f.stat <- msr / mse # Critical F-statistic
	p.val <- pf(f.stat, df1 = 2, df2 = n - df, lower.tail = FALSE) # Computed p-value
	R2 <- 1 - sse / sst # R-squared
	R2.adj <- 1 - ((n - 1) / (n - df)) * sse / sst # adjusted R-squared

	s2b <- mse[1] * x.inv # Estimate variance-covariance matrix of regression parameters

	# Get the parameter standard errors
	std.err <- suppressWarnings(matrix(sqrt(s2b)[!is.na(sqrt(s2b))], nrow = 3, ncol=1))
	t.values <- b / std.err # Critical t-values of regression parameters
	p.values <- 2 * pt(t.values, df = n - df, lower.tail = FALSE)
	res.std.err <- sqrt(mse)

	# The following collects and returns all the results found above into a list
	coef <- data.frame(cbind(round(b,3), round(std.err,3), round(t.values,3), format(p.values, digits = 3)))
	colnames(coef) <- c('Estimate', 'Std. Error', 't value', 'Pr(>\|t\|)')

	resids <- paste('Residual standard error:', round(res.std.err, 3), 'on', length(delivery$n.prod - df), ' degrees of freedom')
	R.squared <- paste('Multiple R-squared: ', round(R2,4), 'Adjusted R-squared: ', round(R2.adj,4))
	f.stat <- paste('F-statistic: ', round(f.stat, 1), 'on 2 and', length(delivery$n.prod - df), ' DF')
	p <- paste('p-value: ', format(p.val, digits = 4, scientific = TRUE))

	res <- list('Residuals'=resid.df, 'Coefficients'=coef, resids, R.squared, f.stat, p)
	return(res)
	}