aschleg/simple_linear_regression.R

## simple_linear_regression.R
# Quick function for performing linear regression with only one predictor variable.
# Takes two arguments, x and y. Variables must have same length.
# Output will be similar to output of lm() with one predictor.
# Gist created as part of post on linear regression:  http://www.aaronschlegel.com/notebook/simple-linear-regression-models-r/

simple.linear.coef <- function(x, y) {
  # Find length of x to get sample size. Assuming x and y have the same sample size.
  n <- length(x)

  if (n != length(y))
    stop('variables are not the same size')

  # Calculate the error statistics Sxx, Syy, and Sxy
  sxx <- sum(x^2) - sum(x)^2 / n
  syy <- sum(y^2) - sum(y)^2 / n
  sxy <- sum(x * y) - (sum(x) * sum(y)) / n
  # Coefficients beta0 and beta1
  b1 <- sxy / sxx
  b0 <- mean(y) - b1 * mean(x)
  # Sum of standard error and Mean Standard Error
  sse <- syy - sxy^2 / sxx
  mse <- sse / (n - 2)
  # Standard error beta0 and beta1
  b1.err <- sqrt(mse) / sqrt(sxx)
  b0.err <- sqrt(mse) / sqrt(n) * sqrt(1 + (mean(x)^2 / (sum((x - mean(x))^2) / n)))
  # beta0 and beta1 t-values
  b0.t <- (b0 - 0) / b0.err
  b1.t <- (b1 - 0) / b1.err
  # p-values of beta0 and beta1
  b0.p <- 2 * pt(b0.t, df = n - 2)
  b1.p <- 2 * pt(b1.t, df = n - 2, lower.tail = FALSE)
  # Coefficient of determination R-squared
  r2 <- (syy - sse) /syy
  # R-squared adjusted
  r2.adj <- r2 - (1 - r2) * ((2 - 1) / (length(y) - 2))

  rsquare <- paste('Multiple R-squared: ', round(r2, 4), ', Adjusted R-squared: ', round(r2.adj, 4))

  coeffs <- data.frame(cbind(c(b0, b1), c(b0.err, b1.err), c(b0.t, b1.t), c(b0.p, b1.p)))
  colnames(coeffs) <- c('Estimate', 'Std. Error', 't value', 'Pr(>|t|)')
  rownames(coeffs) <- c('Intercept', 'x1')

  # Fit the line to the data with beta0 and beta1 found above
  fitted <- x * b1 + b0

  # The F-Statistic
  msr <- sum((fitted - mean(y))^2) / 1
  mse2 <- sum((y - fitted)^2) / (length(y) - 2)
  f <- msr / mse2
  # p-value
  p <- pf(f, 1, length(y) - 2, lower.tail = FALSE)

  f.stat <- paste('F-statistic: ', round(f, 2), ' on 1 and ', n - 2, ' DF, p-value: ', format(p, digits = 3, scientific = TRUE))
  # Calculate and find summary statistics of the residuals
  resd <- y - fitted
  min.res <- round(min(resd), 3)
  max.res <- round(max(resd), 3)
  q1.q3 <- quantile(resd, probs = c(.25, .75)) # 1st and 3rd quartiles of the residuals
  med <- round(median(resd), 3)
  residual <- data.frame(cbind(min.res, round(q1.q3[1], 3), med, round(q1.q3[2], 3), max.res))
  colnames(residual) <- c('Min', 'Q1', 'Median', 'Q3', 'Max')
  resdi <- paste('Residual standard error: ', round(sqrt(mse2), 2), ' on ', n - 2, ' degrees of freedom')
  regres <- list('Residuals'=residual, 'Coefficients'=coeffs, resdi, rsquare, f.stat)

  return(regres)
}
	# Quick function for performing linear regression with only one predictor variable.
	# Takes two arguments, x and y. Variables must have same length.
	# Output will be similar to output of lm() with one predictor.
	# Gist created as part of post on linear regression: http://www.aaronschlegel.com/notebook/simple-linear-regression-models-r/

	simple.linear.coef <- function(x, y) {
	# Find length of x to get sample size. Assuming x and y have the same sample size.
	n <- length(x)

	if (n != length(y))
	stop('variables are not the same size')

	# Calculate the error statistics Sxx, Syy, and Sxy
	sxx <- sum(x^2) - sum(x)^2 / n
	syy <- sum(y^2) - sum(y)^2 / n
	sxy <- sum(x * y) - (sum(x) * sum(y)) / n
	# Coefficients beta0 and beta1
	b1 <- sxy / sxx
	b0 <- mean(y) - b1 * mean(x)
	# Sum of standard error and Mean Standard Error
	sse <- syy - sxy^2 / sxx
	mse <- sse / (n - 2)
	# Standard error beta0 and beta1
	b1.err <- sqrt(mse) / sqrt(sxx)
	b0.err <- sqrt(mse) / sqrt(n) * sqrt(1 + (mean(x)^2 / (sum((x - mean(x))^2) / n)))
	# beta0 and beta1 t-values
	b0.t <- (b0 - 0) / b0.err
	b1.t <- (b1 - 0) / b1.err
	# p-values of beta0 and beta1
	b0.p <- 2 * pt(b0.t, df = n - 2)
	b1.p <- 2 * pt(b1.t, df = n - 2, lower.tail = FALSE)
	# Coefficient of determination R-squared
	r2 <- (syy - sse) /syy
	# R-squared adjusted
	r2.adj <- r2 - (1 - r2) * ((2 - 1) / (length(y) - 2))

	rsquare <- paste('Multiple R-squared: ', round(r2, 4), ', Adjusted R-squared: ', round(r2.adj, 4))

	coeffs <- data.frame(cbind(c(b0, b1), c(b0.err, b1.err), c(b0.t, b1.t), c(b0.p, b1.p)))
	colnames(coeffs) <- c('Estimate', 'Std. Error', 't value', 'Pr(>\|t\|)')
	rownames(coeffs) <- c('Intercept', 'x1')

	# Fit the line to the data with beta0 and beta1 found above
	fitted <- x * b1 + b0

	# The F-Statistic
	msr <- sum((fitted - mean(y))^2) / 1
	mse2 <- sum((y - fitted)^2) / (length(y) - 2)
	f <- msr / mse2
	# p-value
	p <- pf(f, 1, length(y) - 2, lower.tail = FALSE)

	f.stat <- paste('F-statistic: ', round(f, 2), ' on 1 and ', n - 2, ' DF, p-value: ', format(p, digits = 3, scientific = TRUE))
	# Calculate and find summary statistics of the residuals
	resd <- y - fitted
	min.res <- round(min(resd), 3)
	max.res <- round(max(resd), 3)
	q1.q3 <- quantile(resd, probs = c(.25, .75)) # 1st and 3rd quartiles of the residuals
	med <- round(median(resd), 3)
	residual <- data.frame(cbind(min.res, round(q1.q3[1], 3), med, round(q1.q3[2], 3), max.res))
	colnames(residual) <- c('Min', 'Q1', 'Median', 'Q3', 'Max')
	resdi <- paste('Residual standard error: ', round(sqrt(mse2), 2), ' on ', n - 2, ' degrees of freedom')
	regres <- list('Residuals'=residual, 'Coefficients'=coeffs, resdi, rsquare, f.stat)

	return(regres)
	}