Created
August 4, 2016 19:56
-
-
Save aschleg/7299355048c50907db27b672f98c05c6 to your computer and use it in GitHub Desktop.
Example function for fitting a multiple regression model with two predictor variables
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Simple function for fitting a regression line with two predictor variables. | |
# Function takes two arguments: | |
# x must be a matrix or dataframe with only two columns (variables) | |
# y must be a vector containing the response values of the same length | |
# Output is similar to the lm() function | |
# Function used as a demonstration in post on multiple regression here: http://wp.me/p4aZEo-5Lh | |
two.predictor.regression <- function(x, y) { | |
y <- as.matrix(y) | |
if (ncol(y) > 1) { | |
stop('y must be a vector') | |
} | |
x <- data.frame(x) | |
if (ncol(x) != 2) { | |
stop('x must have only two predictor variables') | |
} | |
x$intercept <- 1 # Add the intercept term to the X matrix | |
col <- grep('intercept', names(x)) | |
x <- as.matrix(x[, c(col, (1:ncol(x))[-col])]) # Put the intercept term first in the X matrix | |
n <- length(y) | |
if (n != length(x[,1])) { | |
stop('x and y must be the same length') | |
} | |
# Degrees of freedom. As only two predictor variables and one response variable are considered, the degrees of freedom will always be 3 | |
df <- 3 | |
x.crossprod <- crossprod(x) # Calculate crossproduct X'X | |
y.crossprod <- crossprod(y) # Calculate crossproduct Y'Y | |
xy.crossprod <- crossprod(x, y) # crossproduct X'Y | |
x.inv <- solve(x.crossprod) # Find inverse of X'X | |
b <- x.inv %*% xy.crossprod # Get b parameters by multiplying the inverse of X'X and X'Y | |
fitted.y <- x %*% b # Find the fitted response values | |
resid.mat <- y - fitted.y # Find the residual values | |
min.resid <- min(resid.mat) | |
max.resid <- max(resid.mat) | |
med.resid <- median(resid.mat) | |
Q1.resid <- quantile(resid.mat, .25) | |
Q3.resid <- quantile(resid.mat, .75) | |
resid.df <- data.frame(round(cbind(min.resid, Q1.resid, med.resid, Q3.resid, max.resid), 3), row.names = '') | |
colnames(resid.df) <- c('Min', '1Q', 'Median', '3Q', 'Max') | |
sse <- y.crossprod - crossprod(b, xy.crossprod) # SSE | |
mse <- sse / (n - df) # MSE | |
sst <- y.crossprod - sum(y)^2 / n # SST | |
ssr <- sst - sse # SSR | |
msr <- ssr / 2 # MSR | |
f.stat <- msr / mse # Critical F-statistic | |
p.val <- pf(f.stat, df1 = 2, df2 = n - df, lower.tail = FALSE) # Computed p-value | |
R2 <- 1 - sse / sst # R-squared | |
R2.adj <- 1 - ((n - 1) / (n - df)) * sse / sst # adjusted R-squared | |
s2b <- mse[1] * x.inv # Estimate variance-covariance matrix of regression parameters | |
# Get the parameter standard errors | |
std.err <- suppressWarnings(matrix(sqrt(s2b)[!is.na(sqrt(s2b))], nrow = 3, ncol=1)) | |
t.values <- b / std.err # Critical t-values of regression parameters | |
p.values <- 2 * pt(t.values, df = n - df, lower.tail = FALSE) | |
res.std.err <- sqrt(mse) | |
# The following collects and returns all the results found above into a list | |
coef <- data.frame(cbind(round(b,3), round(std.err,3), round(t.values,3), format(p.values, digits = 3))) | |
colnames(coef) <- c('Estimate', 'Std. Error', 't value', 'Pr(>|t|)') | |
resids <- paste('Residual standard error:', round(res.std.err, 3), 'on', length(delivery$n.prod - df), ' degrees of freedom') | |
R.squared <- paste('Multiple R-squared: ', round(R2,4), 'Adjusted R-squared: ', round(R2.adj,4)) | |
f.stat <- paste('F-statistic: ', round(f.stat, 1), 'on 2 and', length(delivery$n.prod - df), ' DF') | |
p <- paste('p-value: ', format(p.val, digits = 4, scientific = TRUE)) | |
res <- list('Residuals'=resid.df, 'Coefficients'=coef, resids, R.squared, f.stat, p) | |
return(res) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment