benfasoli/regression-coefficient-uncertainty.r

## regression-coefficient-uncertainty.r
# Ben Fasoli
rm(list = ls())

library(gtrendsR)
library(tidyverse)

# Get google trend data for searches for skis and ski boots
query <- c('skis', 'ski boots')
trend <- gtrends(query, geo = 'US', time = 'all')
trend_ts <- trend$interest_over_time %>%
  spread(key = keyword, value = hits) %>%
  select(date, skis, `ski boots`)

# Linear regression to predict searches for skis by searches for ski boots
# skis = m * ski boots + b
mod <- lm(skis ~ `ski boots`, data = trend_ts)


# Calculate the confidence interval
yi <- trend_ts$skis        # actual values
yih <- predict(mod)        # model predicted values
N <- nrow(trend_ts)        # number of observations
xi <- trend_ts$`ski boots` # independent variable values
xh <- mean(xi)             # mean of independent variable

# Calculate coefficient errors from the sum of squared errors
se <- sqrt(sum((yi - yih)^2) / (N - 2)) / sqrt(sum((xi - xh)^2))
se
# 0.06204275

# or from the variances (diagonals given by variance-covariance matrix)
se <- sqrt(diag(vcov(mod)))[2]
# 0.06204275

# Manually calculate 95% confidence interval
alpha <- 1 - 0.95
cp <- 1 - alpha / 2 # critical probability
DF <- N - 2         # degrees of freedom
cv <- qt(cp, DF)    # critical value from quantile of t-distribution
me <- cv * se       # margin of error

ci <- rep(coef(mod)['`ski boots`'], times = 3) + c(-me, 0, me)
ci
# `ski boots` `ski boots` `ski boots`
#  2.485902    2.608391    2.730880

# Validate with confidence interval derived by R from the model object
confint(mod, level = 0.95)
#                 2.5 %   97.5 %
#   (Intercept) 13.476521 16.26735
#   `ski boots`  2.485902  2.73088


# Compare coefficient standard error calculation methods
# On first order, the margin of error at the 95% confidence interval should be
# roughly 2 * se assuming a gaussian distribution
# Derived from variance-coviariance matrix diagonals (variances)
se
# [1] 0.06204275
se * 2
# 0.1240855
me
# 0.1224892

# Calculated within summary.lm table
summary(mod)
# Coefficients:
#             Estimate Std. Error t value Pr(>|t|)
# (Intercept) 14.87194    0.70680   21.04   <2e-16 ***
# `ski boots`  2.60839    0.06204   42.04   <2e-16 ***

# Alternative technique to estimate error in regression coefficients using
# random sampling with replacement (bootstrapping). Results are highly sensitive
# to the size parameter of the resampled populations
bootstrap <- function (x, fun, size = nrow(x), iter = 10) {
  sapply(1:iter, function(i) {
    id <- sample(1:nrow(x), size = size, replace = T)
    y <- x[id, ]
    fun(y)
  })
}
co <- bootstrap(trend_ts, iter = 10000, fun = function(df) {
  # Extract second coefficient (slope) from model output
  coef(lm(skis ~ `ski boots`, data = df))[2]
})
sd(co)
# 0.06463646
	# Ben Fasoli
	rm(list = ls())

	library(gtrendsR)
	library(tidyverse)

	# Get google trend data for searches for skis and ski boots
	query <- c('skis', 'ski boots')
	trend <- gtrends(query, geo = 'US', time = 'all')
	trend_ts <- trend$interest_over_time %>%
	spread(key = keyword, value = hits) %>%
	select(date, skis, `ski boots`)

	# Linear regression to predict searches for skis by searches for ski boots
	# skis = m * ski boots + b
	mod <- lm(skis ~ `ski boots`, data = trend_ts)


	# Calculate the confidence interval
	yi <- trend_ts$skis # actual values
	yih <- predict(mod) # model predicted values
	N <- nrow(trend_ts) # number of observations
	xi <- trend_ts$`ski boots` # independent variable values
	xh <- mean(xi) # mean of independent variable

	# Calculate coefficient errors from the sum of squared errors
	se <- sqrt(sum((yi - yih)^2) / (N - 2)) / sqrt(sum((xi - xh)^2))
	se
	# 0.06204275

	# or from the variances (diagonals given by variance-covariance matrix)
	se <- sqrt(diag(vcov(mod)))[2]
	# 0.06204275

	# Manually calculate 95% confidence interval
	alpha <- 1 - 0.95
	cp <- 1 - alpha / 2 # critical probability
	DF <- N - 2 # degrees of freedom
	cv <- qt(cp, DF) # critical value from quantile of t-distribution
	me <- cv * se # margin of error

	ci <- rep(coef(mod)['`ski boots`'], times = 3) + c(-me, 0, me)
	ci
	# `ski boots` `ski boots` `ski boots`
	# 2.485902 2.608391 2.730880

	# Validate with confidence interval derived by R from the model object
	confint(mod, level = 0.95)
	# 2.5 % 97.5 %
	# (Intercept) 13.476521 16.26735
	# `ski boots` 2.485902 2.73088


	# Compare coefficient standard error calculation methods
	# On first order, the margin of error at the 95% confidence interval should be
	# roughly 2 * se assuming a gaussian distribution
	# Derived from variance-coviariance matrix diagonals (variances)
	se
	# [1] 0.06204275
	se * 2
	# 0.1240855
	me
	# 0.1224892

	# Calculated within summary.lm table
	summary(mod)
	# Coefficients:
	# Estimate Std. Error t value Pr(>\|t\|)
	# (Intercept) 14.87194 0.70680 21.04 <2e-16 ***
	# `ski boots` 2.60839 0.06204 42.04 <2e-16 ***

	# Alternative technique to estimate error in regression coefficients using
	# random sampling with replacement (bootstrapping). Results are highly sensitive
	# to the size parameter of the resampled populations
	bootstrap <- function (x, fun, size = nrow(x), iter = 10) {
	sapply(1:iter, function(i) {
	id <- sample(1:nrow(x), size = size, replace = T)
	y <- x[id, ]
	fun(y)
	})
	}
	co <- bootstrap(trend_ts, iter = 10000, fun = function(df) {
	# Extract second coefficient (slope) from model output
	coef(lm(skis ~ `ski boots`, data = df))[2]
	})
	sd(co)
	# 0.06463646