Skip to content

Instantly share code, notes, and snippets.

@graebnerc
Created December 1, 2022 15:31
Show Gist options
  • Save graebnerc/207117d4a87025a3c64d27d8358c275b to your computer and use it in GitHub Desktop.
Save graebnerc/207117d4a87025a3c64d27d8358c275b to your computer and use it in GitHub Desktop.
#T13T14: Solution to exercises
Example solutions to the two exercises of the session on simple linear regression.
# Exercise 1: estimate a linear model
library(DataScienceExercises)
library(dplyr)
library(skimr)
library(moderndive)
library(ggplot2)
# Load the beer data set shipped with DataScienceExercises and inspect it:
beer_data <- DataScienceExercises::beer
dplyr::glimpse(beer_data)
skimr::skim(beer_data)

# Fit a simple linear regression of consumption on income:
lin_reg <- lm(consumption ~ income, data = beer_data)

# Show the estimated coefficients as a regression table:
moderndive::get_regression_table(lin_reg)
# Scatter plot of consumption against income with the fitted regression line
# (se = FALSE suppresses the confidence band around the line):
ggplot2::ggplot(beer_data, ggplot2::aes(x = income, y = consumption)) +
  ggplot2::geom_point() +
  ggplot2::geom_smooth(method = "lm", se = FALSE) +
  ggplot2::theme_bw()
# Final exercise: computing R2 by hand
library(DataScienceExercises)
library(dplyr)
library(skimr)
library(moderndive)
library(ggplot2)
# Load the beer data set and get an overview of its variables:
beer_data <- DataScienceExercises::beer
dplyr::glimpse(beer_data)
skimr::skim(beer_data)

# Fit the regression of consumption on income; the sums of squares computed
# further below refer to this model:
lin_reg_income <- lm(consumption ~ income, data = beer_data)
# TSS (total sum of squares)
# First compute mean consumption:
mean_consumption <- mean(beer_data$consumption)
# TSS as the sum of the squared deviations of the actual values from the
# sample mean:
tss <- sum(
  (beer_data$consumption - mean_consumption)^2
)

# RSS (residual sum of squares)
# RSS as the sum of squared residuals.
# The residuals are one element of the list-like object produced by lm().
# Note: to see all the elements of the object produced by lm() use names():
names(lin_reg_income)
rss <- sum(
  lin_reg_income$residuals^2
)
rss

# ESS (explained sum of squares)
# ESS as the sum of squared distances between the fitted values and the
# sample mean of the dependent variable:
ess <- sum(
  (lin_reg_income$fitted.values - mean_consumption)^2
)
ess

# R2: manual computation as the share of total variation that is explained:
ess / tss

# Note: for a linear model with an intercept, TSS = ESS + RSS. The difference
# is therefore zero up to floating-point error, so compare with a tolerance
# via all.equal() rather than expecting an exact 0:
tss - (ess + rss) # should be (numerically) zero
all.equal(tss, ess + rss)

# Direct computation
# R2 is not part of the list created by lm(); need to compute a
# summary object before:
lin_reg_income_summary <- summary(lin_reg_income)
lin_reg_income_summary$r.squared
# Visual illustration: observations, the sample mean of consumption as a
# horizontal line, and the fitted regression line:
ggplot2::ggplot(beer_data, ggplot2::aes(x = income, y = consumption)) +
  ggplot2::geom_point() +
  ggplot2::geom_hline(yintercept = mean_consumption) +
  ggplot2::geom_smooth(method = "lm", se = FALSE) +
  ggplot2::theme_bw()
# For comparison: the same model with price (instead of income) as the
# explanatory variable, and its R2 taken from the summary object:
lin_reg_price <- lm(consumption ~ price, data = beer_data)
lin_reg_price_summary <- summary(lin_reg_price)
lin_reg_price_summary$r.squared

# Same kind of plot as for income: points, mean consumption, fitted line:
ggplot2::ggplot(beer_data, ggplot2::aes(x = price, y = consumption)) +
  ggplot2::geom_point() +
  ggplot2::geom_hline(yintercept = mean_consumption) +
  ggplot2::geom_smooth(method = "lm", se = FALSE) +
  ggplot2::theme_bw()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment