Created
December 1, 2022 15:31
-
-
Save graebnerc/207117d4a87025a3c64d27d8358c275b to your computer and use it in GitHub Desktop.
#T13T14: Solution to exercises
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Example solutions to the two exercises of the session on simple linear regression.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Exercise 1: estimate a linear model
#
# Fits a simple linear regression of beer consumption on income using the
# `beer` data set shipped with DataScienceExercises, shows the regression
# table and plots the observations together with the fitted line.
library(DataScienceExercises)
library(dplyr)
library(skimr)
library(moderndive)
library(ggplot2)

# Prepare and investigate data:
beer_data <- DataScienceExercises::beer
dplyr::glimpse(beer_data)
skimr::skim(beer_data)

# Estimate the model (consumption explained by income):
lin_reg <- lm(consumption ~ income, data = beer_data)

# Look at result:
moderndive::get_regression_table(lin_reg)

# Visualization: scatter plot of the raw data plus the fitted line.
ggplot(
  data = beer_data,
  mapping = aes(x = income, y = consumption)
) +
  geom_point() +
  # method = "lm" draws the linear fit; se = FALSE hides the confidence band.
  geom_smooth(method = "lm", se = FALSE) +
  theme_bw()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Final exercise: computing R2 by hand
#
# Regresses beer consumption on income, computes TSS, RSS and ESS manually,
# derives R2 as ESS / TSS and compares it with the value reported by
# summary(). Finally repeats the fit with price as the explanatory variable
# to compare the two R2 values.
library(DataScienceExercises)
library(dplyr)
library(skimr)
library(moderndive)
library(ggplot2)

# Preparation and looking at the data:
beer_data <- DataScienceExercises::beer
dplyr::glimpse(beer_data)
skimr::skim(beer_data)

# Estimating the model:
lin_reg_income <- lm(consumption ~ income, data = beer_data)

# TSS
# First compute mean consumption:
mean_consumption <- mean(beer_data$consumption)
# TSS as the sum of the squared deviations of actual values from sample mean:
tss <- sum(
  (beer_data$consumption - mean_consumption)^2
)
tss

# RSS
# RSS as the sum of squared residuals.
# The residuals of each observation are part of the object produced by lm().
# Note: to see all the elements of the list produced by lm() use names():
names(lin_reg_income)
rss <- sum(
  (lin_reg_income$residuals)^2
)
rss

# ESS
# ESS as the sum of squared deviations of the fitted values from sample mean:
ess <- sum(
  (lin_reg_income$fitted.values - mean_consumption)^2
)
ess

# R2: manual computation:
ess / tss

# Note: TSS = ESS + RSS, so the difference should be zero up to
# floating-point rounding error.
tss - (ess + rss)
# Tolerance-based check of the same identity (exact `== 0` may fail):
isTRUE(all.equal(tss, ess + rss))

# Direct computation
# R2 is not part of the list created by lm(); need to compute a
# summary object before:
lin_reg_income_summary <- summary(lin_reg_income)
lin_reg_income_summary$r.squared

# Visual illustration: data, sample mean (horizontal line) and fitted line.
ggplot(
  data = beer_data,
  mapping = aes(x = income, y = consumption)
) +
  geom_point() +
  geom_hline(yintercept = mean_consumption) +
  geom_smooth(method = "lm", se = FALSE) +
  theme_bw()

# Comparison with the model with price instead of income as explanatory
# variable:
lin_reg_price <- lm(consumption ~ price, data = beer_data)
lin_reg_price_summary <- summary(lin_reg_price)
lin_reg_price_summary$r.squared

ggplot(
  data = beer_data,
  mapping = aes(x = price, y = consumption)
) +
  geom_point() +
  geom_hline(yintercept = mean_consumption) +
  geom_smooth(method = "lm", se = FALSE) +
  theme_bw()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment