graebnerc/#T13T14: Solution to exercises

## #T13T14: Solution to exercises
Example solutions to the two exercises of the session on simple linear regression.

## ex-1-slide35.R
# Exercise 1: estimate a linear model
library(DataScienceExercises)
library(dplyr)
library(skimr)
library(moderndive)
library(ggplot2)

# Load the beer data set and get a first overview of its columns:
beer_data <- DataScienceExercises::beer
dplyr::glimpse(beer_data)
skimr::skim(beer_data)

# Fit the simple linear regression of consumption on income:
lin_reg <- lm(formula = consumption ~ income, data = beer_data)

# Show the estimated coefficients as a regression table:
moderndive::get_regression_table(lin_reg)

# Scatter plot of the data with the fitted regression line on top:
ggplot(beer_data, aes(x = income, y = consumption)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  theme_bw()

## ex-2-slide43.R
# Final exercise: computing R2 by hand
library(DataScienceExercises)
library(dplyr)
library(skimr)
library(moderndive)
library(ggplot2)

# Load the beer data set and get a first overview of its columns:
beer_data <- DataScienceExercises::beer
dplyr::glimpse(beer_data)
skimr::skim(beer_data)

# Fit the regression of consumption on income:
lin_reg_income <- lm(formula = consumption ~ income, data = beer_data)

# TSS
#  Total sum of squares: the squared deviations of observed consumption
#  from its sample mean, summed over all observations.
#  The sample mean is stored separately because it is reused further below.
mean_consumption <- mean(beer_data[["consumption"]])
tss <- sum((beer_data[["consumption"]] - mean_consumption)^2)

# RSS
#  Residual sum of squares: the sum of the squared residuals.
#  The residuals are stored in the object returned by lm().
#  Note: names() lists all elements of that object. It must be given the
#  object as its argument; a bare names() call stops with an error.
names(lin_reg_income)
rss <- sum(lin_reg_income$residuals^2)
rss

# ESS
#  Explained sum of squares: the squared distances between the fitted
#  values and the sample mean of consumption, summed over all observations.
ess <- sum((lin_reg_income[["fitted.values"]] - mean_consumption)^2)
ess

# R2 by hand: ratio of explained to total sum of squares
ess / tss
# Sanity check: TSS = ESS + RSS, so the difference below should be zero
# (up to floating-point rounding):
tss - (ess + rss)

# R2 as reported by R:
#  The lm() object itself does not contain R2; it becomes available
#  once a summary object has been computed from it.
lin_reg_income_summary <- summary(lin_reg_income)
lin_reg_income_summary[["r.squared"]]

# Visual illustration:
#  scatter plot of the data, a horizontal line at mean consumption,
#  and the fitted regression line.
ggplot(beer_data, aes(x = income, y = consumption)) +
  geom_point() +
  geom_hline(yintercept = mean_consumption) +
  geom_smooth(method = "lm", se = FALSE) +
  theme_bw()

# For comparison: the same model with price (instead of income) as the
# explanatory variable, and its R2:
lin_reg_price <- lm(formula = consumption ~ price, data = beer_data)
lin_reg_price_summary <- summary(lin_reg_price)
lin_reg_price_summary[["r.squared"]]

# Same kind of plot as for income, now with price on the x-axis:
ggplot(beer_data, aes(x = price, y = consumption)) +
  geom_point() +
  geom_hline(yintercept = mean_consumption) +
  geom_smooth(method = "lm", se = FALSE) +
  theme_bw()
	# Exercise 1: estimate a linear model
	library(DataScienceExercises)
	library(dplyr)
	library(skimr)
	library(moderndive)
	library(ggplot2)

	# Load the beer data set and get a first overview of its columns:
	beer_data <- DataScienceExercises::beer
	dplyr::glimpse(beer_data)
	skimr::skim(beer_data)

	# Fit the simple linear regression of consumption on income:
	lin_reg <- lm(formula = consumption ~ income, data = beer_data)

	# Show the estimated coefficients as a regression table:
	moderndive::get_regression_table(lin_reg)

	# Scatter plot of the data with the fitted regression line on top:
	ggplot(beer_data, aes(x = income, y = consumption)) +
	  geom_point() +
	  geom_smooth(method = "lm", se = FALSE) +
	  theme_bw()
	# Final exercise: computing R2 by hand
	library(DataScienceExercises)
	library(dplyr)
	library(skimr)
	library(moderndive)
	library(ggplot2)

	# Load the beer data set and get a first overview of its columns:
	beer_data <- DataScienceExercises::beer
	dplyr::glimpse(beer_data)
	skimr::skim(beer_data)

	# Fit the regression of consumption on income:
	lin_reg_income <- lm(formula = consumption ~ income, data = beer_data)

	# TSS
	# Total sum of squares: the squared deviations of observed consumption
	# from its sample mean, summed over all observations.
	# The sample mean is stored separately because it is reused further below.
	mean_consumption <- mean(beer_data[["consumption"]])
	tss <- sum((beer_data[["consumption"]] - mean_consumption)^2)

	# RSS
	# Residual sum of squares: the sum of the squared residuals.
	# The residuals are stored in the object returned by lm().
	# Note: names() lists all elements of that object. It must be given the
	# object as its argument; a bare names() call stops with an error.
	names(lin_reg_income)
	rss <- sum(lin_reg_income$residuals^2)
	rss

	# ESS
	# Explained sum of squares: the squared distances between the fitted
	# values and the sample mean of consumption, summed over all observations.
	ess <- sum((lin_reg_income[["fitted.values"]] - mean_consumption)^2)
	ess

	# R2 by hand: ratio of explained to total sum of squares
	ess / tss
	# Sanity check: TSS = ESS + RSS, so the difference below should be zero
	# (up to floating-point rounding):
	tss - (ess + rss)

	# R2 as reported by R:
	# The lm() object itself does not contain R2; it becomes available
	# once a summary object has been computed from it.
	lin_reg_income_summary <- summary(lin_reg_income)
	lin_reg_income_summary[["r.squared"]]

	# Visual illustration:
	# scatter plot of the data, a horizontal line at mean consumption,
	# and the fitted regression line.
	ggplot(beer_data, aes(x = income, y = consumption)) +
	  geom_point() +
	  geom_hline(yintercept = mean_consumption) +
	  geom_smooth(method = "lm", se = FALSE) +
	  theme_bw()

	# For comparison: the same model with price (instead of income) as the
	# explanatory variable, and its R2:
	lin_reg_price <- lm(formula = consumption ~ price, data = beer_data)
	lin_reg_price_summary <- summary(lin_reg_price)
	lin_reg_price_summary[["r.squared"]]

	# Same kind of plot as for income, now with price on the x-axis:
	ggplot(beer_data, aes(x = price, y = consumption)) +
	  geom_point() +
	  geom_hline(yintercept = mean_consumption) +
	  geom_smooth(method = "lm", se = FALSE) +
	  theme_bw()