graebnerc/#T17T18: Lecture notes

## #T17T18: Lecture notes
Contains solutions to the exercises as well as the code used to create the relevant figures in the slides.
The code assumes a standard folder structure and might need to be adjusted to your case.

## Examples-Durkheim.R
here::i_am("R/Examples-Durkheim.R")
library(here)
library(dplyr)
library(data.table)
library(ggplot2)
library(modelsummary)
library(kableExtra)
library(icaeDesign)

# Create the artificial Durkheim data set--------
set.seed(123)
n_obs <- 1000
social_cohesion <- seq(-0.5, 0.5, length.out = n_obs)
p_suicide <- social_cohesion**2 + rnorm(n_obs, mean = 0, sd = 0.005)

suicide_data <- tibble(
  `Social cohesion`=social_cohesion,
  `Probability for suicide`=p_suicide
) %>%
  dplyr::mutate(
    Society = ifelse(`Social cohesion`<=0, "Modern", "Archaic"),
    `Social cohesion squared` = `Social cohesion`**2
  )

data.table::fwrite(
  x = suicide_data, file = here::here("data/suicide_data.csv"))

# Estimate the two models--------------
linmod <- lm(
  formula = `Probability for suicide`~`Social cohesion`,
  data = suicide_data)
quadmod <- lm(
  formula = `Probability for suicide`~`Social cohesion`+I(`Social cohesion`**2),
  data = suicide_data)

# Create a PDF of the regression table:
modelsummary(
  list("Model 1"=linmod, "Model 2"=quadmod),
  output = "kableExtra", gof_map = c("r.squared"),
  statistic = NULL) %>%
  kableExtra::save_kable(
    file = here("output/RegressionTable-Durkheim.pdf"))

# Add the predictions of the models to the data
suicide_data_predictions <- suicide_data %>%
  mutate(
    `Model 1` = predict(linmod),
    `Model 2` = predict(quadmod)
    ) %>%
  select(`Social cohesion`, `Model 1`, `Model 2`) %>%
  tidyr::pivot_longer(
    cols = c("Model 1", "Model 2"),
    names_to = "Model",
    values_to = "Prediction")

# Create the three plots---------------
## Plot 1: the data--------------------
suicide_data_plot <- ggplot(
  data = suicide_data,
  mapping = aes(x=`Social cohesion`, y=`Probability for suicide`)
) +
  geom_point(alpha=0.25, size=0.2, show.legend = FALSE) +
  labs(title = "Suicides a la Durkheim") +
  theme_icae()
suicide_data_plot

ggsave(plot = suicide_data_plot,
       filename = here("output/DurkheimData.pdf"),
       width = 3.5, height = 3.5)

## Plot 2: the data with cohesion squared--------------------
suicide_data_plot_squared <- ggplot(
  data = suicide_data,
  mapping = aes(x=`Social cohesion squared`, y=`Probability for suicide`)
) +
  geom_point(alpha=0.25, size=0.2, show.legend = FALSE) +
  labs(title = "Suicides a la Durkheim") +
  theme_icae()
suicide_data_plot_squared

ggsave(plot = suicide_data_plot_squared,
       filename = here("output/DurkheimData_squared.pdf"),
       width = 3.5, height = 3.5)

## Plot 3: the data with cohesion squared--------------------
suicide_data_plot_predict_plot <- ggplot(
  data = suicide_data,
  mapping = aes(x=`Social cohesion`, y=`Probability for suicide`)
) +
  geom_point(alpha=0.25, size=0.2, show.legend = FALSE) +
  geom_line(
    data = suicide_data_predictions,
    mapping = aes(x=`Social cohesion`, y=Prediction, color=Model), linewidth=1
    ) +
  scale_color_euf(palette = "mixed") +
  labs(title = "Suicides a la Durkheim") +
  theme_icae() +
  theme(legend.position = "bottom")
suicide_data_plot_predict_plot

ggsave(plot = suicide_data_plot_predict_plot,
       filename = here("output/Durkheim-Predictions.pdf"),
       width = 3.5, height = 3.5)

## Tukey-Anscombe plots-------------------
simple_model_TA_plot <- tibble(
  "Fitted values" = linmod$fitted.values,
  "Residuals" = linmod$residuals
  ) %>%
  ggplot(mapping = aes(x = `Fitted values`, y = `Residuals`)) +
  geom_point(alpha=0.25) +
  labs(title = "TA plot (model 1)") +
  theme_icae()
simple_model_TA_plot

ggsave(plot = simple_model_TA_plot,
       filename = here("output/Durkheim-TA-Simple.pdf"),
       width = 3.5, height = 3.5)

quadratic_model_plot <- tibble(
  "Fitted values" = quadmod$fitted.values,
  "Residuals" = quadmod$residuals
) %>%
  ggplot(mapping = aes(x = `Fitted values`, y = `Residuals`)) +
  geom_point(alpha=0.25) +
  labs(title = "TA plot (model 2)") +
  theme_icae()
quadratic_model_plot

ggsave(plot = quadratic_model_plot,
       filename = here("output/Durkheim-TA-Quadratic.pdf"),
       width = 3.5, height = 3.5)

## Examples-LifeExpectancy.R
here::i_am("R/Examples-LifeExpectancy.R")
library(here)
library(dplyr)
library(data.table)
library(ggplot2)
library(scales)
library(modelsummary)
library(kableExtra)
library(icaeDesign)

# Set up data set ---------------------
gdp_life <- DataScienceExercises::gdplifexp2007 %>%
  mutate(GDPpc_log=log(gdpPercap), LifeExp_log=log(lifeExp)) %>%
  filter(continent!="Oceania", continent!="Africa", lifeExp>50)

# Estimate the two models--------------
model_1 <- lm(lifeExp~gdpPercap, data = gdp_life)
model_2 <- lm(LifeExp_log~GDPpc_log, data = gdp_life)

# Compute predictions from models-----------
gdp_life <- gdp_life %>%
  mutate(
    `Model 1` = predict(model_1),
    `Model 2` = predict(model_2)
  )

# Create the visualizations------------
## Plot 1: Level scatter plot with simple model----------
level_plot <- ggplot(
    data = gdp_life,
    aes(x=gdpPercap, y=lifeExp)
    ) +
  geom_point() +
  geom_line(
    aes(x=gdpPercap, y=`Model 1`),
    color=get_euf_colors("red"),
    linewidth=1) +
  labs(
    title = "Income and life expectancy",
    x="GDP per capita (PPP)",
    y="Life expectancy at birth"
    ) +
  scale_x_continuous(
    labels = scales::label_number(scale = 0.001, suffix = "k")
    ) +
  theme_icae()
level_plot

ggsave(
  plot = level_plot,
  filename = here("output/LifeExp_base.pdf"),
  width = 3.5, height = 3.5)

ggsave(
  plot = level_plot +
    geom_line(
      aes(x=gdpPercap, y=exp(`Model 2`)),
      color=get_euf_colors("blue"),
      linewidth=1),
  filename = here("output/LifeExp_TwoModels.pdf"),
  width = 3.5, height = 3.5)

## Plot 2: TA plot for the simple model----------
TA_lin <- tibble(
  "Fitted values"=model_1$fitted.values,
  "Residuals"=model_1$residuals
) %>%
  ggplot(aes(x=`Fitted values`,y=Residuals)) +
  geom_point() +
  theme_icae()

ggsave(
  plot = TA_lin + labs(title = "TA plot (model 1)"),
  filename = here("output/LifeExp_lin_TA.pdf"),
  width = 3.5, height = 2.5)

## Plot 3: Log-log scatter plot with quadratic model----------
log_plot <- ggplot(
    data = gdp_life,
    aes(x=GDPpc_log, y=LifeExp_log)
    ) +
  geom_point() +
  geom_line(
    aes(x=GDPpc_log, y=`Model 2`),
    color=get_euf_colors("blue"),
    linewidth=1) +
  labs(
    title = "Income and life expectancy (log-log)",
    x="GDP per capita (PPP, log)",
    y="Life expectancy at birth (log)"
  ) +
  theme_icae() +
  theme(legend.position = "none")
log_plot

ggsave(
  plot = log_plot,
  filename = here("output/LifeExp_log.pdf"),
  width = 3.5, height = 3.5)

## Plot 4: TA plot for the log-log model---------
TA_log <- tibble(
  "Fitted values"=model_2$fitted.values,
  "Residuals"=model_2$residuals
) %>%
  ggplot(aes(x=`Fitted values`,y=Residuals)) +
  geom_point() +
  theme_icae()

ggsave(
  plot = TA_log + labs(title = "TA plot (model 2)"),
  filename = here("output/LifeExp_log_TA.pdf"),
  width = 3.5, height = 2.5)

## Exercise-1-Beer.R
here::i_am("R/Exercise-1-Beer.R")
library(DataScienceExercises)
library(dplyr)
library(texreg) # To visualize models in a readable way
library(kableExtra) # If you want to save the table as PDF

beer_data <- DataScienceExercises::beer %>%
  dplyr::mutate(income = income/1000)

model_1 <- lm(
  formula = consumption ~ price,
  data=beer_data)
model_2 <- lm(
  formula = consumption ~ price + income,
  data=beer_data)
model_3 <- lm(
  formula = consumption ~ price + price_liquor + price_other + income,
  data=beer_data)
model_4 <- lm(
  formula = consumption ~ price  + price_other + income,
  data=beer_data)

# Print model results nicely on screen:
texreg::screenreg(
  l = list(
    model_1, model_2, model_3, model_4),
  digits = 4)

# Create a nice PDF with the regression table:
modelsummary::modelsummary(
  models = list(model_1, model_2, model_3, model_4),
  gof_map = c("nobs", "r.squared", "adj.r.squared", "rmse"),
  stars = TRUE, output = "kableExtra") %>%
  kableExtra::save_kable(file = "Exercise-1-Beer.pdf")

## JournalExample.R
here::i_am("JournalExample.R")
library(DataScienceExercises)
library(moderndive)
library(dplyr)
library(tidyr)
library(ggplot2)
library(icaeDesign)

# Multiple regression with the journal data set----------------------
journal_data <- DataScienceExercises::econjournals %>%
  dplyr::filter(papers>10, sub_price<5000)

# The parallel slopes model------------
journal_linmod_psm <- lm(sub_price~pages_py+publisher_type, data = journal_data)
get_regression_table(journal_linmod_psm)

journal_linmod_pslopes <- journal_data %>%
  ggplot(data = ., aes(x=pages_py, y=sub_price, color=publisher_type)) +
  geom_point() +
  geom_parallel_slopes(se = FALSE) +
  labs(y="Subscription price", x = "Page length") +
  scale_color_brewer(palette = "Set1", direction = -1) +
  coord_cartesian(xlim = c(0, 2800)) +
  theme_icae() + guides(col=guide_legend(title = "Publisher type: ")) +
  theme(legend.position = "bottom", legend.title = element_text())
journal_linmod_pslopes

# The interaction model------------

journal_linmod_intct <- lm(sub_price~pages_py*publisher_type, data = journal_data)
get_regression_table(journal_linmod_intct)

journal_linmod_interact <- journal_data %>%
  ggplot(data = ., aes(x=pages_py, y=sub_price, color=publisher_type)) +
  geom_point() +
  stat_smooth(method="lm", fullrange=TRUE, se=FALSE) +
  labs(y="Subscription price", x = "Page length") +
  scale_color_brewer(palette = "Set1", direction = -1) +
  coord_cartesian(xlim = c(0, 2800)) +
  theme_icae() + guides(col=guide_legend(title = "Publisher type: ")) +
  theme(legend.position = "bottom", legend.title = element_text())
journal_linmod_interact

journal_linmod_interact_zoom <- journal_linmod_interact +
  coord_cartesian(xlim = c(-10, 500), ylim = c(-10, 500)) +
  stat_smooth(method="lm", fullrange=TRUE, se=FALSE) +
  geom_hline(yintercept = 0) +
  geom_vline(xintercept = 0) +
  theme(legend.position = "none")
journal_linmod_interact_zoom

# Comparing the R-squareds-------------

summary(journal_linmod_intct)[["r.squared"]]
summary(journal_linmod_intct)[["adj.r.squared"]]

summary(journal_linmod_psm)[["r.squared"]]
summary(journal_linmod_psm)[["adj.r.squared"]]

# Compare the residual plots-----------
TA_pslopes <- tibble(
  "Fitted"=journal_linmod_psm$fitted.values,
  "Residuals"=journal_linmod_psm$residuals
) %>%
  ggplot(aes(x=Fitted, y=Residuals)) +
  geom_point() + geom_hline(yintercept = 0) +
  labs(title = "The parallel slopes model") +
  theme_icae()
TA_pslopes

TA_interaction <- tibble(
  "Fitted"=journal_linmod_intct$fitted.values,
  "Residuals"=journal_linmod_intct$residuals
) %>%
  ggplot(aes(x=Fitted, y=Residuals)) +
  geom_point() + geom_hline(yintercept = 0) +
  labs(title = "The interaction model") +
  theme_icae()
TA_interaction

## LifeExpExample.R
here::i_am("LifeExpExample.R")
library(DataScienceExercises)
library(moderndive)
library(dplyr)
library(tidyr)
library(ggplot2)
library(icaeDesign)

life_exp <- DataScienceExercises::gdplifexp2007 %>%
  dplyr::select(all_of(c("continent", "lifeExp", "gdpPercap"))) %>%
  dplyr::mutate(continent=as.factor(continent))

# Categorical variables within the multiple regression case--------------------
cont_gdp_linmod <- lm(lifeExp~gdpPercap+continent, data = life_exp)
get_regression_table(cont_gdp_linmod)

life_exp_scaled <- life_exp %>%
  dplyr::mutate(
    gdpPercap = gdpPercap/1000
  )

# Easier to read:
cont_gdp_linmod_sc <- lm(lifeExp~gdpPercap+continent, data = life_exp_scaled)
get_regression_table(cont_gdp_linmod_sc) %>%
  dplyr::select(term, estimate, std_error)

# Linear mode:
life_model_lm <- lm(lifeExp~continent, data = life_exp_scaled)
summary(life_model_lm)

# Get same result via data wrangling and manipulation:
MeanAfrica <- life_exp_scaled %>%
  filter(continent=="Africa") %>%
  summarize(lifeExp=mean(lifeExp)) %>%
  pull(lifeExp)

life_exp_scaled %>%
  summarise(MeanLifeExp = mean(lifeExp), .by = "continent") %>%
  mutate(DifferenceAfrica = MeanLifeExp - MeanAfrica)

## ols-distribution.R
here::i_am("ols-distribution.R")
library(here)
library(tidyr)
library(data.table)
library(dplyr)
library(ggplot2)
library(scales)
library(purrr)
library(icaeDesign)
library(ggpubr)
library(latex2exp)

# True population----------------------
set.seed(123)
pop_size <- 5000
beta_0_pop <- 1000
beta_1_pop <- 3.5

labor_input <- runif(n = pop_size, min = 50, max = 500)
random_error <- rnorm(n = pop_size, mean = 0, sd = 100)

firm_output <- beta_0_pop + beta_1_pop * labor_input + random_error

pop_data <- tibble(
  "output" = firm_output,
  "labor" = labor_input
)

output_dist_pop <- ggplot(data = pop_data, mapping = aes(x=output)) +
  geom_histogram(alpha=0.5, binwidth = 50) +
  scale_y_continuous(expand = expansion(add = c(0, 20))) +
  labs(title = "Distribution of output",  x = "Firm output", y = "Number of firms") +
  theme_icae()

relationship_pop <- ggplot(
  data = pop_data, mapping = aes(x = labor, y=output)) +
  geom_point(alpha=0.25, color="white", shape=21, fill="black") +
  geom_smooth(method = "lm", se = FALSE) +
  labs(title = "True relationship",  x = "Labor input", y = "Firm output") +
  theme_icae()
relationship_pop

pop_overview <- ggarrange(output_dist_pop, relationship_pop, ncol = 2)
pop_overview <- annotate_figure(pop_overview, top = "The simulated population")
ggsave(
  plot = pop_overview,
  filename = here("output/pop_overview.pdf"),
  width = 7, height = 3)

# Estimation----------------------
set.seed(1234)
iterations <- 1000
sample_size <- 150

get_estimates <- function(s_size){
  sample_drawn <- slice_sample(pop_data, n = sample_size)
  estimated_model <- lm(formula = output~labor, data = sample_drawn)
  coefs <- tibble(
    "beta_0"=estimated_model$coefficients["(Intercept)"],
    "beta_1"=estimated_model$coefficients["labor"])
}

estimates <- map(
  .x = seq_len(iterations),
  .f = ~ get_estimates(s_size = sample_size))
estimates_tib <- list_rbind(estimates)

# Visualizaiton----------------------------------
beta_0_dist <- ggplot(data = estimates_tib, mapping = aes(x=beta_0)) +
  geom_histogram(binwidth = 5, fill = get_euf_colors("blue")) +
  scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +
  labs(
    title = TeX("$\\beta_0=1000$"),
    x = TeX("$\\hat{\\beta}_0$"),
    y = "Frequency"
  ) +
  geom_vline(xintercept = beta_0_pop) +
  theme_icae() +
  theme(plot.title = element_text(size=12))

beta_1_dist <- ggplot(data = estimates_tib, mapping = aes(x=beta_1)) +
  geom_histogram(binwidth = 0.01, fill = get_euf_colors("blue")) +
  scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +
  labs(
    title = TeX("$\\beta_1=3.5$"),
    x = TeX("$\\hat{\\beta}_1$"),
    y = "Frequency"
    ) +
  geom_vline(xintercept = beta_1_pop) +
  theme_icae() +
  theme(plot.title = element_text(size=12))

estimations_figure <- ggarrange(beta_0_dist, beta_1_dist, ncol = 2)
estimations_figure <- annotate_figure(
  p = estimations_figure,
  top = "Sampling distributions of the estimated parameters")
ggsave(
  plot = estimations_figure,
  filename = here("output/estimations_figure.pdf"),
  width = 7, height = 3)

# Quantitative result-------------------
library(kableExtra)
estimates_tib %>%
  pivot_longer(
    cols = everything(),
    names_to = "Parameter",
    values_to = "Value") %>%
  summarise(
    Mean = mean(Value),
    SD = sd(Value), .by = "Parameter"
  ) %>%
  kable(digits = 3, format = "latex", booktabs=TRUE) %>%
  kable_styling() %>%
  save_kable(file = here("output/estimations_table.pdf"), bs_theme = "flatly")

set.seed(1234)

expl_model <- lm(
  formula = output~labor,
  data = slice_sample(pop_data, n = sample_size))

expl_model_summary <- summary(expl_model)
b1_est <- expl_model_summary$coefficients[2,1]
b1_sd <- expl_model_summary$coefficients[2,2]
lower_ci <- b1_est - (1.96*b1_sd)
upper_ci <- b1_est + (1.96*b1_sd)
c(lower_ci,upper_ci)

moderndive::get_regression_table(
  model = expl_model,
  digits = 2) %>%
  kable(digits = 3, format = "latex", booktabs=TRUE) %>%
  kable_styling() %>%
  save_kable(file = here("output/lm_table.pdf"), bs_theme = "flatly")


## wine-recap.R
here::i_am("wine-recap.R")
library(DataScienceExercises)
library(dplyr)
library(tidyr)

wine_data <- DataScienceExercises::wine2dine

wine_reg <- lm(`residual sugar` ~ kind, data = wine_data)

summary(wine_reg)

wine_data %>%
  summarise(MeanResidualSugar = mean(`residual sugar`), .by = "kind") %>%
  pivot_wider(names_from = "kind", values_from = "MeanResidualSugar") %>%
  mutate(DifferenceRed = white - red)
	Contains solutions to the exercises as well as the code used to create the relevant figures in the slides.
	The code assumes a standard folder structure and might need to be adjusted to your case.
	here::i_am("R/Examples-Durkheim.R")
	library(here)
	library(dplyr)
	library(data.table)
	library(ggplot2)
	library(modelsummary)
	library(kableExtra)
	library(icaeDesign)

	# Create the artificial Durkheim data set--------
	set.seed(123)
	n_obs <- 1000
	social_cohesion <- seq(-0.5, 0.5, length.out = n_obs)
	p_suicide <- social_cohesion**2 + rnorm(n_obs, mean = 0, sd = 0.005)

	suicide_data <- tibble(
	`Social cohesion`=social_cohesion,
	`Probability for suicide`=p_suicide
	) %>%
	dplyr::mutate(
	Society = ifelse(`Social cohesion`<=0, "Modern", "Archaic"),
	`Social cohesion squared` = `Social cohesion`**2
	)

	data.table::fwrite(
	x = suicide_data, file = here::here("data/suicide_data.csv"))

	# Estimate the two models--------------
	linmod <- lm(
	formula = `Probability for suicide`~`Social cohesion`,
	data = suicide_data)
	quadmod <- lm(
	formula = `Probability for suicide`~`Social cohesion`+I(`Social cohesion`**2),
	data = suicide_data)

	# Create a PDF of the regression table:
	modelsummary(
	list("Model 1"=linmod, "Model 2"=quadmod),
	output = "kableExtra", gof_map = c("r.squared"),
	statistic = NULL) %>%
	kableExtra::save_kable(
	file = here("output/RegressionTable-Durkheim.pdf"))

	# Add the predictions of the models to the data
	suicide_data_predictions <- suicide_data %>%
	mutate(
	`Model 1` = predict(linmod),
	`Model 2` = predict(quadmod)
	) %>%
	select(`Social cohesion`, `Model 1`, `Model 2`) %>%
	tidyr::pivot_longer(
	cols = c("Model 1", "Model 2"),
	names_to = "Model",
	values_to = "Prediction")

	# Create the three plots---------------
	## Plot 1: the data--------------------
	suicide_data_plot <- ggplot(
	data = suicide_data,
	mapping = aes(x=`Social cohesion`, y=`Probability for suicide`)
	) +
	geom_point(alpha=0.25, size=0.2, show.legend = FALSE) +
	labs(title = "Suicides a la Durkheim") +
	theme_icae()
	suicide_data_plot

	ggsave(plot = suicide_data_plot,
	filename = here("output/DurkheimData.pdf"),
	width = 3.5, height = 3.5)

	## Plot 2: the data with cohesion squared--------------------
	suicide_data_plot_squared <- ggplot(
	data = suicide_data,
	mapping = aes(x=`Social cohesion squared`, y=`Probability for suicide`)
	) +
	geom_point(alpha=0.25, size=0.2, show.legend = FALSE) +
	labs(title = "Suicides a la Durkheim") +
	theme_icae()
	suicide_data_plot_squared

	ggsave(plot = suicide_data_plot_squared,
	filename = here("output/DurkheimData_squared.pdf"),
	width = 3.5, height = 3.5)

	## Plot 3: the data with cohesion squared--------------------
	suicide_data_plot_predict_plot <- ggplot(
	data = suicide_data,
	mapping = aes(x=`Social cohesion`, y=`Probability for suicide`)
	) +
	geom_point(alpha=0.25, size=0.2, show.legend = FALSE) +
	geom_line(
	data = suicide_data_predictions,
	mapping = aes(x=`Social cohesion`, y=Prediction, color=Model), linewidth=1
	) +
	scale_color_euf(palette = "mixed") +
	labs(title = "Suicides a la Durkheim") +
	theme_icae() +
	theme(legend.position = "bottom")
	suicide_data_plot_predict_plot

	ggsave(plot = suicide_data_plot_predict_plot,
	filename = here("output/Durkheim-Predictions.pdf"),
	width = 3.5, height = 3.5)

	## Tukey-Anscombe plots-------------------
	simple_model_TA_plot <- tibble(
	"Fitted values" = linmod$fitted.values,
	"Residuals" = linmod$residuals
	) %>%
	ggplot(mapping = aes(x = `Fitted values`, y = `Residuals`)) +
	geom_point(alpha=0.25) +
	labs(title = "TA plot (model 1)") +
	theme_icae()
	simple_model_TA_plot

	ggsave(plot = simple_model_TA_plot,
	filename = here("output/Durkheim-TA-Simple.pdf"),
	width = 3.5, height = 3.5)

	quadratic_model_plot <- tibble(
	"Fitted values" = quadmod$fitted.values,
	"Residuals" = quadmod$residuals
	) %>%
	ggplot(mapping = aes(x = `Fitted values`, y = `Residuals`)) +
	geom_point(alpha=0.25) +
	labs(title = "TA plot (model 2)") +
	theme_icae()
	quadratic_model_plot

	ggsave(plot = quadratic_model_plot,
	filename = here("output/Durkheim-TA-Quadratic.pdf"),
	width = 3.5, height = 3.5)
	here::i_am("R/Examples-LifeExpectancy.R")
	library(here)
	library(dplyr)
	library(data.table)
	library(ggplot2)
	library(scales)
	library(modelsummary)
	library(kableExtra)
	library(icaeDesign)

	# Set up data set ---------------------
	gdp_life <- DataScienceExercises::gdplifexp2007 %>%
	mutate(GDPpc_log=log(gdpPercap), LifeExp_log=log(lifeExp)) %>%
	filter(continent!="Oceania", continent!="Africa", lifeExp>50)

	# Estimate the two models--------------
	model_1 <- lm(lifeExp~gdpPercap, data = gdp_life)
	model_2 <- lm(LifeExp_log~GDPpc_log, data = gdp_life)

	# Compute predictions from models-----------
	gdp_life <- gdp_life %>%
	mutate(
	`Model 1` = predict(model_1),
	`Model 2` = predict(model_2)
	)

	# Create the visualizations------------
	## Plot 1: Level scatter plot with simple model----------
	level_plot <- ggplot(
	data = gdp_life,
	aes(x=gdpPercap, y=lifeExp)
	) +
	geom_point() +
	geom_line(
	aes(x=gdpPercap, y=`Model 1`),
	color=get_euf_colors("red"),
	linewidth=1) +
	labs(
	title = "Income and life expectancy",
	x="GDP per capita (PPP)",
	y="Life expectancy at birth"
	) +
	scale_x_continuous(
	labels = scales::label_number(scale = 0.001, suffix = "k")
	) +
	theme_icae()
	level_plot

	ggsave(
	plot = level_plot,
	filename = here("output/LifeExp_base.pdf"),
	width = 3.5, height = 3.5)

	ggsave(
	plot = level_plot +
	geom_line(
	aes(x=gdpPercap, y=exp(`Model 2`)),
	color=get_euf_colors("blue"),
	linewidth=1),
	filename = here("output/LifeExp_TwoModels.pdf"),
	width = 3.5, height = 3.5)

	## Plot 2: TA plot for the simple model----------
	TA_lin <- tibble(
	"Fitted values"=model_1$fitted.values,
	"Residuals"=model_1$residuals
	) %>%
	ggplot(aes(x=`Fitted values`,y=Residuals)) +
	geom_point() +
	theme_icae()

	ggsave(
	plot = TA_lin + labs(title = "TA plot (model 1)"),
	filename = here("output/LifeExp_lin_TA.pdf"),
	width = 3.5, height = 2.5)

	## Plot 3: Log-log scatter plot with quadratic model----------
	log_plot <- ggplot(
	data = gdp_life,
	aes(x=GDPpc_log, y=LifeExp_log)
	) +
	geom_point() +
	geom_line(
	aes(x=GDPpc_log, y=`Model 2`),
	color=get_euf_colors("blue"),
	linewidth=1) +
	labs(
	title = "Income and life expectancy (log-log)",
	x="GDP per capita (PPP, log)",
	y="Life expectancy at birth (log)"
	) +
	theme_icae() +
	theme(legend.position = "none")
	log_plot

	ggsave(
	plot = log_plot,
	filename = here("output/LifeExp_log.pdf"),
	width = 3.5, height = 3.5)

	## Plot 4: TA plot for the log-log model---------
	TA_log <- tibble(
	"Fitted values"=model_2$fitted.values,
	"Residuals"=model_2$residuals
	) %>%
	ggplot(aes(x=`Fitted values`,y=Residuals)) +
	geom_point() +
	theme_icae()

	ggsave(
	plot = TA_log + labs(title = "TA plot (model 2)"),
	filename = here("output/LifeExp_log_TA.pdf"),
	width = 3.5, height = 2.5)
	here::i_am("R/Exercise-1-Beer.R")
	library(DataScienceExercises)
	library(dplyr)
	library(texreg) # To visualize models in a readable way
	library(kableExtra) # If you want to save the table as PDF

	beer_data <- DataScienceExercises::beer %>%
	dplyr::mutate(income = income/1000)

	model_1 <- lm(
	formula = consumption ~ price,
	data=beer_data)
	model_2 <- lm(
	formula = consumption ~ price + income,
	data=beer_data)
	model_3 <- lm(
	formula = consumption ~ price + price_liquor + price_other + income,
	data=beer_data)
	model_4 <- lm(
	formula = consumption ~ price + price_other + income,
	data=beer_data)

	# Print model results nicely on screen:
	texreg::screenreg(
	l = list(
	model_1, model_2, model_3, model_4),
	digits = 4)

	# Create a nice PDF with the regression table:
	modelsummary::modelsummary(
	models = list(model_1, model_2, model_3, model_4),
	gof_map = c("nobs", "r.squared", "adj.r.squared", "rmse"),
	stars = TRUE, output = "kableExtra") %>%
	kableExtra::save_kable(file = "Exercise-1-Beer.pdf")
	here::i_am("JournalExample.R")
	library(DataScienceExercises)
	library(moderndive)
	library(dplyr)
	library(tidyr)
	library(ggplot2)
	library(icaeDesign)

	# Multiple regression with the journal data set----------------------
	journal_data <- DataScienceExercises::econjournals %>%
	dplyr::filter(papers>10, sub_price<5000)

	# The parallel slopes model------------
	journal_linmod_psm <- lm(sub_price~pages_py+publisher_type, data = journal_data)
	get_regression_table(journal_linmod_psm)

	journal_linmod_pslopes <- journal_data %>%
	ggplot(data = ., aes(x=pages_py, y=sub_price, color=publisher_type)) +
	geom_point() +
	geom_parallel_slopes(se = FALSE) +
	labs(y="Subscription price", x = "Page length") +
	scale_color_brewer(palette = "Set1", direction = -1) +
	coord_cartesian(xlim = c(0, 2800)) +
	theme_icae() + guides(col=guide_legend(title = "Publisher type: ")) +
	theme(legend.position = "bottom", legend.title = element_text())
	journal_linmod_pslopes

	# The interaction model------------

	journal_linmod_intct <- lm(sub_price~pages_py*publisher_type, data = journal_data)
	get_regression_table(journal_linmod_intct)

	journal_linmod_interact <- journal_data %>%
	ggplot(data = ., aes(x=pages_py, y=sub_price, color=publisher_type)) +
	geom_point() +
	stat_smooth(method="lm", fullrange=TRUE, se=FALSE) +
	labs(y="Subscription price", x = "Page length") +
	scale_color_brewer(palette = "Set1", direction = -1) +
	coord_cartesian(xlim = c(0, 2800)) +
	theme_icae() + guides(col=guide_legend(title = "Publisher type: ")) +
	theme(legend.position = "bottom", legend.title = element_text())
	journal_linmod_interact

	journal_linmod_interact_zoom <- journal_linmod_interact +
	coord_cartesian(xlim = c(-10, 500), ylim = c(-10, 500)) +
	stat_smooth(method="lm", fullrange=TRUE, se=FALSE) +
	geom_hline(yintercept = 0) +
	geom_vline(xintercept = 0) +
	theme(legend.position = "none")
	journal_linmod_interact_zoom

	# Comparing the R-squareds-------------

	summary(journal_linmod_intct)[["r.squared"]]
	summary(journal_linmod_intct)[["adj.r.squared"]]

	summary(journal_linmod_psm)[["r.squared"]]
	summary(journal_linmod_psm)[["adj.r.squared"]]

	# Compare the residual plots-----------
	TA_pslopes <- tibble(
	"Fitted"=journal_linmod_psm$fitted.values,
	"Residuals"=journal_linmod_psm$residuals
	) %>%
	ggplot(aes(x=Fitted, y=Residuals)) +
	geom_point() + geom_hline(yintercept = 0) +
	labs(title = "The parallel slopes model") +
	theme_icae()
	TA_pslopes

	TA_interaction <- tibble(
	"Fitted"=journal_linmod_intct$fitted.values,
	"Residuals"=journal_linmod_intct$residuals
	) %>%
	ggplot(aes(x=Fitted, y=Residuals)) +
	geom_point() + geom_hline(yintercept = 0) +
	labs(title = "The interaction model") +
	theme_icae()
	TA_interaction
	here::i_am("LifeExpExample.R")
	library(DataScienceExercises)
	library(moderndive)
	library(dplyr)
	library(tidyr)
	library(ggplot2)
	library(icaeDesign)

	life_exp <- DataScienceExercises::gdplifexp2007 %>%
	dplyr::select(all_of(c("continent", "lifeExp", "gdpPercap"))) %>%
	dplyr::mutate(continent=as.factor(continent))

	# Categorical variables within the multiple regression case--------------------
	cont_gdp_linmod <- lm(lifeExp~gdpPercap+continent, data = life_exp)
	get_regression_table(cont_gdp_linmod)

	life_exp_scaled <- life_exp %>%
	dplyr::mutate(
	gdpPercap = gdpPercap/1000
	)

	# Easier to read:
	cont_gdp_linmod_sc <- lm(lifeExp~gdpPercap+continent, data = life_exp_scaled)
	get_regression_table(cont_gdp_linmod_sc) %>%
	dplyr::select(term, estimate, std_error)

	# Linear mode:
	life_model_lm <- lm(lifeExp~continent, data = life_exp_scaled)
	summary(life_model_lm)

	# Get same result via data wrangling and manipulation:
	MeanAfrica <- life_exp_scaled %>%
	filter(continent=="Africa") %>%
	summarize(lifeExp=mean(lifeExp)) %>%
	pull(lifeExp)

	life_exp_scaled %>%
	summarise(MeanLifeExp = mean(lifeExp), .by = "continent") %>%
	mutate(DifferenceAfrica = MeanLifeExp - MeanAfrica)
	here::i_am("ols-distribution.R")
	library(here)
	library(tidyr)
	library(data.table)
	library(dplyr)
	library(ggplot2)
	library(scales)
	library(purrr)
	library(icaeDesign)
	library(ggpubr)
	library(latex2exp)

	# True population----------------------
	set.seed(123)
	pop_size <- 5000
	beta_0_pop <- 1000
	beta_1_pop <- 3.5

	labor_input <- runif(n = pop_size, min = 50, max = 500)
	random_error <- rnorm(n = pop_size, mean = 0, sd = 100)

	firm_output <- beta_0_pop + beta_1_pop * labor_input + random_error

	pop_data <- tibble(
	"output" = firm_output,
	"labor" = labor_input
	)

	output_dist_pop <- ggplot(data = pop_data, mapping = aes(x=output)) +
	geom_histogram(alpha=0.5, binwidth = 50) +
	scale_y_continuous(expand = expansion(add = c(0, 20))) +
	labs(title = "Distribution of output", x = "Firm output", y = "Number of firms") +
	theme_icae()

	relationship_pop <- ggplot(
	data = pop_data, mapping = aes(x = labor, y=output)) +
	geom_point(alpha=0.25, color="white", shape=21, fill="black") +
	geom_smooth(method = "lm", se = FALSE) +
	labs(title = "True relationship", x = "Labor input", y = "Firm output") +
	theme_icae()
	relationship_pop

	pop_overview <- ggarrange(output_dist_pop, relationship_pop, ncol = 2)
	pop_overview <- annotate_figure(pop_overview, top = "The simulated population")
	ggsave(
	plot = pop_overview,
	filename = here("output/pop_overview.pdf"),
	width = 7, height = 3)

	# Estimation----------------------
	set.seed(1234)
	iterations <- 1000
	sample_size <- 150

	get_estimates <- function(s_size){
	sample_drawn <- slice_sample(pop_data, n = sample_size)
	estimated_model <- lm(formula = output~labor, data = sample_drawn)
	coefs <- tibble(
	"beta_0"=estimated_model$coefficients["(Intercept)"],
	"beta_1"=estimated_model$coefficients["labor"])
	}

	estimates <- map(
	.x = seq_len(iterations),
	.f = ~ get_estimates(s_size = sample_size))
	estimates_tib <- list_rbind(estimates)

	# Visualizaiton----------------------------------
	beta_0_dist <- ggplot(data = estimates_tib, mapping = aes(x=beta_0)) +
	geom_histogram(binwidth = 5, fill = get_euf_colors("blue")) +
	scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +
	labs(
	title = TeX("$\\beta_0=1000$"),
	x = TeX("$\\hat{\\beta}_0$"),
	y = "Frequency"
	) +
	geom_vline(xintercept = beta_0_pop) +
	theme_icae() +
	theme(plot.title = element_text(size=12))

	beta_1_dist <- ggplot(data = estimates_tib, mapping = aes(x=beta_1)) +
	geom_histogram(binwidth = 0.01, fill = get_euf_colors("blue")) +
	scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +
	labs(
	title = TeX("$\\beta_1=3.5$"),
	x = TeX("$\\hat{\\beta}_1$"),
	y = "Frequency"
	) +
	geom_vline(xintercept = beta_1_pop) +
	theme_icae() +
	theme(plot.title = element_text(size=12))

	estimations_figure <- ggarrange(beta_0_dist, beta_1_dist, ncol = 2)
	estimations_figure <- annotate_figure(
	p = estimations_figure,
	top = "Sampling distributions of the estimated parameters")
	ggsave(
	plot = estimations_figure,
	filename = here("output/estimations_figure.pdf"),
	width = 7, height = 3)

	# Quantitative result-------------------
	library(kableExtra)
	estimates_tib %>%
	pivot_longer(
	cols = everything(),
	names_to = "Parameter",
	values_to = "Value") %>%
	summarise(
	Mean = mean(Value),
	SD = sd(Value), .by = "Parameter"
	) %>%
	kable(digits = 3, format = "latex", booktabs=TRUE) %>%
	kable_styling() %>%
	save_kable(file = here("output/estimations_table.pdf"), bs_theme = "flatly")

	set.seed(1234)

	expl_model <- lm(
	formula = output~labor,
	data = slice_sample(pop_data, n = sample_size))

	expl_model_summary <- summary(expl_model)
	b1_est <- expl_model_summary$coefficients[2,1]
	b1_sd <- expl_model_summary$coefficients[2,2]
	lower_ci <- b1_est - (1.96*b1_sd)
	upper_ci <- b1_est + (1.96*b1_sd)
	c(lower_ci,upper_ci)

	moderndive::get_regression_table(
	model = expl_model,
	digits = 2) %>%
	kable(digits = 3, format = "latex", booktabs=TRUE) %>%
	kable_styling() %>%
	save_kable(file = here("output/lm_table.pdf"), bs_theme = "flatly")
	here::i_am("wine-recap.R")
	library(DataScienceExercises)
	library(dplyr)
	library(tidyr)

	wine_data <- DataScienceExercises::wine2dine

	wine_reg <- lm(`residual sugar` ~ kind, data = wine_data)

	summary(wine_reg)

	wine_data %>%
	summarise(MeanResidualSugar = mean(`residual sugar`), .by = "kind") %>%
	pivot_wider(names_from = "kind", values_from = "MeanResidualSugar") %>%
	mutate(DifferenceRed = white - red)