Skip to content

Instantly share code, notes, and snippets.

@graebnerc
Last active June 25, 2023 22:58
Show Gist options
  • Save graebnerc/2696373159c2853ef28d1bc60d529dc8 to your computer and use it in GitHub Desktop.
Save graebnerc/2696373159c2853ef28d1bc60d529dc8 to your computer and use it in GitHub Desktop.
The examples used during session 17 and 18 on multiple linear regression, as well as possible solutions to the exercises.
Contains solutions to the exercises as well as the code used to create the relevant figures in the slides.
The code assumes a standard folder structure and might need to be adjusted to your case.
here::i_am("R/Examples-Durkheim.R")
library(here)
library(dplyr)
library(data.table)
library(ggplot2)
library(modelsummary)
library(kableExtra)
library(icaeDesign)
# Create the artificial Durkheim data set--------
set.seed(123)
n_obs <- 1000
social_cohesion <- seq(-0.5, 0.5, length.out = n_obs)
p_suicide <- social_cohesion**2 + rnorm(n_obs, mean = 0, sd = 0.005)
suicide_data <- tibble(
`Social cohesion`=social_cohesion,
`Probability for suicide`=p_suicide
) %>%
dplyr::mutate(
Society = ifelse(`Social cohesion`<=0, "Modern", "Archaic"),
`Social cohesion squared` = `Social cohesion`**2
)
data.table::fwrite(
x = suicide_data, file = here::here("data/suicide_data.csv"))
# Estimate the two models--------------
linmod <- lm(
formula = `Probability for suicide`~`Social cohesion`,
data = suicide_data)
quadmod <- lm(
formula = `Probability for suicide`~`Social cohesion`+I(`Social cohesion`**2),
data = suicide_data)
# Create a PDF of the regression table:
modelsummary(
list("Model 1"=linmod, "Model 2"=quadmod),
output = "kableExtra", gof_map = c("r.squared"),
statistic = NULL) %>%
kableExtra::save_kable(
file = here("output/RegressionTable-Durkheim.pdf"))
# Add the predictions of the models to the data
suicide_data_predictions <- suicide_data %>%
mutate(
`Model 1` = predict(linmod),
`Model 2` = predict(quadmod)
) %>%
select(`Social cohesion`, `Model 1`, `Model 2`) %>%
tidyr::pivot_longer(
cols = c("Model 1", "Model 2"),
names_to = "Model",
values_to = "Prediction")
# Create the three plots---------------
## Plot 1: the data--------------------
suicide_data_plot <- ggplot(
data = suicide_data,
mapping = aes(x=`Social cohesion`, y=`Probability for suicide`)
) +
geom_point(alpha=0.25, size=0.2, show.legend = FALSE) +
labs(title = "Suicides a la Durkheim") +
theme_icae()
suicide_data_plot
ggsave(plot = suicide_data_plot,
filename = here("output/DurkheimData.pdf"),
width = 3.5, height = 3.5)
## Plot 2: the data with cohesion squared--------------------
suicide_data_plot_squared <- ggplot(
data = suicide_data,
mapping = aes(x=`Social cohesion squared`, y=`Probability for suicide`)
) +
geom_point(alpha=0.25, size=0.2, show.legend = FALSE) +
labs(title = "Suicides a la Durkheim") +
theme_icae()
suicide_data_plot_squared
ggsave(plot = suicide_data_plot_squared,
filename = here("output/DurkheimData_squared.pdf"),
width = 3.5, height = 3.5)
## Plot 3: the data with cohesion squared--------------------
suicide_data_plot_predict_plot <- ggplot(
data = suicide_data,
mapping = aes(x=`Social cohesion`, y=`Probability for suicide`)
) +
geom_point(alpha=0.25, size=0.2, show.legend = FALSE) +
geom_line(
data = suicide_data_predictions,
mapping = aes(x=`Social cohesion`, y=Prediction, color=Model), linewidth=1
) +
scale_color_euf(palette = "mixed") +
labs(title = "Suicides a la Durkheim") +
theme_icae() +
theme(legend.position = "bottom")
suicide_data_plot_predict_plot
ggsave(plot = suicide_data_plot_predict_plot,
filename = here("output/Durkheim-Predictions.pdf"),
width = 3.5, height = 3.5)
## Tukey-Anscombe plots-------------------
simple_model_TA_plot <- tibble(
"Fitted values" = linmod$fitted.values,
"Residuals" = linmod$residuals
) %>%
ggplot(mapping = aes(x = `Fitted values`, y = `Residuals`)) +
geom_point(alpha=0.25) +
labs(title = "TA plot (model 1)") +
theme_icae()
simple_model_TA_plot
ggsave(plot = simple_model_TA_plot,
filename = here("output/Durkheim-TA-Simple.pdf"),
width = 3.5, height = 3.5)
quadratic_model_plot <- tibble(
"Fitted values" = quadmod$fitted.values,
"Residuals" = quadmod$residuals
) %>%
ggplot(mapping = aes(x = `Fitted values`, y = `Residuals`)) +
geom_point(alpha=0.25) +
labs(title = "TA plot (model 2)") +
theme_icae()
quadratic_model_plot
ggsave(plot = quadratic_model_plot,
filename = here("output/Durkheim-TA-Quadratic.pdf"),
width = 3.5, height = 3.5)
here::i_am("R/Examples-LifeExpectancy.R")
library(here)
library(dplyr)
library(data.table)
library(ggplot2)
library(scales)
library(modelsummary)
library(kableExtra)
library(icaeDesign)
# Set up data set ---------------------
gdp_life <- DataScienceExercises::gdplifexp2007 %>%
mutate(GDPpc_log=log(gdpPercap), LifeExp_log=log(lifeExp)) %>%
filter(continent!="Oceania", continent!="Africa", lifeExp>50)
# Estimate the two models--------------
model_1 <- lm(lifeExp~gdpPercap, data = gdp_life)
model_2 <- lm(LifeExp_log~GDPpc_log, data = gdp_life)
# Compute predictions from models-----------
gdp_life <- gdp_life %>%
mutate(
`Model 1` = predict(model_1),
`Model 2` = predict(model_2)
)
# Create the visualizations------------
## Plot 1: Level scatter plot with simple model----------
level_plot <- ggplot(
data = gdp_life,
aes(x=gdpPercap, y=lifeExp)
) +
geom_point() +
geom_line(
aes(x=gdpPercap, y=`Model 1`),
color=get_euf_colors("red"),
linewidth=1) +
labs(
title = "Income and life expectancy",
x="GDP per capita (PPP)",
y="Life expectancy at birth"
) +
scale_x_continuous(
labels = scales::label_number(scale = 0.001, suffix = "k")
) +
theme_icae()
level_plot
ggsave(
plot = level_plot,
filename = here("output/LifeExp_base.pdf"),
width = 3.5, height = 3.5)
ggsave(
plot = level_plot +
geom_line(
aes(x=gdpPercap, y=exp(`Model 2`)),
color=get_euf_colors("blue"),
linewidth=1),
filename = here("output/LifeExp_TwoModels.pdf"),
width = 3.5, height = 3.5)
## Plot 2: TA plot for the simple model----------
TA_lin <- tibble(
"Fitted values"=model_1$fitted.values,
"Residuals"=model_1$residuals
) %>%
ggplot(aes(x=`Fitted values`,y=Residuals)) +
geom_point() +
theme_icae()
ggsave(
plot = TA_lin + labs(title = "TA plot (model 1)"),
filename = here("output/LifeExp_lin_TA.pdf"),
width = 3.5, height = 2.5)
## Plot 3: Log-log scatter plot with quadratic model----------
log_plot <- ggplot(
data = gdp_life,
aes(x=GDPpc_log, y=LifeExp_log)
) +
geom_point() +
geom_line(
aes(x=GDPpc_log, y=`Model 2`),
color=get_euf_colors("blue"),
linewidth=1) +
labs(
title = "Income and life expectancy (log-log)",
x="GDP per capita (PPP, log)",
y="Life expectancy at birth (log)"
) +
theme_icae() +
theme(legend.position = "none")
log_plot
ggsave(
plot = log_plot,
filename = here("output/LifeExp_log.pdf"),
width = 3.5, height = 3.5)
## Plot 4: TA plot for the log-log model---------
TA_log <- tibble(
"Fitted values"=model_2$fitted.values,
"Residuals"=model_2$residuals
) %>%
ggplot(aes(x=`Fitted values`,y=Residuals)) +
geom_point() +
theme_icae()
ggsave(
plot = TA_log + labs(title = "TA plot (model 2)"),
filename = here("output/LifeExp_log_TA.pdf"),
width = 3.5, height = 2.5)
here::i_am("R/Exercise-1-Beer.R")
library(DataScienceExercises)
library(dplyr)
library(texreg) # To visualize models in a readable way
library(kableExtra) # If you want to save the table as PDF
beer_data <- DataScienceExercises::beer %>%
dplyr::mutate(income = income/1000)
model_1 <- lm(
formula = consumption ~ price,
data=beer_data)
model_2 <- lm(
formula = consumption ~ price + income,
data=beer_data)
model_3 <- lm(
formula = consumption ~ price + price_liquor + price_other + income,
data=beer_data)
model_4 <- lm(
formula = consumption ~ price + price_other + income,
data=beer_data)
# Print model results nicely on screen:
texreg::screenreg(
l = list(
model_1, model_2, model_3, model_4),
digits = 4)
# Create a nice PDF with the regression table:
modelsummary::modelsummary(
models = list(model_1, model_2, model_3, model_4),
gof_map = c("nobs", "r.squared", "adj.r.squared", "rmse"),
stars = TRUE, output = "kableExtra") %>%
kableExtra::save_kable(file = "Exercise-1-Beer.pdf")
here::i_am("JournalExample.R")
library(DataScienceExercises)
library(moderndive)
library(dplyr)
library(tidyr)
library(ggplot2)
library(icaeDesign)
# Multiple regression with the journal data set----------------------
journal_data <- DataScienceExercises::econjournals %>%
dplyr::filter(papers>10, sub_price<5000)
# The parallel slopes model------------
journal_linmod_psm <- lm(sub_price~pages_py+publisher_type, data = journal_data)
get_regression_table(journal_linmod_psm)
journal_linmod_pslopes <- journal_data %>%
ggplot(data = ., aes(x=pages_py, y=sub_price, color=publisher_type)) +
geom_point() +
geom_parallel_slopes(se = FALSE) +
labs(y="Subscription price", x = "Page length") +
scale_color_brewer(palette = "Set1", direction = -1) +
coord_cartesian(xlim = c(0, 2800)) +
theme_icae() + guides(col=guide_legend(title = "Publisher type: ")) +
theme(legend.position = "bottom", legend.title = element_text())
journal_linmod_pslopes
# The interaction model------------
journal_linmod_intct <- lm(sub_price~pages_py*publisher_type, data = journal_data)
get_regression_table(journal_linmod_intct)
journal_linmod_interact <- journal_data %>%
ggplot(data = ., aes(x=pages_py, y=sub_price, color=publisher_type)) +
geom_point() +
stat_smooth(method="lm", fullrange=TRUE, se=FALSE) +
labs(y="Subscription price", x = "Page length") +
scale_color_brewer(palette = "Set1", direction = -1) +
coord_cartesian(xlim = c(0, 2800)) +
theme_icae() + guides(col=guide_legend(title = "Publisher type: ")) +
theme(legend.position = "bottom", legend.title = element_text())
journal_linmod_interact
journal_linmod_interact_zoom <- journal_linmod_interact +
coord_cartesian(xlim = c(-10, 500), ylim = c(-10, 500)) +
stat_smooth(method="lm", fullrange=TRUE, se=FALSE) +
geom_hline(yintercept = 0) +
geom_vline(xintercept = 0) +
theme(legend.position = "none")
journal_linmod_interact_zoom
# Comparing the R-squareds-------------
summary(journal_linmod_intct)[["r.squared"]]
summary(journal_linmod_intct)[["adj.r.squared"]]
summary(journal_linmod_psm)[["r.squared"]]
summary(journal_linmod_psm)[["adj.r.squared"]]
# Compare the residual plots-----------
TA_pslopes <- tibble(
"Fitted"=journal_linmod_psm$fitted.values,
"Residuals"=journal_linmod_psm$residuals
) %>%
ggplot(aes(x=Fitted, y=Residuals)) +
geom_point() + geom_hline(yintercept = 0) +
labs(title = "The parallel slopes model") +
theme_icae()
TA_pslopes
TA_interaction <- tibble(
"Fitted"=journal_linmod_intct$fitted.values,
"Residuals"=journal_linmod_intct$residuals
) %>%
ggplot(aes(x=Fitted, y=Residuals)) +
geom_point() + geom_hline(yintercept = 0) +
labs(title = "The interaction model") +
theme_icae()
TA_interaction
here::i_am("LifeExpExample.R")
library(DataScienceExercises)
library(moderndive)
library(dplyr)
library(tidyr)
library(ggplot2)
library(icaeDesign)
life_exp <- DataScienceExercises::gdplifexp2007 %>%
dplyr::select(all_of(c("continent", "lifeExp", "gdpPercap"))) %>%
dplyr::mutate(continent=as.factor(continent))
# Categorical variables within the multiple regression case--------------------
cont_gdp_linmod <- lm(lifeExp~gdpPercap+continent, data = life_exp)
get_regression_table(cont_gdp_linmod)
life_exp_scaled <- life_exp %>%
dplyr::mutate(
gdpPercap = gdpPercap/1000
)
# Easier to read:
cont_gdp_linmod_sc <- lm(lifeExp~gdpPercap+continent, data = life_exp_scaled)
get_regression_table(cont_gdp_linmod_sc) %>%
dplyr::select(term, estimate, std_error)
# Linear mode:
life_model_lm <- lm(lifeExp~continent, data = life_exp_scaled)
summary(life_model_lm)
# Get same result via data wrangling and manipulation:
MeanAfrica <- life_exp_scaled %>%
filter(continent=="Africa") %>%
summarize(lifeExp=mean(lifeExp)) %>%
pull(lifeExp)
life_exp_scaled %>%
summarise(MeanLifeExp = mean(lifeExp), .by = "continent") %>%
mutate(DifferenceAfrica = MeanLifeExp - MeanAfrica)
here::i_am("ols-distribution.R")
library(here)
library(tidyr)
library(data.table)
library(dplyr)
library(ggplot2)
library(scales)
library(purrr)
library(icaeDesign)
library(ggpubr)
library(latex2exp)
# True population----------------------
set.seed(123)
pop_size <- 5000
beta_0_pop <- 1000
beta_1_pop <- 3.5
labor_input <- runif(n = pop_size, min = 50, max = 500)
random_error <- rnorm(n = pop_size, mean = 0, sd = 100)
firm_output <- beta_0_pop + beta_1_pop * labor_input + random_error
pop_data <- tibble(
"output" = firm_output,
"labor" = labor_input
)
output_dist_pop <- ggplot(data = pop_data, mapping = aes(x=output)) +
geom_histogram(alpha=0.5, binwidth = 50) +
scale_y_continuous(expand = expansion(add = c(0, 20))) +
labs(title = "Distribution of output", x = "Firm output", y = "Number of firms") +
theme_icae()
relationship_pop <- ggplot(
data = pop_data, mapping = aes(x = labor, y=output)) +
geom_point(alpha=0.25, color="white", shape=21, fill="black") +
geom_smooth(method = "lm", se = FALSE) +
labs(title = "True relationship", x = "Labor input", y = "Firm output") +
theme_icae()
relationship_pop
pop_overview <- ggarrange(output_dist_pop, relationship_pop, ncol = 2)
pop_overview <- annotate_figure(pop_overview, top = "The simulated population")
ggsave(
plot = pop_overview,
filename = here("output/pop_overview.pdf"),
width = 7, height = 3)
# Estimation----------------------
set.seed(1234)
iterations <- 1000
sample_size <- 150
get_estimates <- function(s_size){
sample_drawn <- slice_sample(pop_data, n = sample_size)
estimated_model <- lm(formula = output~labor, data = sample_drawn)
coefs <- tibble(
"beta_0"=estimated_model$coefficients["(Intercept)"],
"beta_1"=estimated_model$coefficients["labor"])
}
estimates <- map(
.x = seq_len(iterations),
.f = ~ get_estimates(s_size = sample_size))
estimates_tib <- list_rbind(estimates)
# Visualizaiton----------------------------------
beta_0_dist <- ggplot(data = estimates_tib, mapping = aes(x=beta_0)) +
geom_histogram(binwidth = 5, fill = get_euf_colors("blue")) +
scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +
labs(
title = TeX("$\\beta_0=1000$"),
x = TeX("$\\hat{\\beta}_0$"),
y = "Frequency"
) +
geom_vline(xintercept = beta_0_pop) +
theme_icae() +
theme(plot.title = element_text(size=12))
beta_1_dist <- ggplot(data = estimates_tib, mapping = aes(x=beta_1)) +
geom_histogram(binwidth = 0.01, fill = get_euf_colors("blue")) +
scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +
labs(
title = TeX("$\\beta_1=3.5$"),
x = TeX("$\\hat{\\beta}_1$"),
y = "Frequency"
) +
geom_vline(xintercept = beta_1_pop) +
theme_icae() +
theme(plot.title = element_text(size=12))
estimations_figure <- ggarrange(beta_0_dist, beta_1_dist, ncol = 2)
estimations_figure <- annotate_figure(
p = estimations_figure,
top = "Sampling distributions of the estimated parameters")
ggsave(
plot = estimations_figure,
filename = here("output/estimations_figure.pdf"),
width = 7, height = 3)
# Quantitative result-------------------
library(kableExtra)
estimates_tib %>%
pivot_longer(
cols = everything(),
names_to = "Parameter",
values_to = "Value") %>%
summarise(
Mean = mean(Value),
SD = sd(Value), .by = "Parameter"
) %>%
kable(digits = 3, format = "latex", booktabs=TRUE) %>%
kable_styling() %>%
save_kable(file = here("output/estimations_table.pdf"), bs_theme = "flatly")
set.seed(1234)
expl_model <- lm(
formula = output~labor,
data = slice_sample(pop_data, n = sample_size))
expl_model_summary <- summary(expl_model)
b1_est <- expl_model_summary$coefficients[2,1]
b1_sd <- expl_model_summary$coefficients[2,2]
lower_ci <- b1_est - (1.96*b1_sd)
upper_ci <- b1_est + (1.96*b1_sd)
c(lower_ci,upper_ci)
moderndive::get_regression_table(
model = expl_model,
digits = 2) %>%
kable(digits = 3, format = "latex", booktabs=TRUE) %>%
kable_styling() %>%
save_kable(file = here("output/lm_table.pdf"), bs_theme = "flatly")
here::i_am("wine-recap.R")
library(DataScienceExercises)
library(dplyr)
library(tidyr)
wine_data <- DataScienceExercises::wine2dine
wine_reg <- lm(`residual sugar` ~ kind, data = wine_data)
summary(wine_reg)
wine_data %>%
summarise(MeanResidualSugar = mean(`residual sugar`), .by = "kind") %>%
pivot_wider(names_from = "kind", values_from = "MeanResidualSugar") %>%
mutate(DifferenceRed = white - red)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment