Last active
June 25, 2023 22:58
-
-
Save graebnerc/2696373159c2853ef28d1bc60d529dc8 to your computer and use it in GitHub Desktop.
The examples used during session 17 and 18 on multiple linear regression, as well as possible solutions to the exercises.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Contains solutions to the exercises as well as the code used to create the relevant figures in the slides. | |
The code assumes a standard folder structure and might need to be adjusted to your case. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
here::i_am("R/Examples-Durkheim.R") | |
library(here) | |
library(dplyr) | |
library(data.table) | |
library(ggplot2) | |
library(modelsummary) | |
library(kableExtra) | |
library(icaeDesign) | |
# Create the artificial Durkheim data set-------- | |
set.seed(123) | |
n_obs <- 1000 | |
social_cohesion <- seq(-0.5, 0.5, length.out = n_obs) | |
p_suicide <- social_cohesion**2 + rnorm(n_obs, mean = 0, sd = 0.005) | |
suicide_data <- tibble( | |
`Social cohesion`=social_cohesion, | |
`Probability for suicide`=p_suicide | |
) %>% | |
dplyr::mutate( | |
Society = ifelse(`Social cohesion`<=0, "Modern", "Archaic"), | |
`Social cohesion squared` = `Social cohesion`**2 | |
) | |
data.table::fwrite( | |
x = suicide_data, file = here::here("data/suicide_data.csv")) | |
# Estimate the two models-------------- | |
linmod <- lm( | |
formula = `Probability for suicide`~`Social cohesion`, | |
data = suicide_data) | |
quadmod <- lm( | |
formula = `Probability for suicide`~`Social cohesion`+I(`Social cohesion`**2), | |
data = suicide_data) | |
# Create a PDF of the regression table: | |
modelsummary( | |
list("Model 1"=linmod, "Model 2"=quadmod), | |
output = "kableExtra", gof_map = c("r.squared"), | |
statistic = NULL) %>% | |
kableExtra::save_kable( | |
file = here("output/RegressionTable-Durkheim.pdf")) | |
# Add the predictions of the models to the data | |
suicide_data_predictions <- suicide_data %>% | |
mutate( | |
`Model 1` = predict(linmod), | |
`Model 2` = predict(quadmod) | |
) %>% | |
select(`Social cohesion`, `Model 1`, `Model 2`) %>% | |
tidyr::pivot_longer( | |
cols = c("Model 1", "Model 2"), | |
names_to = "Model", | |
values_to = "Prediction") | |
# Create the three plots--------------- | |
## Plot 1: the data-------------------- | |
suicide_data_plot <- ggplot( | |
data = suicide_data, | |
mapping = aes(x=`Social cohesion`, y=`Probability for suicide`) | |
) + | |
geom_point(alpha=0.25, size=0.2, show.legend = FALSE) + | |
labs(title = "Suicides a la Durkheim") + | |
theme_icae() | |
suicide_data_plot | |
ggsave(plot = suicide_data_plot, | |
filename = here("output/DurkheimData.pdf"), | |
width = 3.5, height = 3.5) | |
## Plot 2: the data with cohesion squared-------------------- | |
suicide_data_plot_squared <- ggplot( | |
data = suicide_data, | |
mapping = aes(x=`Social cohesion squared`, y=`Probability for suicide`) | |
) + | |
geom_point(alpha=0.25, size=0.2, show.legend = FALSE) + | |
labs(title = "Suicides a la Durkheim") + | |
theme_icae() | |
suicide_data_plot_squared | |
ggsave(plot = suicide_data_plot_squared, | |
filename = here("output/DurkheimData_squared.pdf"), | |
width = 3.5, height = 3.5) | |
## Plot 3: the data with cohesion squared-------------------- | |
suicide_data_plot_predict_plot <- ggplot( | |
data = suicide_data, | |
mapping = aes(x=`Social cohesion`, y=`Probability for suicide`) | |
) + | |
geom_point(alpha=0.25, size=0.2, show.legend = FALSE) + | |
geom_line( | |
data = suicide_data_predictions, | |
mapping = aes(x=`Social cohesion`, y=Prediction, color=Model), linewidth=1 | |
) + | |
scale_color_euf(palette = "mixed") + | |
labs(title = "Suicides a la Durkheim") + | |
theme_icae() + | |
theme(legend.position = "bottom") | |
suicide_data_plot_predict_plot | |
ggsave(plot = suicide_data_plot_predict_plot, | |
filename = here("output/Durkheim-Predictions.pdf"), | |
width = 3.5, height = 3.5) | |
## Tukey-Anscombe plots------------------- | |
simple_model_TA_plot <- tibble( | |
"Fitted values" = linmod$fitted.values, | |
"Residuals" = linmod$residuals | |
) %>% | |
ggplot(mapping = aes(x = `Fitted values`, y = `Residuals`)) + | |
geom_point(alpha=0.25) + | |
labs(title = "TA plot (model 1)") + | |
theme_icae() | |
simple_model_TA_plot | |
ggsave(plot = simple_model_TA_plot, | |
filename = here("output/Durkheim-TA-Simple.pdf"), | |
width = 3.5, height = 3.5) | |
quadratic_model_plot <- tibble( | |
"Fitted values" = quadmod$fitted.values, | |
"Residuals" = quadmod$residuals | |
) %>% | |
ggplot(mapping = aes(x = `Fitted values`, y = `Residuals`)) + | |
geom_point(alpha=0.25) + | |
labs(title = "TA plot (model 2)") + | |
theme_icae() | |
quadratic_model_plot | |
ggsave(plot = quadratic_model_plot, | |
filename = here("output/Durkheim-TA-Quadratic.pdf"), | |
width = 3.5, height = 3.5) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
here::i_am("R/Examples-LifeExpectancy.R") | |
library(here) | |
library(dplyr) | |
library(data.table) | |
library(ggplot2) | |
library(scales) | |
library(modelsummary) | |
library(kableExtra) | |
library(icaeDesign) | |
# Set up data set --------------------- | |
gdp_life <- DataScienceExercises::gdplifexp2007 %>% | |
mutate(GDPpc_log=log(gdpPercap), LifeExp_log=log(lifeExp)) %>% | |
filter(continent!="Oceania", continent!="Africa", lifeExp>50) | |
# Estimate the two models-------------- | |
model_1 <- lm(lifeExp~gdpPercap, data = gdp_life) | |
model_2 <- lm(LifeExp_log~GDPpc_log, data = gdp_life) | |
# Compute predictions from models----------- | |
gdp_life <- gdp_life %>% | |
mutate( | |
`Model 1` = predict(model_1), | |
`Model 2` = predict(model_2) | |
) | |
# Create the visualizations------------ | |
## Plot 1: Level scatter plot with simple model---------- | |
level_plot <- ggplot( | |
data = gdp_life, | |
aes(x=gdpPercap, y=lifeExp) | |
) + | |
geom_point() + | |
geom_line( | |
aes(x=gdpPercap, y=`Model 1`), | |
color=get_euf_colors("red"), | |
linewidth=1) + | |
labs( | |
title = "Income and life expectancy", | |
x="GDP per capita (PPP)", | |
y="Life expectancy at birth" | |
) + | |
scale_x_continuous( | |
labels = scales::label_number(scale = 0.001, suffix = "k") | |
) + | |
theme_icae() | |
level_plot | |
ggsave( | |
plot = level_plot, | |
filename = here("output/LifeExp_base.pdf"), | |
width = 3.5, height = 3.5) | |
ggsave( | |
plot = level_plot + | |
geom_line( | |
aes(x=gdpPercap, y=exp(`Model 2`)), | |
color=get_euf_colors("blue"), | |
linewidth=1), | |
filename = here("output/LifeExp_TwoModels.pdf"), | |
width = 3.5, height = 3.5) | |
## Plot 2: TA plot for the simple model---------- | |
TA_lin <- tibble( | |
"Fitted values"=model_1$fitted.values, | |
"Residuals"=model_1$residuals | |
) %>% | |
ggplot(aes(x=`Fitted values`,y=Residuals)) + | |
geom_point() + | |
theme_icae() | |
ggsave( | |
plot = TA_lin + labs(title = "TA plot (model 1)"), | |
filename = here("output/LifeExp_lin_TA.pdf"), | |
width = 3.5, height = 2.5) | |
## Plot 3: Log-log scatter plot with quadratic model---------- | |
log_plot <- ggplot( | |
data = gdp_life, | |
aes(x=GDPpc_log, y=LifeExp_log) | |
) + | |
geom_point() + | |
geom_line( | |
aes(x=GDPpc_log, y=`Model 2`), | |
color=get_euf_colors("blue"), | |
linewidth=1) + | |
labs( | |
title = "Income and life expectancy (log-log)", | |
x="GDP per capita (PPP, log)", | |
y="Life expectancy at birth (log)" | |
) + | |
theme_icae() + | |
theme(legend.position = "none") | |
log_plot | |
ggsave( | |
plot = log_plot, | |
filename = here("output/LifeExp_log.pdf"), | |
width = 3.5, height = 3.5) | |
## Plot 4: TA plot for the log-log model--------- | |
TA_log <- tibble( | |
"Fitted values"=model_2$fitted.values, | |
"Residuals"=model_2$residuals | |
) %>% | |
ggplot(aes(x=`Fitted values`,y=Residuals)) + | |
geom_point() + | |
theme_icae() | |
ggsave( | |
plot = TA_log + labs(title = "TA plot (model 2)"), | |
filename = here("output/LifeExp_log_TA.pdf"), | |
width = 3.5, height = 2.5) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
here::i_am("R/Exercise-1-Beer.R") | |
library(DataScienceExercises) | |
library(dplyr) | |
library(texreg) # To visualize models in a readable way | |
library(kableExtra) # If you want to save the table as PDF | |
beer_data <- DataScienceExercises::beer %>% | |
dplyr::mutate(income = income/1000) | |
model_1 <- lm( | |
formula = consumption ~ price, | |
data=beer_data) | |
model_2 <- lm( | |
formula = consumption ~ price + income, | |
data=beer_data) | |
model_3 <- lm( | |
formula = consumption ~ price + price_liquor + price_other + income, | |
data=beer_data) | |
model_4 <- lm( | |
formula = consumption ~ price + price_other + income, | |
data=beer_data) | |
# Print model results nicely on screen: | |
texreg::screenreg( | |
l = list( | |
model_1, model_2, model_3, model_4), | |
digits = 4) | |
# Create a nice PDF with the regression table: | |
modelsummary::modelsummary( | |
models = list(model_1, model_2, model_3, model_4), | |
gof_map = c("nobs", "r.squared", "adj.r.squared", "rmse"), | |
stars = TRUE, output = "kableExtra") %>% | |
kableExtra::save_kable(file = "Exercise-1-Beer.pdf") |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
here::i_am("JournalExample.R") | |
library(DataScienceExercises) | |
library(moderndive) | |
library(dplyr) | |
library(tidyr) | |
library(ggplot2) | |
library(icaeDesign) | |
# Multiple regression with the journal data set---------------------- | |
journal_data <- DataScienceExercises::econjournals %>% | |
dplyr::filter(papers>10, sub_price<5000) | |
# The parallel slopes model------------ | |
journal_linmod_psm <- lm(sub_price~pages_py+publisher_type, data = journal_data) | |
get_regression_table(journal_linmod_psm) | |
journal_linmod_pslopes <- journal_data %>% | |
ggplot(data = ., aes(x=pages_py, y=sub_price, color=publisher_type)) + | |
geom_point() + | |
geom_parallel_slopes(se = FALSE) + | |
labs(y="Subscription price", x = "Page length") + | |
scale_color_brewer(palette = "Set1", direction = -1) + | |
coord_cartesian(xlim = c(0, 2800)) + | |
theme_icae() + guides(col=guide_legend(title = "Publisher type: ")) + | |
theme(legend.position = "bottom", legend.title = element_text()) | |
journal_linmod_pslopes | |
# The interaction model------------ | |
journal_linmod_intct <- lm(sub_price~pages_py*publisher_type, data = journal_data) | |
get_regression_table(journal_linmod_intct) | |
journal_linmod_interact <- journal_data %>% | |
ggplot(data = ., aes(x=pages_py, y=sub_price, color=publisher_type)) + | |
geom_point() + | |
stat_smooth(method="lm", fullrange=TRUE, se=FALSE) + | |
labs(y="Subscription price", x = "Page length") + | |
scale_color_brewer(palette = "Set1", direction = -1) + | |
coord_cartesian(xlim = c(0, 2800)) + | |
theme_icae() + guides(col=guide_legend(title = "Publisher type: ")) + | |
theme(legend.position = "bottom", legend.title = element_text()) | |
journal_linmod_interact | |
journal_linmod_interact_zoom <- journal_linmod_interact + | |
coord_cartesian(xlim = c(-10, 500), ylim = c(-10, 500)) + | |
stat_smooth(method="lm", fullrange=TRUE, se=FALSE) + | |
geom_hline(yintercept = 0) + | |
geom_vline(xintercept = 0) + | |
theme(legend.position = "none") | |
journal_linmod_interact_zoom | |
# Comparing the R-squareds------------- | |
summary(journal_linmod_intct)[["r.squared"]] | |
summary(journal_linmod_intct)[["adj.r.squared"]] | |
summary(journal_linmod_psm)[["r.squared"]] | |
summary(journal_linmod_psm)[["adj.r.squared"]] | |
# Compare the residual plots----------- | |
TA_pslopes <- tibble( | |
"Fitted"=journal_linmod_psm$fitted.values, | |
"Residuals"=journal_linmod_psm$residuals | |
) %>% | |
ggplot(aes(x=Fitted, y=Residuals)) + | |
geom_point() + geom_hline(yintercept = 0) + | |
labs(title = "The parallel slopes model") + | |
theme_icae() | |
TA_pslopes | |
TA_interaction <- tibble( | |
"Fitted"=journal_linmod_intct$fitted.values, | |
"Residuals"=journal_linmod_intct$residuals | |
) %>% | |
ggplot(aes(x=Fitted, y=Residuals)) + | |
geom_point() + geom_hline(yintercept = 0) + | |
labs(title = "The interaction model") + | |
theme_icae() | |
TA_interaction |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
here::i_am("LifeExpExample.R") | |
library(DataScienceExercises) | |
library(moderndive) | |
library(dplyr) | |
library(tidyr) | |
library(ggplot2) | |
library(icaeDesign) | |
life_exp <- DataScienceExercises::gdplifexp2007 %>% | |
dplyr::select(all_of(c("continent", "lifeExp", "gdpPercap"))) %>% | |
dplyr::mutate(continent=as.factor(continent)) | |
# Categorical variables within the multiple regression case-------------------- | |
cont_gdp_linmod <- lm(lifeExp~gdpPercap+continent, data = life_exp) | |
get_regression_table(cont_gdp_linmod) | |
life_exp_scaled <- life_exp %>% | |
dplyr::mutate( | |
gdpPercap = gdpPercap/1000 | |
) | |
# Easier to read: | |
cont_gdp_linmod_sc <- lm(lifeExp~gdpPercap+continent, data = life_exp_scaled) | |
get_regression_table(cont_gdp_linmod_sc) %>% | |
dplyr::select(term, estimate, std_error) | |
# Linear mode: | |
life_model_lm <- lm(lifeExp~continent, data = life_exp_scaled) | |
summary(life_model_lm) | |
# Get same result via data wrangling and manipulation: | |
MeanAfrica <- life_exp_scaled %>% | |
filter(continent=="Africa") %>% | |
summarize(lifeExp=mean(lifeExp)) %>% | |
pull(lifeExp) | |
life_exp_scaled %>% | |
summarise(MeanLifeExp = mean(lifeExp), .by = "continent") %>% | |
mutate(DifferenceAfrica = MeanLifeExp - MeanAfrica) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
here::i_am("ols-distribution.R") | |
library(here) | |
library(tidyr) | |
library(data.table) | |
library(dplyr) | |
library(ggplot2) | |
library(scales) | |
library(purrr) | |
library(icaeDesign) | |
library(ggpubr) | |
library(latex2exp) | |
# True population---------------------- | |
set.seed(123) | |
pop_size <- 5000 | |
beta_0_pop <- 1000 | |
beta_1_pop <- 3.5 | |
labor_input <- runif(n = pop_size, min = 50, max = 500) | |
random_error <- rnorm(n = pop_size, mean = 0, sd = 100) | |
firm_output <- beta_0_pop + beta_1_pop * labor_input + random_error | |
pop_data <- tibble( | |
"output" = firm_output, | |
"labor" = labor_input | |
) | |
output_dist_pop <- ggplot(data = pop_data, mapping = aes(x=output)) + | |
geom_histogram(alpha=0.5, binwidth = 50) + | |
scale_y_continuous(expand = expansion(add = c(0, 20))) + | |
labs(title = "Distribution of output", x = "Firm output", y = "Number of firms") + | |
theme_icae() | |
relationship_pop <- ggplot( | |
data = pop_data, mapping = aes(x = labor, y=output)) + | |
geom_point(alpha=0.25, color="white", shape=21, fill="black") + | |
geom_smooth(method = "lm", se = FALSE) + | |
labs(title = "True relationship", x = "Labor input", y = "Firm output") + | |
theme_icae() | |
relationship_pop | |
pop_overview <- ggarrange(output_dist_pop, relationship_pop, ncol = 2) | |
pop_overview <- annotate_figure(pop_overview, top = "The simulated population") | |
ggsave( | |
plot = pop_overview, | |
filename = here("output/pop_overview.pdf"), | |
width = 7, height = 3) | |
# Estimation---------------------- | |
set.seed(1234) | |
iterations <- 1000 | |
sample_size <- 150 | |
get_estimates <- function(s_size){ | |
sample_drawn <- slice_sample(pop_data, n = sample_size) | |
estimated_model <- lm(formula = output~labor, data = sample_drawn) | |
coefs <- tibble( | |
"beta_0"=estimated_model$coefficients["(Intercept)"], | |
"beta_1"=estimated_model$coefficients["labor"]) | |
} | |
estimates <- map( | |
.x = seq_len(iterations), | |
.f = ~ get_estimates(s_size = sample_size)) | |
estimates_tib <- list_rbind(estimates) | |
# Visualizaiton---------------------------------- | |
beta_0_dist <- ggplot(data = estimates_tib, mapping = aes(x=beta_0)) + | |
geom_histogram(binwidth = 5, fill = get_euf_colors("blue")) + | |
scale_y_continuous(expand = expansion(mult = c(0, 0.1))) + | |
labs( | |
title = TeX("$\\beta_0=1000$"), | |
x = TeX("$\\hat{\\beta}_0$"), | |
y = "Frequency" | |
) + | |
geom_vline(xintercept = beta_0_pop) + | |
theme_icae() + | |
theme(plot.title = element_text(size=12)) | |
beta_1_dist <- ggplot(data = estimates_tib, mapping = aes(x=beta_1)) + | |
geom_histogram(binwidth = 0.01, fill = get_euf_colors("blue")) + | |
scale_y_continuous(expand = expansion(mult = c(0, 0.1))) + | |
labs( | |
title = TeX("$\\beta_1=3.5$"), | |
x = TeX("$\\hat{\\beta}_1$"), | |
y = "Frequency" | |
) + | |
geom_vline(xintercept = beta_1_pop) + | |
theme_icae() + | |
theme(plot.title = element_text(size=12)) | |
estimations_figure <- ggarrange(beta_0_dist, beta_1_dist, ncol = 2) | |
estimations_figure <- annotate_figure( | |
p = estimations_figure, | |
top = "Sampling distributions of the estimated parameters") | |
ggsave( | |
plot = estimations_figure, | |
filename = here("output/estimations_figure.pdf"), | |
width = 7, height = 3) | |
# Quantitative result------------------- | |
library(kableExtra) | |
estimates_tib %>% | |
pivot_longer( | |
cols = everything(), | |
names_to = "Parameter", | |
values_to = "Value") %>% | |
summarise( | |
Mean = mean(Value), | |
SD = sd(Value), .by = "Parameter" | |
) %>% | |
kable(digits = 3, format = "latex", booktabs=TRUE) %>% | |
kable_styling() %>% | |
save_kable(file = here("output/estimations_table.pdf"), bs_theme = "flatly") | |
set.seed(1234) | |
expl_model <- lm( | |
formula = output~labor, | |
data = slice_sample(pop_data, n = sample_size)) | |
expl_model_summary <- summary(expl_model) | |
b1_est <- expl_model_summary$coefficients[2,1] | |
b1_sd <- expl_model_summary$coefficients[2,2] | |
lower_ci <- b1_est - (1.96*b1_sd) | |
upper_ci <- b1_est + (1.96*b1_sd) | |
c(lower_ci,upper_ci) | |
moderndive::get_regression_table( | |
model = expl_model, | |
digits = 2) %>% | |
kable(digits = 3, format = "latex", booktabs=TRUE) %>% | |
kable_styling() %>% | |
save_kable(file = here("output/lm_table.pdf"), bs_theme = "flatly") | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
here::i_am("wine-recap.R") | |
library(DataScienceExercises) | |
library(dplyr) | |
library(tidyr) | |
wine_data <- DataScienceExercises::wine2dine | |
wine_reg <- lm(`residual sugar` ~ kind, data = wine_data) | |
summary(wine_reg) | |
wine_data %>% | |
summarise(MeanResidualSugar = mean(`residual sugar`), .by = "kind") %>% | |
pivot_wider(names_from = "kind", values_from = "MeanResidualSugar") %>% | |
mutate(DifferenceRed = white - red) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment