Instantly share code, notes, and snippets.

# graebnerc/#T17T18: Lecture notes

Last active June 25, 2023 22:58
Show Gist options
• Save graebnerc/2696373159c2853ef28d1bc60d529dc8 to your computer and use it in GitHub Desktop.
The examples used during session 17 and 18 on multiple linear regression, as well as possible solutions to the exercises.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode characters
 Contains solutions to the exercises as well as the code used to create the relevant figures in the slides. The code assumes a standard folder structure and might need to be adjusted to your case.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode characters
 here::i_am("R/Examples-Durkheim.R") library(here) library(dplyr) library(data.table) library(ggplot2) library(modelsummary) library(kableExtra) library(icaeDesign) # Create the artificial Durkheim data set-------- set.seed(123) n_obs <- 1000 social_cohesion <- seq(-0.5, 0.5, length.out = n_obs) p_suicide <- social_cohesion**2 + rnorm(n_obs, mean = 0, sd = 0.005) suicide_data <- tibble( Social cohesion=social_cohesion, Probability for suicide=p_suicide ) %>% dplyr::mutate( Society = ifelse(Social cohesion<=0, "Modern", "Archaic"), Social cohesion squared = Social cohesion**2 ) data.table::fwrite( x = suicide_data, file = here::here("data/suicide_data.csv")) # Estimate the two models-------------- linmod <- lm( formula = Probability for suicide~Social cohesion, data = suicide_data) quadmod <- lm( formula = Probability for suicide~Social cohesion+I(Social cohesion**2), data = suicide_data) # Create a PDF of the regression table: modelsummary( list("Model 1"=linmod, "Model 2"=quadmod), output = "kableExtra", gof_map = c("r.squared"), statistic = NULL) %>% kableExtra::save_kable( file = here("output/RegressionTable-Durkheim.pdf")) # Add the predictions of the models to the data suicide_data_predictions <- suicide_data %>% mutate( Model 1 = predict(linmod), Model 2 = predict(quadmod) ) %>% select(Social cohesion, Model 1, Model 2) %>% tidyr::pivot_longer( cols = c("Model 1", "Model 2"), names_to = "Model", values_to = "Prediction") # Create the three plots--------------- ## Plot 1: the data-------------------- suicide_data_plot <- ggplot( data = suicide_data, mapping = aes(x=Social cohesion, y=Probability for suicide) ) + geom_point(alpha=0.25, size=0.2, show.legend = FALSE) + labs(title = "Suicides a la Durkheim") + theme_icae() suicide_data_plot ggsave(plot = suicide_data_plot, filename = here("output/DurkheimData.pdf"), width = 3.5, height = 3.5) ## Plot 2: the data with cohesion squared-------------------- suicide_data_plot_squared <- ggplot( data = suicide_data, mapping = aes(x=Social cohesion squared, y=Probability for suicide) ) + geom_point(alpha=0.25, size=0.2, show.legend = FALSE) + labs(title = "Suicides a la Durkheim") + theme_icae() suicide_data_plot_squared ggsave(plot = suicide_data_plot_squared, filename = here("output/DurkheimData_squared.pdf"), width = 3.5, height = 3.5) ## Plot 3: the data with cohesion squared-------------------- suicide_data_plot_predict_plot <- ggplot( data = suicide_data, mapping = aes(x=Social cohesion, y=Probability for suicide) ) + geom_point(alpha=0.25, size=0.2, show.legend = FALSE) + geom_line( data = suicide_data_predictions, mapping = aes(x=Social cohesion, y=Prediction, color=Model), linewidth=1 ) + scale_color_euf(palette = "mixed") + labs(title = "Suicides a la Durkheim") + theme_icae() + theme(legend.position = "bottom") suicide_data_plot_predict_plot ggsave(plot = suicide_data_plot_predict_plot, filename = here("output/Durkheim-Predictions.pdf"), width = 3.5, height = 3.5) ## Tukey-Anscombe plots------------------- simple_model_TA_plot <- tibble( "Fitted values" = linmod$fitted.values, "Residuals" = linmod$residuals ) %>% ggplot(mapping = aes(x = Fitted values, y = Residuals)) + geom_point(alpha=0.25) + labs(title = "TA plot (model 1)") + theme_icae() simple_model_TA_plot ggsave(plot = simple_model_TA_plot, filename = here("output/Durkheim-TA-Simple.pdf"), width = 3.5, height = 3.5) quadratic_model_plot <- tibble( "Fitted values" = quadmod$fitted.values, "Residuals" = quadmod$residuals ) %>% ggplot(mapping = aes(x = Fitted values, y = Residuals)) + geom_point(alpha=0.25) + labs(title = "TA plot (model 2)") + theme_icae() quadratic_model_plot ggsave(plot = quadratic_model_plot, filename = here("output/Durkheim-TA-Quadratic.pdf"), width = 3.5, height = 3.5)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode characters
 here::i_am("R/Examples-LifeExpectancy.R") library(here) library(dplyr) library(data.table) library(ggplot2) library(scales) library(modelsummary) library(kableExtra) library(icaeDesign) # Set up data set --------------------- gdp_life <- DataScienceExercises::gdplifexp2007 %>% mutate(GDPpc_log=log(gdpPercap), LifeExp_log=log(lifeExp)) %>% filter(continent!="Oceania", continent!="Africa", lifeExp>50) # Estimate the two models-------------- model_1 <- lm(lifeExp~gdpPercap, data = gdp_life) model_2 <- lm(LifeExp_log~GDPpc_log, data = gdp_life) # Compute predictions from models----------- gdp_life <- gdp_life %>% mutate( Model 1 = predict(model_1), Model 2 = predict(model_2) ) # Create the visualizations------------ ## Plot 1: Level scatter plot with simple model---------- level_plot <- ggplot( data = gdp_life, aes(x=gdpPercap, y=lifeExp) ) + geom_point() + geom_line( aes(x=gdpPercap, y=Model 1), color=get_euf_colors("red"), linewidth=1) + labs( title = "Income and life expectancy", x="GDP per capita (PPP)", y="Life expectancy at birth" ) + scale_x_continuous( labels = scales::label_number(scale = 0.001, suffix = "k") ) + theme_icae() level_plot ggsave( plot = level_plot, filename = here("output/LifeExp_base.pdf"), width = 3.5, height = 3.5) ggsave( plot = level_plot + geom_line( aes(x=gdpPercap, y=exp(Model 2)), color=get_euf_colors("blue"), linewidth=1), filename = here("output/LifeExp_TwoModels.pdf"), width = 3.5, height = 3.5) ## Plot 2: TA plot for the simple model---------- TA_lin <- tibble( "Fitted values"=model_1$fitted.values, "Residuals"=model_1$residuals ) %>% ggplot(aes(x=Fitted values,y=Residuals)) + geom_point() + theme_icae() ggsave( plot = TA_lin + labs(title = "TA plot (model 1)"), filename = here("output/LifeExp_lin_TA.pdf"), width = 3.5, height = 2.5) ## Plot 3: Log-log scatter plot with quadratic model---------- log_plot <- ggplot( data = gdp_life, aes(x=GDPpc_log, y=LifeExp_log) ) + geom_point() + geom_line( aes(x=GDPpc_log, y=Model 2), color=get_euf_colors("blue"), linewidth=1) + labs( title = "Income and life expectancy (log-log)", x="GDP per capita (PPP, log)", y="Life expectancy at birth (log)" ) + theme_icae() + theme(legend.position = "none") log_plot ggsave( plot = log_plot, filename = here("output/LifeExp_log.pdf"), width = 3.5, height = 3.5) ## Plot 4: TA plot for the log-log model--------- TA_log <- tibble( "Fitted values"=model_2$fitted.values, "Residuals"=model_2$residuals ) %>% ggplot(aes(x=Fitted values,y=Residuals)) + geom_point() + theme_icae() ggsave( plot = TA_log + labs(title = "TA plot (model 2)"), filename = here("output/LifeExp_log_TA.pdf"), width = 3.5, height = 2.5)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode characters
 here::i_am("R/Exercise-1-Beer.R") library(DataScienceExercises) library(dplyr) library(texreg) # To visualize models in a readable way library(kableExtra) # If you want to save the table as PDF beer_data <- DataScienceExercises::beer %>% dplyr::mutate(income = income/1000) model_1 <- lm( formula = consumption ~ price, data=beer_data) model_2 <- lm( formula = consumption ~ price + income, data=beer_data) model_3 <- lm( formula = consumption ~ price + price_liquor + price_other + income, data=beer_data) model_4 <- lm( formula = consumption ~ price + price_other + income, data=beer_data) # Print model results nicely on screen: texreg::screenreg( l = list( model_1, model_2, model_3, model_4), digits = 4) # Create a nice PDF with the regression table: modelsummary::modelsummary( models = list(model_1, model_2, model_3, model_4), gof_map = c("nobs", "r.squared", "adj.r.squared", "rmse"), stars = TRUE, output = "kableExtra") %>% kableExtra::save_kable(file = "Exercise-1-Beer.pdf")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode characters
 here::i_am("JournalExample.R") library(DataScienceExercises) library(moderndive) library(dplyr) library(tidyr) library(ggplot2) library(icaeDesign) # Multiple regression with the journal data set---------------------- journal_data <- DataScienceExercises::econjournals %>% dplyr::filter(papers>10, sub_price<5000) # The parallel slopes model------------ journal_linmod_psm <- lm(sub_price~pages_py+publisher_type, data = journal_data) get_regression_table(journal_linmod_psm) journal_linmod_pslopes <- journal_data %>% ggplot(data = ., aes(x=pages_py, y=sub_price, color=publisher_type)) + geom_point() + geom_parallel_slopes(se = FALSE) + labs(y="Subscription price", x = "Page length") + scale_color_brewer(palette = "Set1", direction = -1) + coord_cartesian(xlim = c(0, 2800)) + theme_icae() + guides(col=guide_legend(title = "Publisher type: ")) + theme(legend.position = "bottom", legend.title = element_text()) journal_linmod_pslopes # The interaction model------------ journal_linmod_intct <- lm(sub_price~pages_py*publisher_type, data = journal_data) get_regression_table(journal_linmod_intct) journal_linmod_interact <- journal_data %>% ggplot(data = ., aes(x=pages_py, y=sub_price, color=publisher_type)) + geom_point() + stat_smooth(method="lm", fullrange=TRUE, se=FALSE) + labs(y="Subscription price", x = "Page length") + scale_color_brewer(palette = "Set1", direction = -1) + coord_cartesian(xlim = c(0, 2800)) + theme_icae() + guides(col=guide_legend(title = "Publisher type: ")) + theme(legend.position = "bottom", legend.title = element_text()) journal_linmod_interact journal_linmod_interact_zoom <- journal_linmod_interact + coord_cartesian(xlim = c(-10, 500), ylim = c(-10, 500)) + stat_smooth(method="lm", fullrange=TRUE, se=FALSE) + geom_hline(yintercept = 0) + geom_vline(xintercept = 0) + theme(legend.position = "none") journal_linmod_interact_zoom # Comparing the R-squareds------------- summary(journal_linmod_intct)[["r.squared"]] summary(journal_linmod_intct)[["adj.r.squared"]] summary(journal_linmod_psm)[["r.squared"]] summary(journal_linmod_psm)[["adj.r.squared"]] # Compare the residual plots----------- TA_pslopes <- tibble( "Fitted"=journal_linmod_psm$fitted.values, "Residuals"=journal_linmod_psm$residuals ) %>% ggplot(aes(x=Fitted, y=Residuals)) + geom_point() + geom_hline(yintercept = 0) + labs(title = "The parallel slopes model") + theme_icae() TA_pslopes TA_interaction <- tibble( "Fitted"=journal_linmod_intct$fitted.values, "Residuals"=journal_linmod_intct$residuals ) %>% ggplot(aes(x=Fitted, y=Residuals)) + geom_point() + geom_hline(yintercept = 0) + labs(title = "The interaction model") + theme_icae() TA_interaction
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode characters
 here::i_am("LifeExpExample.R") library(DataScienceExercises) library(moderndive) library(dplyr) library(tidyr) library(ggplot2) library(icaeDesign) life_exp <- DataScienceExercises::gdplifexp2007 %>% dplyr::select(all_of(c("continent", "lifeExp", "gdpPercap"))) %>% dplyr::mutate(continent=as.factor(continent)) # Categorical variables within the multiple regression case-------------------- cont_gdp_linmod <- lm(lifeExp~gdpPercap+continent, data = life_exp) get_regression_table(cont_gdp_linmod) life_exp_scaled <- life_exp %>% dplyr::mutate( gdpPercap = gdpPercap/1000 ) # Easier to read: cont_gdp_linmod_sc <- lm(lifeExp~gdpPercap+continent, data = life_exp_scaled) get_regression_table(cont_gdp_linmod_sc) %>% dplyr::select(term, estimate, std_error) # Linear mode: life_model_lm <- lm(lifeExp~continent, data = life_exp_scaled) summary(life_model_lm) # Get same result via data wrangling and manipulation: MeanAfrica <- life_exp_scaled %>% filter(continent=="Africa") %>% summarize(lifeExp=mean(lifeExp)) %>% pull(lifeExp) life_exp_scaled %>% summarise(MeanLifeExp = mean(lifeExp), .by = "continent") %>% mutate(DifferenceAfrica = MeanLifeExp - MeanAfrica)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode characters
 here::i_am("ols-distribution.R") library(here) library(tidyr) library(data.table) library(dplyr) library(ggplot2) library(scales) library(purrr) library(icaeDesign) library(ggpubr) library(latex2exp) # True population---------------------- set.seed(123) pop_size <- 5000 beta_0_pop <- 1000 beta_1_pop <- 3.5 labor_input <- runif(n = pop_size, min = 50, max = 500) random_error <- rnorm(n = pop_size, mean = 0, sd = 100) firm_output <- beta_0_pop + beta_1_pop * labor_input + random_error pop_data <- tibble( "output" = firm_output, "labor" = labor_input ) output_dist_pop <- ggplot(data = pop_data, mapping = aes(x=output)) + geom_histogram(alpha=0.5, binwidth = 50) + scale_y_continuous(expand = expansion(add = c(0, 20))) + labs(title = "Distribution of output", x = "Firm output", y = "Number of firms") + theme_icae() relationship_pop <- ggplot( data = pop_data, mapping = aes(x = labor, y=output)) + geom_point(alpha=0.25, color="white", shape=21, fill="black") + geom_smooth(method = "lm", se = FALSE) + labs(title = "True relationship", x = "Labor input", y = "Firm output") + theme_icae() relationship_pop pop_overview <- ggarrange(output_dist_pop, relationship_pop, ncol = 2) pop_overview <- annotate_figure(pop_overview, top = "The simulated population") ggsave( plot = pop_overview, filename = here("output/pop_overview.pdf"), width = 7, height = 3) # Estimation---------------------- set.seed(1234) iterations <- 1000 sample_size <- 150 get_estimates <- function(s_size){ sample_drawn <- slice_sample(pop_data, n = sample_size) estimated_model <- lm(formula = output~labor, data = sample_drawn) coefs <- tibble( "beta_0"=estimated_model$coefficients["(Intercept)"], "beta_1"=estimated_model$coefficients["labor"]) } estimates <- map( .x = seq_len(iterations), .f = ~ get_estimates(s_size = sample_size)) estimates_tib <- list_rbind(estimates) # Visualizaiton---------------------------------- beta_0_dist <- ggplot(data = estimates_tib, mapping = aes(x=beta_0)) + geom_histogram(binwidth = 5, fill = get_euf_colors("blue")) + scale_y_continuous(expand = expansion(mult = c(0, 0.1))) + labs( title = TeX("$\\beta_0=1000$"), x = TeX("$\\hat{\\beta}_0$"), y = "Frequency" ) + geom_vline(xintercept = beta_0_pop) + theme_icae() + theme(plot.title = element_text(size=12)) beta_1_dist <- ggplot(data = estimates_tib, mapping = aes(x=beta_1)) + geom_histogram(binwidth = 0.01, fill = get_euf_colors("blue")) + scale_y_continuous(expand = expansion(mult = c(0, 0.1))) + labs( title = TeX("$\\beta_1=3.5$"), x = TeX("$\\hat{\\beta}_1$"), y = "Frequency" ) + geom_vline(xintercept = beta_1_pop) + theme_icae() + theme(plot.title = element_text(size=12)) estimations_figure <- ggarrange(beta_0_dist, beta_1_dist, ncol = 2) estimations_figure <- annotate_figure( p = estimations_figure, top = "Sampling distributions of the estimated parameters") ggsave( plot = estimations_figure, filename = here("output/estimations_figure.pdf"), width = 7, height = 3) # Quantitative result------------------- library(kableExtra) estimates_tib %>% pivot_longer( cols = everything(), names_to = "Parameter", values_to = "Value") %>% summarise( Mean = mean(Value), SD = sd(Value), .by = "Parameter" ) %>% kable(digits = 3, format = "latex", booktabs=TRUE) %>% kable_styling() %>% save_kable(file = here("output/estimations_table.pdf"), bs_theme = "flatly") set.seed(1234) expl_model <- lm( formula = output~labor, data = slice_sample(pop_data, n = sample_size)) expl_model_summary <- summary(expl_model) b1_est <- expl_model_summary$coefficients[2,1] b1_sd <- expl_model_summary$coefficients[2,2] lower_ci <- b1_est - (1.96*b1_sd) upper_ci <- b1_est + (1.96*b1_sd) c(lower_ci,upper_ci) moderndive::get_regression_table( model = expl_model, digits = 2) %>% kable(digits = 3, format = "latex", booktabs=TRUE) %>% kable_styling() %>% save_kable(file = here("output/lm_table.pdf"), bs_theme = "flatly")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode characters
 here::i_am("wine-recap.R") library(DataScienceExercises) library(dplyr) library(tidyr) wine_data <- DataScienceExercises::wine2dine wine_reg <- lm(residual sugar ~ kind, data = wine_data) summary(wine_reg) wine_data %>% summarise(MeanResidualSugar = mean(residual sugar), .by = "kind") %>% pivot_wider(names_from = "kind", values_from = "MeanResidualSugar") %>% mutate(DifferenceRed = white - red)