Created
June 22, 2023 15:13
-
-
Save graebnerc/1ed64ed3e30a0a80d41e6381aabc2e32 to your computer and use it in GitHub Desktop.
In-class solutions for the recap session.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Declare where this script lives inside the project so that here() builds
# paths from the project root, independent of the current working directory.
here::i_am("inclass-regression-solution.R")
library(here)
library(dplyr)
library(tidyr)
library(data.table)
library(ggplot2)

# Step 1: import data ----------
# fread() returns a data.table; it is converted to a tibble for the
# dplyr/ggplot2 steps that follow.
reg_data <- data.table::fread(here("data/reg_data_1.csv")) %>%
  as_tibble()
head(reg_data)
# Step 2: decide on specification ----------
# Plot y against each candidate regressor to decide whether the variable
# should be included and whether it needs to be transformed.

# y against x1:
ggplot(data = reg_data, aes(x = x1, y = y)) +
  geom_point() +
  theme_bw()
# There is a relationship.
# The relationship is not linear, but quadratic -> include x1 and x1^2.

# y against log(x2):
ggplot(data = reg_data, aes(x = log(x2), y = y)) +
  geom_point() +
  theme_bw()
# There is a relationship.
# The relationship is only linear if x2 gets log-transformed.

# y against x3:
ggplot(data = reg_data, aes(x = x3, y = y)) +
  geom_point() +
  theme_bw()
# There is no relationship, the variable should not be included.
# Step 3: Fit the model ----------
# Two equivalent ways to specify the transformations.

# (a) Do the transformation in the formula. I() is needed so that ^ is
#     evaluated as arithmetic rather than as a formula operator.
lin_model_quad <- lm(y ~ x1 + I(x1^2) + log(x2), data = reg_data)

# (b) Add the transformed variables to the data and use them as plain
#     regressors. This refits and overwrites the model object from (a).
reg_data <- reg_data %>%
  mutate(
    x1_squared = x1^2,
    x2_log = log(x2)
  )
lin_model_quad <- lm(y ~ x1 + x1_squared + x2_log, data = reg_data)
summary(lin_model_quad)
# Step 4 (if needed): assess the model ----------
# Residuals-vs-fitted plot for the model that includes the quadratic term.
# fitted() and residuals() are the standard extractor functions for the
# corresponding components of an lm object.
tibble(
  Fitted = fitted(lin_model_quad),
  Residuals = residuals(lin_model_quad)
) %>%
  ggplot(aes(x = Fitted, y = Residuals)) +
  geom_hline(yintercept = 0) +
  geom_point() +
  theme_bw()

# If you omitted the quadratic term:
# fit the specification without x1^2 and draw the same diagnostic plot for
# comparison.
lin_model_lin <- lm(y ~ x1 + log(x2), data = reg_data)
tibble(
  Fitted = fitted(lin_model_lin),
  Residuals = residuals(lin_model_lin)
) %>%
  ggplot(aes(x = Fitted, y = Residuals)) +
  geom_hline(yintercept = 0) +
  geom_point() +
  theme_bw()
# Hint: Good to have those prepared ;)
# The two templates below are intentionally left blank. They are kept as
# comments so that the script can be run from top to bottom; copy one,
# fill in the data and the aesthetics, and then run it.
#
# Scatter plot template:
# ggplot(data = , aes(x = , y = )) +
#   geom_point() + theme_bw()
#
# Residuals-vs-fitted template:
# tibble("Fitted" = , "Residuals" = ) %>%
#   ggplot(aes(x = Fitted, y = Residuals)) +
#   geom_hline(yintercept = 0) +
#   geom_point() + theme_bw()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Declare where this script lives inside the project so that here() builds
# paths from the project root, independent of the current working directory.
here::i_am("inclass-wrangling-solutions.R")
library(here)
library(dplyr)
library(tidyr)
library(data.table)

# In class solution for data wrangling ----------------
# The path is wrapped in here() so the file is found regardless of the
# working directory the script is started from.
# header = TRUE makes fread() treat the first row as column names instead of
# relying on its automatic header detection.
raw_data <- data.table::fread(here("data/wrangel_1.csv"), header = TRUE) %>%
  as_tibble()
head(raw_data)
# Reshape the raw data into tidy form in two steps:
# 1. pivot_longer(): every column except `country` and `name` is stacked
#    into a `year` / `observation` pair. The former column names end up in
#    `year` (stored as character, the pivot_longer() default).
# 2. pivot_wider(): the entries of `name` become separate columns, filled
#    with the values from `observation`.
tidy_data <- raw_data %>%
  tidyr::pivot_longer(
    cols = -all_of(c("country", "name")),
    # cols = as.character(seq(2005, 2020)),
    names_to = "year",
    values_to = "observation"
  ) %>%
  tidyr::pivot_wider(
    names_from = "name",
    values_from = "observation"
  )
head(tidy_data)
# Per-country means of the three variables, ignoring missing values.
# summarise() drops the single grouping level, so the result is ungrouped.
mean_data <- tidy_data %>%
  group_by(country) %>%
  summarise(
    GrowthMean = mean(Growth, na.rm = TRUE),
    EducSpendingMean = mean(EducationSpending, na.rm = TRUE),
    HealthSpendingMean = mean(HealthSpending, na.rm = TRUE)
  )
mean_data
# Alternative:
# Stack all variables into `name` / `value` (the pivot_longer() default
# column names), average per country and variable, then spread the countries
# into columns. The result has one row per variable and one column per
# country.
mean_data_alt <- tidy_data %>%
  pivot_longer(cols = -all_of(c("country", "year"))) %>%
  # pivot_longer(cols = c("Growth", "EducationSpending", "HealthSpending")) %>%
  summarise(
    MeanVal = mean(value, na.rm = TRUE),
    .by = all_of(c("country", "name"))
  ) %>%
  pivot_wider(names_from = "country", values_from = "MeanVal")
mean_data_alt

# Alternative:
# Apply the same mean to all three columns at once with across(); the output
# columns keep the original variable names.
mean_data_alt2 <- tidy_data %>%
  summarise(
    across(
      all_of(c("Growth", "EducationSpending", "HealthSpending")),
      .fns = ~ mean(.x, na.rm = TRUE)
    ),
    .by = all_of("country")
  )
mean_data_alt2
# Extension:
# Compute, for each country, the percentage change of the spending on education
# from the year 2010 to the year 2018 and save this as a variable called
# perc_change.
perc_data <- tidy_data %>%
  select(all_of(c("country", "year", "EducationSpending"))) %>%
  # `year` holds the former column names as character strings, so the two
  # years are matched as strings.
  dplyr::filter(year %in% c("2010", "2018")) %>%
  # Put the two years side by side so they can be combined row-wise.
  pivot_wider(
    names_from = "year",
    values_from = "EducationSpending"
  ) %>%
  # Backticks are required because the column names start with a digit.
  dplyr::mutate(perc_change = (`2018` - `2010`) / `2010` * 100)
perc_data
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment