# graebnerc/inclass-regression-solution.R

## inclass-regression-solution.R
here::i_am("inclass-regression-solution.R")
library(here)
library(dplyr)
library(tidyr)
library(data.table)
library(ggplot2)

# Step 1: import data----------
# Read the regression data from a path built with here() and turn the
# resulting data.table into a tibble before taking a first look at it.
reg_data <- as_tibble(
  data.table::fread(here("data/reg_data_1.csv"))
)
head(reg_data)

# Step 2: decide on specification----------
# Scatter plots of y against each candidate regressor. They are used to
# decide which variables enter the model and whether a transformation is
# needed.

# y against x1
reg_data %>%
  ggplot(aes(x = x1, y = y)) +
  geom_point() +
  theme_bw()
# There is a relationship
# The relationship is not linear, but quadratic -> include x1 and x1^2

# y against the logarithm of x2
reg_data %>%
  ggplot(aes(x = log(x2), y = y)) +
  geom_point() +
  theme_bw()
# There is a relationship
# The relationship is only linear if x2 gets log-transformed

# y against x3
reg_data %>%
  ggplot(aes(x = x3, y = y)) +
  geom_point() +
  theme_bw()
# There is no relationship, the variable should not be included

# Step 3: Fit the model----------
# Two equivalent ways to specify the model.
# Variant 1: apply the transformations directly inside the formula.
lin_model_quad <- lm(y ~ x1 + I(x1^2) + log(x2), data = reg_data)

# Variant 2: store the transformed regressors as columns of the data first
# and refer to them by name. Note that this overwrites the model object
# created by variant 1.
reg_data <- mutate(
  reg_data,
  x1_squared = x1^2,
  x2_log = log(x2)
)
lin_model_quad <- lm(y ~ x1 + x1_squared + x2_log, data = reg_data)

summary(lin_model_quad)


# Step 4 (if needed): assess the model----------
# Residuals against fitted values for the quadratic specification; the
# horizontal line marks a residual of zero.
ggplot(
  data = tibble(
    Fitted = lin_model_quad[["fitted.values"]],
    Residuals = lin_model_quad[["residuals"]]
  ),
  mapping = aes(x = Fitted, y = Residuals)
) +
  geom_hline(yintercept = 0) +
  geom_point() +
  theme_bw()

# If you omitted the quadratic term:
# same diagnostic plot for a model that only contains x1 and log(x2).
lin_model_lin <- lm(y ~ x1 + log(x2), data = reg_data)
ggplot(
  data = tibble(
    Fitted = lin_model_lin[["fitted.values"]],
    Residuals = lin_model_lin[["residuals"]]
  ),
  mapping = aes(x = Fitted, y = Residuals)
) +
  geom_hline(yintercept = 0) +
  geom_point() +
  theme_bw()


# Hint: Good to have those prepared;)
# The two templates below are kept as comments because their arguments are
# still blank: without data and aesthetics they cannot produce a plot, so
# leaving them active gets in the way of running this script from top to
# bottom. To use one, copy it, remove the leading "# " and fill in the blanks.

# Template 1: scatter plot of two variables
# ggplot(data = , aes(x = , y = )) +
#   geom_point() + theme_bw()

# Template 2: residuals against fitted values of a model
# tibble("Fitted" = , "Residuals" = ) %>%
#   ggplot(aes(x = Fitted, y = Residuals)) +
#   geom_hline(yintercept = 0) +
#   geom_point() + theme_bw()

## inclass-wrangling-solutions.R
here::i_am("inclass-wrangling-solutions.R")
library(here)
library(dplyr)
library(tidyr)
library(data.table)

# In class solution for data wrangling----------------
# Read the raw data and convert it to a tibble.
# The path is built with here() so that it is resolved against the project
# root instead of the current working directory.
# header = TRUE tells fread() to use the first row as column names
# (presumably needed because most column names are years - TODO confirm).
raw_data <- data.table::fread(
  here("data/wrangel_1.csv"),
  header = TRUE
) %>%
  as_tibble()
head(raw_data)

# Reshape the raw data in two steps:
# 1. stack every column except `country` and `name` into the key/value pair
#    `year` / `observation`;
# 2. spread the entries of `name` into separate columns filled with the
#    values of `observation`.
# Instead of excluding the id columns, the columns to stack could also be
# listed explicitly, e.g. cols = as.character(seq(2005, 2020)).
tidy_data <- raw_data %>%
  tidyr::pivot_longer(
    cols = -all_of(c("country", "name")),
    names_to = "year",
    values_to = "observation"
  ) %>%
  tidyr::pivot_wider(
    names_from = "name",
    values_from = "observation"
  )
head(tidy_data)

# Country-wise averages of the three indicators; missing observations are
# ignored when computing the means.
mean_data <- summarise(
  group_by(tidy_data, country),
  GrowthMean = mean(Growth, na.rm = TRUE),
  EducSpendingMean = mean(EducationSpending, na.rm = TRUE),
  HealthSpendingMean = mean(HealthSpending, na.rm = TRUE)
)
mean_data

# Alternative:
# stack all indicator columns, average per country and indicator, and then
# spread the countries into columns (one row per indicator).
mean_data_alt <- tidy_data %>%
  # Equivalent selection by naming the indicator columns explicitly:
  # pivot_longer(cols = c("Growth", "EducationSpending", "HealthSpending"))
  pivot_longer(cols = -c("country", "year")) %>%
  summarise(
    MeanVal = mean(value, na.rm = TRUE),
    .by = c("country", "name")
  ) %>%
  pivot_wider(
    names_from = "country",
    values_from = "MeanVal"
  )
mean_data_alt

# Alternative:
# apply the same NA-ignoring mean to all three indicator columns at once,
# computed separately for each country.
mean_data_alt2 <- tidy_data %>%
  summarise(
    across(
      all_of(c("Growth", "EducationSpending", "HealthSpending")),
      .fns = function(x) mean(x, na.rm = TRUE)
    ),
    .by = "country"
  )
mean_data_alt2

# Extension:
# Compute, for each country, the percentage change of the spending on education
# from the year 2010 to the year 2018 and save this as a variable called
# perc_change.
# Approach: keep only the two years of interest, put them side by side as the
# columns `2010` and `2018`, and relate their difference to the 2010 value.
perc_data <- tidy_data %>%
  select(all_of(c("country", "year", "EducationSpending"))) %>%
  dplyr::filter(year %in% c("2010", "2018")) %>%
  pivot_wider(names_from = "year", values_from = "EducationSpending") %>%
  dplyr::mutate(
    perc_change = (`2018` - `2010`) / `2010` * 100
  )
	here::i_am("inclass-regression-solution.R")
	library(here)
	library(dplyr)
	library(tidyr)
	library(data.table)
	library(ggplot2)

	# Step 1: import data----------
	# NOTE(review): this tab-indented section repeats the regression script
	# found earlier in this file; consider keeping only one copy.
	# Read the regression data from a path built with here() and convert the
	# resulting data.table into a tibble.
	reg_data <- data.table::fread(here("data/reg_data_1.csv")) %>%
	  as_tibble()
	head(reg_data)

	# Step 2: decide on specification----------
	# Check out correlations and decide whether variables should be
	# included and whether they need to be transformed
	ggplot(data = reg_data, aes(x = x1, y = y)) +
	  geom_point() +
	  theme_bw()
	# There is a relationship
	# The relationship is not linear, but quadratic -> include x1 and x1^2

	ggplot(data = reg_data, aes(x = log(x2), y = y)) +
	  geom_point() +
	  theme_bw()
	# There is a relationship
	# The relationship is only linear if x2 gets log-transformed

	ggplot(data = reg_data, aes(x = x3, y = y)) +
	  geom_point() +
	  theme_bw()
	# There is no relationship, the variable should not be included

	# Step 3: Fit the model----------
	# Two equivalent ways, either do transformation in the formula:
	lin_model_quad <- lm(y ~ x1 + I(x1^2) + log(x2), data = reg_data)
	# Or add the transformed variables to the data
	# (this overwrites the model object fitted on the line above):
	reg_data <- reg_data %>%
	  mutate(
	    x1_squared = x1^2,
	    x2_log = log(x2)
	  )
	lin_model_quad <- lm(y ~ x1 + x1_squared + x2_log, data = reg_data)

	summary(lin_model_quad)


	# Step 4 (if needed): assess the model----------
	# Residuals against fitted values; the horizontal line marks zero.
	tibble(
	  "Fitted" = lin_model_quad$fitted.values,
	  "Residuals" = lin_model_quad$residuals
	) %>%
	  ggplot(aes(x = Fitted, y = Residuals)) +
	  geom_hline(yintercept = 0) +
	  geom_point() +
	  theme_bw()

	# If you omitted the quadratic term:
	lin_model_lin <- lm(y ~ x1 + log(x2), data = reg_data)
	tibble(
	  "Fitted" = lin_model_lin$fitted.values,
	  "Residuals" = lin_model_lin$residuals
	) %>%
	  ggplot(aes(x = Fitted, y = Residuals)) +
	  geom_hline(yintercept = 0) +
	  geom_point() +
	  theme_bw()


	# Hint: Good to have those prepared;)
	# The two templates below are kept as comments because their arguments are
	# still blank: without data and aesthetics they cannot produce a plot, so
	# leaving them active gets in the way of running this script from top to
	# bottom. To use one, copy it, remove the leading "# " and fill in the
	# blanks.

	# Template 1: scatter plot of two variables
	# ggplot(data = , aes(x = , y = )) +
	#   geom_point() + theme_bw()

	# Template 2: residuals against fitted values of a model
	# tibble("Fitted" = , "Residuals" = ) %>%
	#   ggplot(aes(x = Fitted, y = Residuals)) +
	#   geom_hline(yintercept = 0) +
	#   geom_point() + theme_bw()
	here::i_am("inclass-wrangling-solutions.R")
	library(here)
	library(dplyr)
	library(tidyr)
	library(data.table)

	# In class solution for data wrangling----------------
	# NOTE(review): this tab-indented section repeats the data wrangling
	# script found earlier in this file; consider keeping only one copy.
	# Read the raw data and convert it to a tibble. The path is built with
	# here() so that it is resolved against the project root instead of the
	# current working directory.
	# header = TRUE tells fread() to use the first row as column names.
	raw_data <- data.table::fread(
	  here("data/wrangel_1.csv"),
	  header = TRUE
	) %>%
	  as_tibble()
	head(raw_data)

	# Stack every column except `country` and `name` into `year` /
	# `observation`, then spread the entries of `name` into separate columns.
	tidy_data <- raw_data %>%
	  tidyr::pivot_longer(
	    cols = -all_of(c("country", "name")),
	    # cols = as.character(seq(2005, 2020)),
	    names_to = "year",
	    values_to = "observation"
	  ) %>%
	  tidyr::pivot_wider(
	    names_from = "name",
	    values_from = "observation"
	  )
	head(tidy_data)

	# Country-wise averages of the three indicators, ignoring missing values.
	mean_data <- tidy_data %>%
	  group_by(country) %>%
	  summarise(
	    GrowthMean = mean(Growth, na.rm = TRUE),
	    EducSpendingMean = mean(EducationSpending, na.rm = TRUE),
	    HealthSpendingMean = mean(HealthSpending, na.rm = TRUE)
	  )
	mean_data

	# Alternative:
	# stack the indicator columns, average per country and indicator, and
	# spread the countries into columns (one row per indicator).
	mean_data_alt <- tidy_data %>%
	  pivot_longer(cols = -c("country", "year")) %>%
	  # pivot_longer(cols = c("Growth", "EducationSpending", "HealthSpending")) %>%
	  summarise(
	    MeanVal = mean(value, na.rm = TRUE),
	    .by = c("country", "name")
	  ) %>%
	  pivot_wider(names_from = "country", values_from = "MeanVal")
	mean_data_alt

	# Alternative:
	# apply the same NA-ignoring mean to all three indicator columns at once.
	mean_data_alt2 <- tidy_data %>%
	  summarise(
	    across(
	      all_of(c("Growth", "EducationSpending", "HealthSpending")),
	      .fns = ~ mean(.x, na.rm = TRUE)
	    ),
	    .by = "country"
	  )
	mean_data_alt2

	# Extension:
	# Compute, for each country, the percentage change of the spending on education
	# from the year 2010 to the year 2018 and save this as a variable called
	# perc_change.
	# Approach: keep only the two years of interest, put them side by side as
	# the columns `2010` and `2018`, and relate their difference to 2010.
	perc_data <- tidy_data %>%
	  select(c("country", "year", "EducationSpending")) %>%
	  dplyr::filter(year %in% c(2010, 2018)) %>%
	  pivot_wider(
	    names_from = "year",
	    values_from = "EducationSpending"
	  ) %>%
	  dplyr::mutate(perc_change = (`2018` - `2010`) / `2010` * 100)