graebnerc/# T8: Lecture script and exercise solutions

## # T8: Lecture script and exercise solutions
Contains the script used in the lecture on data wrangling and solutions to the exercises (in the fall semester 2022/23).
For a more extensive and commented version see the lecture notes on the course homepage.
The data is also available via the course homepage.

## T8-Exercise-1-solution.R
here::i_am("R/T8-Exercise-1-solution.R") # Adjust to your folder structure
library(here)
library(dplyr)
library(tidyr)
library(tibble)
library(data.table)

file_path <- here::here("data/raw/exercise_1.csv")
ex1_data <- data.table::fread(file = file_path)
ex1_data <- tibble::as_tibble(ex1_data)

ex1_data_filtered <- ex1_data %>%
  dplyr::filter(
    country %in% c("Germany", "Greece"),
    year %in% seq(1995, 2015),
    year >= 1995, year <= 2015 # equivalent to row above
  )

ex1_data_tidy <- ex1_data_filtered %>%
  tidyr::pivot_wider(
    names_from = "indicator",
    values_from = "values")

tidy_path <- "data/tidy/ex1_solution.csv"
data.table::fwrite(x = ex1_data_tidy, file = tidy_path)

# Alternative formulation combining filtering and pivoting using the pipe:

ex1_data %>%
  dplyr::filter(
    country %in% c("Germany", "Greece"),
    year %in% seq(1995, 2015),
    year >= 1995, year <= 2015 # equivalent to row above
  ) %>%
  tidyr::pivot_wider(
    names_from = "indicator",
    values_from = "values")

## T8-Exercise-2-Solution.R
here::i_am("R/T8-Exercise-2-Solution.R")
library(here)
library(dplyr)
library(tidyr)
library(data.table)
library(ggplot2) # only for the bonus

# Import the data----------------------
file_path <- here::here("data/raw/exercise_2.csv")
ex2_data <- data.table::fread(file = file_path)
ex2_data <- tibble::as_tibble(ex2_data)

# Wrangle the data---------------------
cols_to_keep <- c("country", "year", "gdp", "share_indus", "co2")

ex2_data_final <- ex2_data %>%
  dplyr::select(
    dplyr::all_of(cols_to_keep)
    # or: -dplyr::all_of(c("unemp"))
  ) %>%
  dplyr::mutate(share_indus=share_indus/100) %>%
  dplyr::filter(year>=2010, year<=2018) %>%
  pivot_longer(
    cols = dplyr::all_of(c("gdp", "share_indus", "co2")),
    names_to = "indicator",
    values_to = "value") %>%
  dplyr::group_by(country, indicator) %>%
  dplyr::summarise(
    time_avg=mean(value, na.rm=TRUE),
    .groups = "drop" # Not strictly necessary, but good practice
    )

# Bonus: make a plot from the data-----
ex2_plot <- ex2_data_final %>%
  dplyr::filter(indicator=="co2") %>%
  ggplot2::ggplot(
    data = .,
    mapping = aes(x=indicator,
                  y = time_avg,
                  color=country,
                  fill=country)
    ) +
  geom_bar(
    stat = "identity",
    position = position_dodge(),
    alpha=0.75) +
  theme_bw() +
  labs(
    title = "Average CO2 emissions (2010-2018)",
    y = "avg. emissions per capita",
    caption = "Data: World Bank.") +
  scale_y_continuous(expand = expansion()) +
  scale_fill_brewer(
    palette = "Set1", aesthetics = c("color", "fill")) +
  theme(
    legend.title = element_blank(),
    legend.position = "bottom",
    axis.title.x = element_blank(),
    axis.ticks.x = element_blank(),
    axis.text.x = element_blank()
  )

ggsave(plot = ex2_plot,
       filename = here("output/T8-Exercise2.pdf"),
       width = 4, height = 3)

## T8-SessionNotes.R
here::i_am("R/T8-SessionNotes.R")
# Adjust to your directory structure
library(here)
library(dplyr)
library(tidyr)
library(data.table)

# This is the script used during the lecture.
# For a more extensive and commented
#  version see the lecture notes on the course homepage.
# Data is available via the course homepage.

# 1. Reshaping data from long to wide------------
data_raw_long <- fread(
  file = here("data/raw/wrangling_slides_long.csv"))
data_raw_long <- tibble::as_tibble(data_raw_long)
head(data_raw_long)

# 1.1. Long to wide data-------------------------
data_raw_wide <- tidyr::pivot_wider(
  data = data_raw_long,
  names_from = "variable",
  values_from = "value")
data_raw_wide

# 1.2. Wide to long data-------------------------
data_raw_long_2 <- tidyr::pivot_longer(
  data = data_raw_wide,
  cols = all_of(c("unemp", "gdp", "gini")),
  names_to = "new_variable",
  values_to = "new_values")
data_raw_long_2

# Digression: use of tidy selection helpers:
data_raw_long_3 <- tidyr::pivot_longer(
  data = data_raw_wide,
  cols = starts_with("g"), # <- tidy selector
  names_to = "new_variable",
  values_to = "new_values")
data_raw_long_3
# More info:
# https://dplyr.tidyverse.org/reference/select.html


# 2. Chaining wrangling tasks using pipes--------

chain_1 <- tidyr::pivot_longer(
  data = data_raw_wide,
  cols = c("gdp", "gini","unemp"),
  names_to = "indicator",
  values_to = "val")

chain_2 <- tidyr::pivot_wider(
  data = chain_1,
  names_from = "year",
  values_from = "val")

chain_complete <- pipe_data_raw %>%
  tidyr::pivot_longer(
    data = .,
    cols = c("gdp", "gini", "unemp"),
    names_to = "indicator",
    values_to = "val") %>%
  tidyr::pivot_wider(
    data = .,
    names_from = "year",
    values_from = "val")
chain_complete

# Without dots:
chain_complete <- pipe_data_raw %>%
  tidyr::pivot_longer(
    cols = c("gdp", "gini", "unemp"),
    names_to = "indicator",
    values_to = "val") %>%
  tidyr::pivot_wider(
    names_from = "year",
    values_from = "val")
chain_complete

# 3. Filtering rows------------------------------

data_raw_long_ger <- data_raw_long %>%
  dplyr::filter(
    country == "Germany",
    country %in% c("Germany"), # equivalent to row above
    variable %in% c("unemp", "gdp"),
    variable == "unemp" | variable == "gdp" # equivalent to row above
  )
data_raw_long_ger

# 4. Selecting columns---------------------------

data_red <- data_raw_wide %>%
  dplyr::select(-all_of(c("unemp", "gini")))

# Equivalent:
data_red <- data_raw_wide %>%
  dplyr::select(all_of(c("country", "year", "gini")))

# Note that you can use selection helpers as above

# 5. Creating or manipulating variables----------

data_red <- data_red %>%
  dplyr::mutate(gdp_thousands = gdp/1000)
data_red

data_red_2 <- data_red %>%
  dplyr::mutate(gdp = gdp/1000)
data_red_2


# 6. Grouping and summarizing data---------------
data_grouped <- data_raw_wide %>%
  dplyr::group_by(country)
data_grouped

data_summarized <- data_raw_wide %>%
  dplyr::group_by(country) %>%
  dplyr::summarise(
    unemp_mean = mean(unemp)
    ) %>%
  dplyr::ungroup()
data_summarized

# Shorter version using .groups = "drop":
data_summarized <- data_raw_wide %>%
  dplyr::group_by(country) %>%
  dplyr::summarise(
    unemp_mean = mean(unemp),
    .groups = "drop" # to remove grouping
  )
data_summarized

# Summarizing several columns using dplyr::across():
data_summarized_full <- data_raw_wide %>%
  dplyr::group_by(country) %>%
  dplyr::summarise(dplyr::across(
      .cols = tidyr::starts_with("g"),
      .fns = ~ mean(.x, na.rm=TRUE)),
    .groups = "drop" # to remove grouping
  )
data_summarized_full

# 7. Merging data sets---------------------------
swiid_join <- data.table::fread(
  file = here("data/raw/wrangling_slides_join_gini.csv")) %>%
  tibble::as_tibble(.)
swiid_join

gdp_join <- data.table::fread(
  file = here("data/raw/wrangling_slides_join_gdp.csv")) %>%
  tibble::as_tibble(.)
gdp_join

# left_join()
dplyr::left_join(
  x = swiid_join,
  y = gdp_join,
  by = c("country"="country", "year"="year")
  # Alternative: by = c("country", "year")
  )

# right_join()
dplyr::right_join(
  x = swiid_join,
  y = gdp_join,
  by = c("country", "year")
)

# full_join()
dplyr::full_join(
  x = swiid_join,
  y = gdp_join,
  by = c("country", "year")
)

# inner_join()
dplyr::inner_join(
  x = swiid_join,
  y = gdp_join,
  by = c("country", "year")
)
	Contains the script used in the lecture on data wrangling and solutions to the exercises (in the fall semester 2022/23).
	For a more extensive and commented version see the lecture notes on the course homepage.
	The data is also available via the course homepage.
	here::i_am("R/T8-Exercise-1-solution.R") # Adjust to your folder structure
	library(here)
	library(dplyr)
	library(tidyr)
	library(tibble)
	library(data.table)

	file_path <- here::here("data/raw/exercise_1.csv")
	ex1_data <- data.table::fread(file = file_path)
	ex1_data <- tibble::as_tibble(ex1_data)

	ex1_data_filtered <- ex1_data %>%
	dplyr::filter(
	country %in% c("Germany", "Greece"),
	year %in% seq(1995, 2015),
	year >= 1995, year <= 2015 # equivalent to row above
	)

	ex1_data_tidy <- ex1_data_filtered %>%
	tidyr::pivot_wider(
	names_from = "indicator",
	values_from = "values")

	tidy_path <- "data/tidy/ex1_solution.csv"
	data.table::fwrite(x = ex1_data_tidy, file = tidy_path)

	# Alternative formulation combining filtering and pivoting using the pipe:

	ex1_data %>%
	dplyr::filter(
	country %in% c("Germany", "Greece"),
	year %in% seq(1995, 2015),
	year >= 1995, year <= 2015 # equivalent to row above
	) %>%
	tidyr::pivot_wider(
	names_from = "indicator",
	values_from = "values")
	here::i_am("R/T8-Exercise-2-Solution.R")
	library(here)
	library(dplyr)
	library(tidyr)
	library(data.table)
	library(ggplot2) # only for the bonus

	# Import the data----------------------
	file_path <- here::here("data/raw/exercise_2.csv")
	ex2_data <- data.table::fread(file = file_path)
	ex2_data <- tibble::as_tibble(ex2_data)

	# Wrangle the data---------------------
	cols_to_keep <- c("country", "year", "gdp", "share_indus", "co2")

	ex2_data_final <- ex2_data %>%
	dplyr::select(
	dplyr::all_of(cols_to_keep)
	# or: -dplyr::all_of(c("unemp"))
	) %>%
	dplyr::mutate(share_indus=share_indus/100) %>%
	dplyr::filter(year>=2010, year<=2018) %>%
	pivot_longer(
	cols = dplyr::all_of(c("gdp", "share_indus", "co2")),
	names_to = "indicator",
	values_to = "value") %>%
	dplyr::group_by(country, indicator) %>%
	dplyr::summarise(
	time_avg=mean(value, na.rm=TRUE),
	.groups = "drop" # Not strictly necessary, but good practice
	)

	# Bonus: make a plot from the data-----
	ex2_plot <- ex2_data_final %>%
	dplyr::filter(indicator=="co2") %>%
	ggplot2::ggplot(
	data = .,
	mapping = aes(x=indicator,
	y = time_avg,
	color=country,
	fill=country)
	) +
	geom_bar(
	stat = "identity",
	position = position_dodge(),
	alpha=0.75) +
	theme_bw() +
	labs(
	title = "Average CO2 emissions (2010-2018)",
	y = "avg. emissions per capita",
	caption = "Data: World Bank.") +
	scale_y_continuous(expand = expansion()) +
	scale_fill_brewer(
	palette = "Set1", aesthetics = c("color", "fill")) +
	theme(
	legend.title = element_blank(),
	legend.position = "bottom",
	axis.title.x = element_blank(),
	axis.ticks.x = element_blank(),
	axis.text.x = element_blank()
	)

	ggsave(plot = ex2_plot,
	filename = here("output/T8-Exercise2.pdf"),
	width = 4, height = 3)
	here::i_am("R/T8-SessionNotes.R")
	# Adjust to your directory structure
	library(here)
	library(dplyr)
	library(tidyr)
	library(data.table)

	# This is the script used during the lecture.
	# For a more extensive and commented
	# version see the lecture notes on the course homepage.
	# Data is available via the course homepage.

	# 1. Reshaping data from long to wide------------
	data_raw_long <- fread(
	file = here("data/raw/wrangling_slides_long.csv"))
	data_raw_long <- tibble::as_tibble(data_raw_long)
	head(data_raw_long)

	# 1.1. Long to wide data-------------------------
	data_raw_wide <- tidyr::pivot_wider(
	data = data_raw_long,
	names_from = "variable",
	values_from = "value")
	data_raw_wide

	# 1.2. Wide to long data-------------------------
	data_raw_long_2 <- tidyr::pivot_longer(
	data = data_raw_wide,
	cols = all_of(c("unemp", "gdp", "gini")),
	names_to = "new_variable",
	values_to = "new_values")
	data_raw_long_2

	# Digression: use of tidy selection helpers:
	data_raw_long_3 <- tidyr::pivot_longer(
	data = data_raw_wide,
	cols = starts_with("g"), # <- tidy selector
	names_to = "new_variable",
	values_to = "new_values")
	data_raw_long_3
	# More info:
	# https://dplyr.tidyverse.org/reference/select.html


	# 2. Chaining wrangling tasks using pipes--------

	chain_1 <- tidyr::pivot_longer(
	data = data_raw_wide,
	cols = c("gdp", "gini","unemp"),
	names_to = "indicator",
	values_to = "val")

	chain_2 <- tidyr::pivot_wider(
	data = chain_1,
	names_from = "year",
	values_from = "val")

	chain_complete <- pipe_data_raw %>%
	tidyr::pivot_longer(
	data = .,
	cols = c("gdp", "gini", "unemp"),
	names_to = "indicator",
	values_to = "val") %>%
	tidyr::pivot_wider(
	data = .,
	names_from = "year",
	values_from = "val")
	chain_complete

	# Without dots:
	chain_complete <- pipe_data_raw %>%
	tidyr::pivot_longer(
	cols = c("gdp", "gini", "unemp"),
	names_to = "indicator",
	values_to = "val") %>%
	tidyr::pivot_wider(
	names_from = "year",
	values_from = "val")
	chain_complete

	# 3. Filtering rows------------------------------

	data_raw_long_ger <- data_raw_long %>%
	dplyr::filter(
	country == "Germany",
	country %in% c("Germany"), # equivalent to row above
	variable %in% c("unemp", "gdp"),
	variable == "unemp" \| variable == "gdp" # equivalent to row above
	)
	data_raw_long_ger

	# 4. Selecting columns---------------------------

	data_red <- data_raw_wide %>%
	dplyr::select(-all_of(c("unemp", "gini")))

	# Equivalent:
	data_red <- data_raw_wide %>%
	dplyr::select(all_of(c("country", "year", "gini")))

	# Note that you can use selection helpers as above

	# 5. Creating or manipulating variables----------

	data_red <- data_red %>%
	dplyr::mutate(gdp_thousands = gdp/1000)
	data_red

	data_red_2 <- data_red %>%
	dplyr::mutate(gdp = gdp/1000)
	data_red_2


	# 6. Grouping and summarizing data---------------
	data_grouped <- data_raw_wide %>%
	dplyr::group_by(country)
	data_grouped

	data_summarized <- data_raw_wide %>%
	dplyr::group_by(country) %>%
	dplyr::summarise(
	unemp_mean = mean(unemp)
	) %>%
	dplyr::ungroup()
	data_summarized

	# Shorter version using .groups = "drop":
	data_summarized <- data_raw_wide %>%
	dplyr::group_by(country) %>%
	dplyr::summarise(
	unemp_mean = mean(unemp),
	.groups = "drop" # to remove grouping
	)
	data_summarized

	# Summarizing several columns using dplyr::across():
	data_summarized_full <- data_raw_wide %>%
	dplyr::group_by(country) %>%
	dplyr::summarise(dplyr::across(
	.cols = tidyr::starts_with("g"),
	.fns = ~ mean(.x, na.rm=TRUE)),
	.groups = "drop" # to remove grouping
	)
	data_summarized_full

	# 7. Merging data sets---------------------------
	swiid_join <- data.table::fread(
	file = here("data/raw/wrangling_slides_join_gini.csv")) %>%
	tibble::as_tibble(.)
	swiid_join

	gdp_join <- data.table::fread(
	file = here("data/raw/wrangling_slides_join_gdp.csv")) %>%
	tibble::as_tibble(.)
	gdp_join

	# left_join()
	dplyr::left_join(
	x = swiid_join,
	y = gdp_join,
	by = c("country"="country", "year"="year")
	# Alternative: by = c("country", "year")
	)

	# right_join()
	dplyr::right_join(
	x = swiid_join,
	y = gdp_join,
	by = c("country", "year")
	)

	# full_join()
	dplyr::full_join(
	x = swiid_join,
	y = gdp_join,
	by = c("country", "year")
	)

	# inner_join()
	dplyr::inner_join(
	x = swiid_join,
	y = gdp_join,
	by = c("country", "year")
	)