graebnerc/#T8: Lecture notes

## #T8: Lecture notes
Lecture notes and solutions to the exercises of session 8 on data wrangling

## T8-Exercise-1-Solution.R
here::i_am("R/exercise_1_solution.R") # Adjust to your folder structure
library(here)
library(dplyr)
library(tidyr)
library(tibble)
library(data.table)

# Import the raw data for exercise 1 and store it as a tibble
file_path <- here::here("data/raw/exercise_1.csv")
ex1_data <- tibble::as_tibble(
  data.table::fread(file = file_path)
)

# Keep the rows for Germany and Greece in the years 1995 to 2015.
# All comma-separated conditions must hold at the same time; the year range
# is deliberately given in two notations.
ex1_data_filtered <- dplyr::filter(
  ex1_data,
  country %in% c("Germany", "Greece"),
  year %in% seq(1995, 2015),
  year >= 1995, year <= 2015 # equivalent to row above
)

# Reshape to long format: the gdp and co2 columns are stacked, with the
# former column name stored in "indicator" and the cell content in "values"
ex1_data_filtered_long <- tidyr::pivot_longer(
  data = ex1_data_filtered,
  cols = c("gdp", "co2"),
  names_to = "indicator",
  values_to = "values"
)

## T8-Exercise-2-Solution.R
here::i_am("R/exercise_2_solution.R")
library(here)
library(dplyr)
library(tidyr)
library(data.table)
library(ggplot2) # only for the bonus

# Import the data----------------------
# Read the raw csv file for exercise 2 and store it as a tibble
file_path <- here::here("data/raw/exercise_2.csv")
ex2_data <- tibble::as_tibble(
  data.table::fread(file = file_path)
)

# Wrangle the data---------------------
# Steps: keep the relevant columns, divide share_indus by 100, restrict the
# sample to 2010-2018, reshape to long format and compute the average of
# every indicator for each country.
ex2_data_final <- ex2_data %>%
  dplyr::select(
    dplyr::all_of(c("country", "year", "gdp", "share_indus", "co2"))
    # or: -dplyr::all_of(c("unemp"))
  ) %>%
  dplyr::mutate(share_indus = share_indus / 100) %>%
  dplyr::filter(year >= 2010, year <= 2018) %>%
  tidyr::pivot_longer(
    cols = dplyr::all_of(c("gdp", "share_indus", "co2")),
    names_to = "indicator",
    values_to = "value"
  ) %>%
  dplyr::group_by(country, indicator) %>%
  dplyr::summarise(
    # na.rm = TRUE: missing observations are ignored when averaging
    time_avg = mean(value, na.rm = TRUE),
    .groups = "drop" # remove all grouping from the result
  )

# Bonus: make a plot from the data-----
# Bar chart of the time-averaged CO2 value of each country; only the "co2"
# rows of ex2_data_final enter the plot.
ex2_plot <- ex2_data_final %>%
  dplyr::filter(indicator == "co2") %>%
  ggplot2::ggplot(
    mapping = ggplot2::aes(
      x = indicator,
      y = time_avg,
      color = country,
      fill = country
    )
  ) +
  # geom_col() draws bars whose height is taken directly from the data
  ggplot2::geom_col(
    position = ggplot2::position_dodge(),
    alpha = 0.75
  ) +
  ggplot2::theme_bw() +
  ggplot2::labs(
    title = "Average CO2 emissions (2010-2018)",
    y = "avg. emissions per capita",
    caption = "Data: World Bank."
  ) +
  # Default expansion() adds no padding, so the bars start at the axis
  ggplot2::scale_y_continuous(expand = ggplot2::expansion()) +
  # Apply the same palette to the outline (color) and the fill of the bars
  ggplot2::scale_fill_brewer(
    palette = "Set1", aesthetics = c("color", "fill")
  ) +
  # x holds a single category after filtering, so its decoration is hidden
  ggplot2::theme(
    legend.title = ggplot2::element_blank(),
    legend.position = "bottom",
    axis.title.x = ggplot2::element_blank(),
    axis.ticks.x = ggplot2::element_blank(),
    axis.text.x = ggplot2::element_blank()
  )
# Save the plot as a pdf file.
# Make sure the target folder exists first, so that saving does not fail in
# a project in which "output" has not been created yet.
dir.create(here::here("output"), showWarnings = FALSE, recursive = TRUE)
ggplot2::ggsave(
  plot = ex2_plot,
  filename = here::here("output/T8-Exercise2.pdf"),
  width = 4, height = 3
)

## T8-Lecture-Notes.R
here::i_am("R/T8-Session-Notes.R") # Adjust to your directory structure
library(here)
library(dplyr)
library(tidyr)
library(data.table)

# This is the script used during the lecture. For a more extensive and commented
# version see the lecture notes on the course homepage

# Read in data (all available via the course homepage)
# Each file is located via here::here() and imported with fread()
gini_red <- data.table::fread(
  file = here::here("data/raw/wrangling_slides_gini.csv")
)
data_raw <- data.table::fread(
  file = here::here("data/raw/wrangling_slides.csv")
)
data_raw_long <- data.table::fread(
  file = here::here("data/raw/wrangling_slides_long.csv")
)
data_final_expl <- data.table::fread(
  file = here::here("data/raw/wrangling_slides_final_expl.csv")
)
swiid_join <- data.table::fread(
  file = here::here("data/raw/wrangling_slides_gini_grc.csv")
)

# 1. Reshaping data between long and wide format
# 1.1. Long to wide data
# Every distinct entry of "variable" becomes a column of its own, filled
# with the matching entries of "value"
data_raw_long # input, printed for comparison
data_raw_wide <- tidyr::pivot_wider(
  data = data_raw_long,
  names_from = "variable",
  values_from = "value"
)
data_raw_wide # result

# 1.2. Wide to long data
# Variant 1: list the columns to be stacked explicitly
data_raw_long_2 <- tidyr::pivot_longer(
  data = data_raw_wide,
  cols = tidyr::all_of(c("unemp", "gdp", "gini")),
  names_to = "new_variable",
  values_to = "new_values"
)
data_raw_long_2

# Variant 2: choose the columns with a selection helper; of the three
# columns named above, starts_with("g") picks gdp and gini
data_raw_long_3 <- tidyr::pivot_longer(
  data = data_raw_wide,
  cols = tidyr::starts_with("g"),
  names_to = "new_variable",
  values_to = "new_values"
)
data_raw_long_3

# 2. Chaining wrangling tasks using pipes
# NOTE(review): pipe_data_raw is not assigned anywhere in the visible script
# (none of the fread() calls above creates it), so this section would stop
# with an "object not found" error unless it is defined elsewhere.
# TODO confirm which data set is meant and load it before this point.

# Variant A: separate steps with named intermediate results.
# Step 1: stack the gdp and unemp columns into indicator/val pairs
chain_1 <- tidyr::pivot_longer(
  data = pipe_data_raw,
  cols = c("gdp", "unemp"),
  names_to = "indicator",
  values_to = "val")

# Step 2: turn the entries of year into columns that are filled with val
chain_2 <- tidyr::pivot_wider(
  data = chain_1,
  names_from = "year",
  values_from = "val")

# Variant B: the same two steps written as one pipe; the dot marks the place
# where the left-hand side of %>% is inserted
chain_complete <- pipe_data_raw %>%
  tidyr::pivot_longer(
    data = .,
    cols = c("gdp", "unemp"),
    names_to = "indicator",
    values_to = "val") %>%
  tidyr::pivot_wider(
    data = .,
    names_from = "year",
    values_from = "val")
chain_complete

# Without dots:
# %>% hands the left-hand side to the first argument (here: data) by default
chain_complete <- pipe_data_raw %>%
  tidyr::pivot_longer(
    cols = c("gdp", "unemp"),
    names_to = "indicator",
    values_to = "val") %>%
  tidyr::pivot_wider(
    names_from = "year",
    values_from = "val")
chain_complete

# 3. Filtering rows

# Comma-separated conditions are combined with a logical AND. Each condition
# is written in two notations to show that they are interchangeable.
data_raw_long_ger <- data_raw_long %>%
  dplyr::filter(
    country == "Germany",
    country %in% c("Germany"), # equivalent to row above
    variable %in% c("unemp", "gdp"),
    variable == "unemp" | variable == "gdp" # equivalent to row above
  )
data_raw_long_ger # print the filtered data

# Exercise 1
# See solution online:
# https://gist.github.com/graebnerc/f635b4e3cfcbcc01b4511f26d0561251

# 4. Creating or manipulating variables
# TBA

# 5. Selecting columns
# TBA

# 6. Grouping and summarising data
# TBA

# Exercise 2
# TBA
	here::i_am("R/exercise_1_solution.R") # Adjust to your folder structure
	library(here)
	library(dplyr)
	library(tidyr)
	library(tibble)
	library(data.table)

	# Import the raw data for exercise 1 and store it as a tibble
	file_path <- here::here("data/raw/exercise_1.csv")
	ex1_data <- tibble::as_tibble(
	  data.table::fread(file = file_path)
	)

	# Keep the rows for Germany and Greece in the years 1995 to 2015.
	# All comma-separated conditions must hold at the same time; the year range
	# is deliberately given in two notations.
	ex1_data_filtered <- dplyr::filter(
	  ex1_data,
	  country %in% c("Germany", "Greece"),
	  year %in% seq(1995, 2015),
	  year >= 1995, year <= 2015 # equivalent to row above
	)

	# Reshape to long format: the gdp and co2 columns are stacked, with the
	# former column name stored in "indicator" and the cell content in "values"
	ex1_data_filtered_long <- tidyr::pivot_longer(
	  data = ex1_data_filtered,
	  cols = c("gdp", "co2"),
	  names_to = "indicator",
	  values_to = "values"
	)
	here::i_am("R/exercise_2_solution.R")
	library(here)
	library(dplyr)
	library(tidyr)
	library(data.table)
	library(ggplot2) # only for the bonus

	# Import the data----------------------
	# Read the raw csv file for exercise 2 and store it as a tibble
	file_path <- here::here("data/raw/exercise_2.csv")
	ex2_data <- tibble::as_tibble(
	  data.table::fread(file = file_path)
	)

	# Wrangle the data---------------------
	# Steps: keep the relevant columns, divide share_indus by 100, restrict the
	# sample to 2010-2018, reshape to long format and compute the average of
	# every indicator for each country.
	ex2_data_final <- ex2_data %>%
	  dplyr::select(
	    dplyr::all_of(c("country", "year", "gdp", "share_indus", "co2"))
	    # or: -dplyr::all_of(c("unemp"))
	  ) %>%
	  dplyr::mutate(share_indus = share_indus / 100) %>%
	  dplyr::filter(year >= 2010, year <= 2018) %>%
	  tidyr::pivot_longer(
	    cols = dplyr::all_of(c("gdp", "share_indus", "co2")),
	    names_to = "indicator",
	    values_to = "value"
	  ) %>%
	  dplyr::group_by(country, indicator) %>%
	  dplyr::summarise(
	    # na.rm = TRUE: missing observations are ignored when averaging
	    time_avg = mean(value, na.rm = TRUE),
	    .groups = "drop" # remove all grouping from the result
	  )

	# Bonus: make a plot from the data-----
	# Bar chart of the time-averaged CO2 value of each country; only the "co2"
	# rows of ex2_data_final enter the plot.
	ex2_plot <- ex2_data_final %>%
	  dplyr::filter(indicator == "co2") %>%
	  ggplot2::ggplot(
	    mapping = ggplot2::aes(
	      x = indicator,
	      y = time_avg,
	      color = country,
	      fill = country
	    )
	  ) +
	  # geom_col() draws bars whose height is taken directly from the data
	  ggplot2::geom_col(
	    position = ggplot2::position_dodge(),
	    alpha = 0.75
	  ) +
	  ggplot2::theme_bw() +
	  ggplot2::labs(
	    title = "Average CO2 emissions (2010-2018)",
	    y = "avg. emissions per capita",
	    caption = "Data: World Bank."
	  ) +
	  # Default expansion() adds no padding, so the bars start at the axis
	  ggplot2::scale_y_continuous(expand = ggplot2::expansion()) +
	  # Apply the same palette to the outline (color) and the fill of the bars
	  ggplot2::scale_fill_brewer(
	    palette = "Set1", aesthetics = c("color", "fill")
	  ) +
	  # x holds a single category after filtering, so its decoration is hidden
	  ggplot2::theme(
	    legend.title = ggplot2::element_blank(),
	    legend.position = "bottom",
	    axis.title.x = ggplot2::element_blank(),
	    axis.ticks.x = ggplot2::element_blank(),
	    axis.text.x = ggplot2::element_blank()
	  )
	# Save the plot as a pdf file.
	# Make sure the target folder exists first, so that saving does not fail in
	# a project in which "output" has not been created yet.
	dir.create(here::here("output"), showWarnings = FALSE, recursive = TRUE)
	ggplot2::ggsave(
	  plot = ex2_plot,
	  filename = here::here("output/T8-Exercise2.pdf"),
	  width = 4, height = 3
	)
	here::i_am("R/T8-Session-Notes.R") # Adjust to your directory structure
	library(here)
	library(dplyr)
	library(tidyr)
	library(data.table)

	# This is the script used during the lecture. For a more extensive and commented
	# version see the lecture notes on the course homepage

	# Read in data (all available via the course homepage)
	# Each file is located via here::here() and imported with fread()
	gini_red <- data.table::fread(
	  file = here::here("data/raw/wrangling_slides_gini.csv")
	)
	data_raw <- data.table::fread(
	  file = here::here("data/raw/wrangling_slides.csv")
	)
	data_raw_long <- data.table::fread(
	  file = here::here("data/raw/wrangling_slides_long.csv")
	)
	data_final_expl <- data.table::fread(
	  file = here::here("data/raw/wrangling_slides_final_expl.csv")
	)
	swiid_join <- data.table::fread(
	  file = here::here("data/raw/wrangling_slides_gini_grc.csv")
	)

	# 1. Reshaping data between long and wide format
	# 1.1. Long to wide data
	# Every distinct entry of "variable" becomes a column of its own, filled
	# with the matching entries of "value"
	data_raw_long # input, printed for comparison
	data_raw_wide <- tidyr::pivot_wider(
	  data = data_raw_long,
	  names_from = "variable",
	  values_from = "value"
	)
	data_raw_wide # result

	# 1.2. Wide to long data
	# Variant 1: list the columns to be stacked explicitly
	data_raw_long_2 <- tidyr::pivot_longer(
	  data = data_raw_wide,
	  cols = tidyr::all_of(c("unemp", "gdp", "gini")),
	  names_to = "new_variable",
	  values_to = "new_values"
	)
	data_raw_long_2

	# Variant 2: choose the columns with a selection helper; of the three
	# columns named above, starts_with("g") picks gdp and gini
	data_raw_long_3 <- tidyr::pivot_longer(
	  data = data_raw_wide,
	  cols = tidyr::starts_with("g"),
	  names_to = "new_variable",
	  values_to = "new_values"
	)
	data_raw_long_3

	# 2. Chaining wrangling tasks using pipes
	# NOTE(review): pipe_data_raw is not assigned anywhere in the visible script
	# (none of the fread() calls above creates it), so this section would stop
	# with an "object not found" error unless it is defined elsewhere.
	# TODO confirm which data set is meant and load it before this point.

	# Variant A: separate steps with named intermediate results.
	# Step 1: stack the gdp and unemp columns into indicator/val pairs
	chain_1 <- tidyr::pivot_longer(
	data = pipe_data_raw,
	cols = c("gdp", "unemp"),
	names_to = "indicator",
	values_to = "val")

	# Step 2: turn the entries of year into columns that are filled with val
	chain_2 <- tidyr::pivot_wider(
	data = chain_1,
	names_from = "year",
	values_from = "val")

	# Variant B: the same two steps written as one pipe; the dot marks the place
	# where the left-hand side of %>% is inserted
	chain_complete <- pipe_data_raw %>%
	tidyr::pivot_longer(
	data = .,
	cols = c("gdp", "unemp"),
	names_to = "indicator",
	values_to = "val") %>%
	tidyr::pivot_wider(
	data = .,
	names_from = "year",
	values_from = "val")
	chain_complete

	# Without dots:
	# %>% hands the left-hand side to the first argument (here: data) by default
	chain_complete <- pipe_data_raw %>%
	tidyr::pivot_longer(
	cols = c("gdp", "unemp"),
	names_to = "indicator",
	values_to = "val") %>%
	tidyr::pivot_wider(
	names_from = "year",
	values_from = "val")
	chain_complete

	# 3. Filtering rows

	# Comma-separated conditions are combined with a logical AND. Each condition
	# is written in two notations to show that they are interchangeable.
	data_raw_long_ger <- data_raw_long %>%
	  dplyr::filter(
	    country == "Germany",
	    country %in% c("Germany"), # equivalent to row above
	    variable %in% c("unemp", "gdp"),
	    # `|` is the element-wise logical OR
	    variable == "unemp" | variable == "gdp" # equivalent to row above
	  )
	data_raw_long_ger # print the filtered data

	# Exercise 1
	# See solution online:
	# https://gist.github.com/graebnerc/f635b4e3cfcbcc01b4511f26d0561251

	# 4. Creating or manipulating variables
	# TBA

	# 5. Selecting columns
	# TBA

	# 6. Grouping and summarising data
	# TBA

	# Exercise 2
	# TBA