graebnerc/# Data Preparation: solutions

## # Data Preparation: solutions
Solutions for the exercises in the session on data preparation.

## DataPrep-Exercise-1.R
here::i_am("R/DataPrep-Exercise-1.R") # Adjust to your folder structure
library(here)
library(dplyr)
library(tidyr)
library(tibble)
library(data.table)

file_path <- here::here("data/raw/exercise_1.csv")
ex1_data <- data.table::fread(file = file_path)
ex1_data <- tibble::as_tibble(ex1_data)

ex1_data_filtered <- ex1_data %>%
  dplyr::filter(
    country %in% c("Germany", "Greece"),
    year %in% seq(1995, 2015),
    year >= 1995, year <= 2015 # equivalent to row above
  )

ex1_data_tidy <- ex1_data_filtered %>%
  tidyr::pivot_wider(
    names_from = "indicator",
    values_from = "values")

tidy_path <- "data/tidy/ex1_solution.csv"
data.table::fwrite(x = ex1_data_tidy, file = tidy_path)

# Alternative formulation combining filtering and pivoting using the pipe:

ex1_data %>%
  dplyr::filter(
    country %in% c("Germany", "Greece"),
    year %in% seq(1995, 2015),
    year >= 1995, year <= 2015 # equivalent to row above
  ) %>%
  tidyr::pivot_wider(
    names_from = "indicator",
    values_from = "values")


## DataPrep-Exercise-2.R
here::i_am("R/DataPrep-Exercise-2.R")
library(here)
library(dplyr)
library(tidyr)
library(data.table)
library(ggplot2) # only for the bonus

# Import the data----------------------
file_path <- here::here("data/raw/exercise_2.csv")
ex2_data <- data.table::fread(file = file_path)
ex2_data <- tibble::as_tibble(ex2_data)

# Wrangle the data---------------------
cols_to_keep <- c("country", "year", "gdp", "share_indus", "co2")

ex2_data_final <- ex2_data %>%
  dplyr::select(
    dplyr::all_of(cols_to_keep)
    # or: -dplyr::all_of(c("unemp"))
  ) %>%
  dplyr::mutate(share_indus=share_indus/100) %>%
  dplyr::filter(year>=2010, year<=2018) %>%
  pivot_longer(
    cols = dplyr::all_of(c("gdp", "share_indus", "co2")),
    names_to = "indicator",
    values_to = "value") %>%
  dplyr::group_by(country, indicator) %>%
  dplyr::summarise(
    time_avg=mean(value, na.rm=TRUE),
    .groups = "drop" # Not strictly necessary, but good practice
  )

tidy_path <- "data/tidy/ex2_solution.csv"
data.table::fwrite(x = ex2_data_final, file = tidy_path)

# Bonus: make a plot from the data-----
ex2_plot <- ex2_data_final %>%
  dplyr::filter(indicator=="co2") %>%
  ggplot2::ggplot(
    data = .,
    mapping = aes(x=indicator,
                  y = time_avg,
                  color=country,
                  fill=country)
  ) +
  geom_bar(
    stat = "identity",
    position = position_dodge(),
    alpha=0.75) +
  theme_bw() +
  labs(
    title = "Average CO2 emissions (2010-2018)",
    y = "avg. emissions per capita",
    caption = "Data: World Bank.") +
  scale_y_continuous(expand = expansion()) +
  scale_fill_brewer(
    palette = "Set1", aesthetics = c("color", "fill")) +
  theme(
    legend.title = element_blank(),
    legend.position = "bottom",
    axis.title.x = element_blank(),
    axis.ticks.x = element_blank(),
    axis.text.x = element_blank()
  )

ggsave(plot = ex2_plot,
       filename = here("output/ex2_solution.pdf"),
       width = 4, height = 3)

## DataPrep-IntermediateExs.R
here::i_am("R/DataPrep-Intermediate-Exercises.R")
# Adjust to your directory structure
library(here)
library(dplyr)
library(tidyr)
library(data.table)
library(DataScienceExercises)

# Data is available via the course homepage.

# 1. Short recap on reshaping------------
data_raw_long <- fread(
  file = here("data/raw/data_raw_long.csv"))
data_raw_long <- tibble::as_tibble(data_raw_long)
head(data_raw_long)

data_raw_long_w <- tidyr::pivot_wider(
  data = data_raw_long,
  names_from = "country",
  values_from = "value")
data_raw_long_w

data_raw_wide <- fread(
  file = here("data/raw/data_raw_wide.csv"), header = TRUE)
data_raw_wide <- tibble::as_tibble(data_raw_wide)
head(data_raw_wide)

data_raw_wide_l <- data_raw_wide %>%
  tidyr::pivot_longer(
    cols = -country,
    names_to = "year",
    values_to = "gini")

# 2. Short recap on manipulation basics------------
wine_data <- tibble::as_tibble(DataScienceExercises::wine2dine)

# Filter the data set such that it only contains white wines
wine_1 <- wine_data %>%
  dplyr::filter(kind=="white")

# Then remove the column 'kind'
wine_2 <- wine_1 %>%
  dplyr::select(-kind)

# Change the type of the column 'quality' into double
wine_3 <- wine_2 %>%
  dplyr::mutate(quality=as.double(quality))

# Divide the values in the columns 'alcohol' and 'residual sugar' by 100
wine_4 <- wine_3 %>%
  dplyr::mutate(
    alcohol = alcohol/100,
    `residual sugar` = `residual sugar`/100
  )

# Filter the data such that you only keep the wines with the highest quality score
highest_quality <- max(wine_4$quality)
wine_5 <- wine_4 %>%
  dplyr::filter(quality >= highest_quality)


# 3. Short recap on summarizing and grouping------------
# Summarise the data by computing the mean alcohol, mean sugar,
#  and mean quality of white and red wines

## Alternative 1:
wine_summary_1 <- wine_data %>%
  dplyr::group_by(kind) %>%
  dplyr::summarise(
    alc_mean = mean(alcohol),
    sugar_mean = mean(`residual sugar`),
    qual_mean = mean(quality))
wine_summary_1

## Alternative 2:
wine_summary_1 <- wine_data %>%
  dplyr::summarise(
    alc_mean = mean(alcohol),
    sugar_mean = mean(`residual sugar`),
    qual_mean = mean(quality), .by = "kind")
wine_summary_1

## Alternative 3:
wine_summary_1 <- wine_data %>%
  dplyr::summarise(across(.cols = everything(), .fns = mean), .by = "kind")
wine_summary_1

# Compute a variable indicating how the quality of each wine
#  deviates from the average quality of all wines.

wine_summary_2 <- wine_data %>%
  dplyr::mutate(
    average_quality = mean(quality),
    quality_deviation = quality - average_quality
    )

# 4. Short recap on joining data sets------------
join_x <- tibble::as_tibble(data.table::fread(here("data/raw/join_x.csv")))
join_y <- tibble::as_tibble(data.table::fread(here("data/raw/join_y.csv")))

# Try for yourself what the function inner_join() does.
#  How does it differ from left_join(), right_join(), and full_join()?

xy_inner <- dplyr::inner_join(x = join_x, y = join_y, by = c("time", "id"))
# Only keeps rows where there are observations in both data sets -> avoids NA
#  In the present case, it produces an empty tibble; for more info check the tutorial

# Consider the data sets join_x.csv and join_y.csv and the function
#  dplyr::full_join(). What is the difference of joining on columns time and
#  id vs joining only on column id?
xy_full_t_id <- dplyr::full_join(x = join_x, y = join_y, by = c("time", "id"))
xy_full_id <- dplyr::full_join(x = join_x, y = join_y, by = c("id"))
xy_full_t_id
xy_full_id


# 5. Short recap on piping------------

pipedata_v1 <- data.table::fread(here("data/raw/recap-pipes.csv"))

pipedata_v2 <- tidyr::pivot_longer(
  data = pipedata_v1,
  cols = c("lifeExp", "gdpPercap"),
  names_to = "Indicator",
  values_to = "Value")

pipedata_v3 <- tidyr::pivot_wider(
  data = pipedata_v2,
  names_from = "year",
  values_from = "Value")

# Piped version:

pipe_data_final <- data.table::fread(here("data/raw/recap-pipes.csv")) %>%
  tidyr::pivot_longer(
    cols = c("lifeExp", "gdpPercap"),
    names_to = "Indicator",
    values_to = "Value") %>%
  tidyr::pivot_wider(
    names_from = "year",
    values_from = "Value")
	here::i_am("R/DataPrep-Exercise-1.R") # Adjust to your folder structure
	library(here)
	library(dplyr)
	library(tidyr)
	library(tibble)
	library(data.table)

	file_path <- here::here("data/raw/exercise_1.csv")
	ex1_data <- data.table::fread(file = file_path)
	ex1_data <- tibble::as_tibble(ex1_data)

	ex1_data_filtered <- ex1_data %>%
	dplyr::filter(
	country %in% c("Germany", "Greece"),
	year %in% seq(1995, 2015),
	year >= 1995, year <= 2015 # equivalent to row above
	)

	ex1_data_tidy <- ex1_data_filtered %>%
	tidyr::pivot_wider(
	names_from = "indicator",
	values_from = "values")

	tidy_path <- "data/tidy/ex1_solution.csv"
	data.table::fwrite(x = ex1_data_tidy, file = tidy_path)

	# Alternative formulation combining filtering and pivoting using the pipe:

	ex1_data %>%
	dplyr::filter(
	country %in% c("Germany", "Greece"),
	year %in% seq(1995, 2015),
	year >= 1995, year <= 2015 # equivalent to row above
	) %>%
	tidyr::pivot_wider(
	names_from = "indicator",
	values_from = "values")
	here::i_am("R/DataPrep-Exercise-2.R")
	library(here)
	library(dplyr)
	library(tidyr)
	library(data.table)
	library(ggplot2) # only for the bonus

	# Import the data----------------------
	file_path <- here::here("data/raw/exercise_2.csv")
	ex2_data <- data.table::fread(file = file_path)
	ex2_data <- tibble::as_tibble(ex2_data)

	# Wrangle the data---------------------
	cols_to_keep <- c("country", "year", "gdp", "share_indus", "co2")

	ex2_data_final <- ex2_data %>%
	dplyr::select(
	dplyr::all_of(cols_to_keep)
	# or: -dplyr::all_of(c("unemp"))
	) %>%
	dplyr::mutate(share_indus=share_indus/100) %>%
	dplyr::filter(year>=2010, year<=2018) %>%
	pivot_longer(
	cols = dplyr::all_of(c("gdp", "share_indus", "co2")),
	names_to = "indicator",
	values_to = "value") %>%
	dplyr::group_by(country, indicator) %>%
	dplyr::summarise(
	time_avg=mean(value, na.rm=TRUE),
	.groups = "drop" # Not strictly necessary, but good practice
	)

	tidy_path <- "data/tidy/ex2_solution.csv"
	data.table::fwrite(x = ex2_data_final, file = tidy_path)

	# Bonus: make a plot from the data-----
	ex2_plot <- ex2_data_final %>%
	dplyr::filter(indicator=="co2") %>%
	ggplot2::ggplot(
	data = .,
	mapping = aes(x=indicator,
	y = time_avg,
	color=country,
	fill=country)
	) +
	geom_bar(
	stat = "identity",
	position = position_dodge(),
	alpha=0.75) +
	theme_bw() +
	labs(
	title = "Average CO2 emissions (2010-2018)",
	y = "avg. emissions per capita",
	caption = "Data: World Bank.") +
	scale_y_continuous(expand = expansion()) +
	scale_fill_brewer(
	palette = "Set1", aesthetics = c("color", "fill")) +
	theme(
	legend.title = element_blank(),
	legend.position = "bottom",
	axis.title.x = element_blank(),
	axis.ticks.x = element_blank(),
	axis.text.x = element_blank()
	)

	ggsave(plot = ex2_plot,
	filename = here("output/ex2_solution.pdf"),
	width = 4, height = 3)
	here::i_am("R/DataPrep-Intermediate-Exercises.R")
	# Adjust to your directory structure
	library(here)
	library(dplyr)
	library(tidyr)
	library(data.table)
	library(DataScienceExercises)

	# Data is available via the course homepage.

	# 1. Short recap on reshaping------------
	data_raw_long <- fread(
	file = here("data/raw/data_raw_long.csv"))
	data_raw_long <- tibble::as_tibble(data_raw_long)
	head(data_raw_long)

	data_raw_long_w <- tidyr::pivot_wider(
	data = data_raw_long,
	names_from = "country",
	values_from = "value")
	data_raw_long_w

	data_raw_wide <- fread(
	file = here("data/raw/data_raw_wide.csv"), header = TRUE)
	data_raw_wide <- tibble::as_tibble(data_raw_wide)
	head(data_raw_wide)

	data_raw_wide_l <- data_raw_wide %>%
	tidyr::pivot_longer(
	cols = -country,
	names_to = "year",
	values_to = "gini")

	# 2. Short recap on manipulation basics------------
	wine_data <- tibble::as_tibble(DataScienceExercises::wine2dine)

	# Filter the data set such that it only contains white wines
	wine_1 <- wine_data %>%
	dplyr::filter(kind=="white")

	# Then remove the column 'kind'
	wine_2 <- wine_1 %>%
	dplyr::select(-kind)

	# Change the type of the column 'quality' into double
	wine_3 <- wine_2 %>%
	dplyr::mutate(quality=as.double(quality))

	# Divide the values in the columns 'alcohol' and 'residual sugar' by 100
	wine_4 <- wine_3 %>%
	dplyr::mutate(
	alcohol = alcohol/100,
	`residual sugar` = `residual sugar`/100
	)

	# Filter the data such that you only keep the wines with the highest quality score
	highest_quality <- max(wine_4$quality)
	wine_5 <- wine_4 %>%
	dplyr::filter(quality >= highest_quality)


	# 3. Short recap on summarizing and grouping------------
	# Summarise the data by computing the mean alcohol, mean sugar,
	# and mean quality of white and red wines

	## Alternative 1:
	wine_summary_1 <- wine_data %>%
	dplyr::group_by(kind) %>%
	dplyr::summarise(
	alc_mean = mean(alcohol),
	sugar_mean = mean(`residual sugar`),
	qual_mean = mean(quality))
	wine_summary_1

	## Alternative 2:
	wine_summary_1 <- wine_data %>%
	dplyr::summarise(
	alc_mean = mean(alcohol),
	sugar_mean = mean(`residual sugar`),
	qual_mean = mean(quality), .by = "kind")
	wine_summary_1

	## Alternative 3:
	wine_summary_1 <- wine_data %>%
	dplyr::summarise(across(.cols = everything(), .fns = mean), .by = "kind")
	wine_summary_1

	# Compute a variable indicating how the quality of each wine
	# deviates from the average quality of all wines.

	wine_summary_2 <- wine_data %>%
	dplyr::mutate(
	average_quality = mean(quality),
	quality_deviation = quality - average_quality
	)

	# 4. Short recap on joining data sets------------
	join_x <- tibble::as_tibble(data.table::fread(here("data/raw/join_x.csv")))
	join_y <- tibble::as_tibble(data.table::fread(here("data/raw/join_y.csv")))

	# Try for yourself what the function inner_join() does.
	# How does it differ from left_join(), right_join(), and full_join()?

	xy_inner <- dplyr::inner_join(x = join_x, y = join_y, by = c("time", "id"))
	# Only keeps rows where there are observations in both data sets -> avoids NA
	# In the present case, it produces an empty tibble; for more info check the tutorial

	# Consider the data sets join_x.csv and join_y.csv and the function
	# dplyr::full_join(). What is the difference of joining on columns time and
	# id vs joining only on column id?
	xy_full_t_id <- dplyr::full_join(x = join_x, y = join_y, by = c("time", "id"))
	xy_full_id <- dplyr::full_join(x = join_x, y = join_y, by = c("id"))
	xy_full_t_id
	xy_full_id


	# 5. Short recap on piping------------

	pipedata_v1 <- data.table::fread(here("data/raw/recap-pipes.csv"))

	pipedata_v2 <- tidyr::pivot_longer(
	data = pipedata_v1,
	cols = c("lifeExp", "gdpPercap"),
	names_to = "Indicator",
	values_to = "Value")

	pipedata_v3 <- tidyr::pivot_wider(
	data = pipedata_v2,
	names_from = "year",
	values_from = "Value")

	# Piped version:

	pipe_data_final <- data.table::fread(here("data/raw/recap-pipes.csv")) %>%
	tidyr::pivot_longer(
	cols = c("lifeExp", "gdpPercap"),
	names_to = "Indicator",
	values_to = "Value") %>%
	tidyr::pivot_wider(
	names_from = "year",
	values_from = "Value")