Skip to content

Instantly share code, notes, and snippets.

@graebnerc
Last active May 6, 2024 19:33
Show Gist options
  • Save graebnerc/029ca3b65025478f19f75ebfda2c1292 to your computer and use it in GitHub Desktop.
Save graebnerc/029ca3b65025478f19f75ebfda2c1292 to your computer and use it in GitHub Desktop.
Solutions for the exercises in the session on data preparation.
Solutions for the exercises in the session on data preparation.
here::i_am("R/DataPrep-Exercise-1.R") # Adjust to your folder structure
library(here)
library(dplyr)
library(tidyr)
library(tibble)
library(data.table)
file_path <- here::here("data/raw/exercise_1.csv")
ex1_data <- data.table::fread(file = file_path)
ex1_data <- tibble::as_tibble(ex1_data)
ex1_data_filtered <- ex1_data %>%
dplyr::filter(
country %in% c("Germany", "Greece"),
year %in% seq(1995, 2015),
year >= 1995, year <= 2015 # equivalent to row above
)
ex1_data_tidy <- ex1_data_filtered %>%
tidyr::pivot_wider(
names_from = "indicator",
values_from = "values")
tidy_path <- "data/tidy/ex1_solution.csv"
data.table::fwrite(x = ex1_data_tidy, file = tidy_path)
# Alternative formulation combining filtering and pivoting using the pipe:
ex1_data %>%
dplyr::filter(
country %in% c("Germany", "Greece"),
year %in% seq(1995, 2015),
year >= 1995, year <= 2015 # equivalent to row above
) %>%
tidyr::pivot_wider(
names_from = "indicator",
values_from = "values")
here::i_am("R/DataPrep-Exercise-2.R")
library(here)
library(dplyr)
library(tidyr)
library(data.table)
library(ggplot2) # only for the bonus
# Import the data----------------------
file_path <- here::here("data/raw/exercise_2.csv")
ex2_data <- data.table::fread(file = file_path)
ex2_data <- tibble::as_tibble(ex2_data)
# Wrangle the data---------------------
cols_to_keep <- c("country", "year", "gdp", "share_indus", "co2")
ex2_data_final <- ex2_data %>%
dplyr::select(
dplyr::all_of(cols_to_keep)
# or: -dplyr::all_of(c("unemp"))
) %>%
dplyr::mutate(share_indus=share_indus/100) %>%
dplyr::filter(year>=2010, year<=2018) %>%
pivot_longer(
cols = dplyr::all_of(c("gdp", "share_indus", "co2")),
names_to = "indicator",
values_to = "value") %>%
dplyr::group_by(country, indicator) %>%
dplyr::summarise(
time_avg=mean(value, na.rm=TRUE),
.groups = "drop" # Not strictly necessary, but good practice
)
tidy_path <- "data/tidy/ex2_solution.csv"
data.table::fwrite(x = ex2_data_final, file = tidy_path)
# Bonus: make a plot from the data-----
ex2_plot <- ex2_data_final %>%
dplyr::filter(indicator=="co2") %>%
ggplot2::ggplot(
data = .,
mapping = aes(x=indicator,
y = time_avg,
color=country,
fill=country)
) +
geom_bar(
stat = "identity",
position = position_dodge(),
alpha=0.75) +
theme_bw() +
labs(
title = "Average CO2 emissions (2010-2018)",
y = "avg. emissions per capita",
caption = "Data: World Bank.") +
scale_y_continuous(expand = expansion()) +
scale_fill_brewer(
palette = "Set1", aesthetics = c("color", "fill")) +
theme(
legend.title = element_blank(),
legend.position = "bottom",
axis.title.x = element_blank(),
axis.ticks.x = element_blank(),
axis.text.x = element_blank()
)
ggsave(plot = ex2_plot,
filename = here("output/ex2_solution.pdf"),
width = 4, height = 3)
here::i_am("R/DataPrep-Intermediate-Exercises.R")
# Adjust to your directory structure
library(here)
library(dplyr)
library(tidyr)
library(data.table)
library(DataScienceExercises)
# Data is available via the course homepage.
# 1. Short recap on reshaping------------
data_raw_long <- fread(
file = here("data/raw/data_raw_long.csv"))
data_raw_long <- tibble::as_tibble(data_raw_long)
head(data_raw_long)
data_raw_long_w <- tidyr::pivot_wider(
data = data_raw_long,
names_from = "country",
values_from = "value")
data_raw_long_w
data_raw_wide <- fread(
file = here("data/raw/data_raw_wide.csv"), header = TRUE)
data_raw_wide <- tibble::as_tibble(data_raw_wide)
head(data_raw_wide)
data_raw_wide_l <- data_raw_wide %>%
tidyr::pivot_longer(
cols = -country,
names_to = "year",
values_to = "gini")
# 2. Short recap on manipulation basics------------
wine_data <- tibble::as_tibble(DataScienceExercises::wine2dine)
# Filter the data set such that it only contains white wines
wine_1 <- wine_data %>%
dplyr::filter(kind=="white")
# Then remove the column 'kind'
wine_2 <- wine_1 %>%
dplyr::select(-kind)
# Change the type of the column 'quality' into double
wine_3 <- wine_2 %>%
dplyr::mutate(quality=as.double(quality))
# Divide the values in the columns 'alcohol' and 'residual sugar' by 100
wine_4 <- wine_3 %>%
dplyr::mutate(
alcohol = alcohol/100,
`residual sugar` = `residual sugar`/100
)
# Filter the data such that you only keep the wines with the highest quality score
highest_quality <- max(wine_4$quality)
wine_5 <- wine_4 %>%
dplyr::filter(quality >= highest_quality)
# 3. Short recap on summarizing and grouping------------
# Summarise the data by computing the mean alcohol, mean sugar,
# and mean quality of white and red wines
## Alternative 1:
wine_summary_1 <- wine_data %>%
dplyr::group_by(kind) %>%
dplyr::summarise(
alc_mean = mean(alcohol),
sugar_mean = mean(`residual sugar`),
qual_mean = mean(quality))
wine_summary_1
## Alternative 2:
wine_summary_1 <- wine_data %>%
dplyr::summarise(
alc_mean = mean(alcohol),
sugar_mean = mean(`residual sugar`),
qual_mean = mean(quality), .by = "kind")
wine_summary_1
## Alternative 3:
wine_summary_1 <- wine_data %>%
dplyr::summarise(across(.cols = everything(), .fns = mean), .by = "kind")
wine_summary_1
# Compute a variable indicating how the quality of each wine
# deviates from the average quality of all wines.
wine_summary_2 <- wine_data %>%
dplyr::mutate(
average_quality = mean(quality),
quality_deviation = quality - average_quality
)
# 4. Short recap on joining data sets------------
join_x <- tibble::as_tibble(data.table::fread(here("data/raw/join_x.csv")))
join_y <- tibble::as_tibble(data.table::fread(here("data/raw/join_y.csv")))
# Try for yourself what the function inner_join() does.
# How does it differ from left_join(), right_join(), and full_join()?
xy_inner <- dplyr::inner_join(x = join_x, y = join_y, by = c("time", "id"))
# Only keeps rows where there are observations in both data sets -> avoids NA
# In the present case, it produces an empty tibble; for more info check the tutorial
# Consider the data sets join_x.csv and join_y.csv and the function
# dplyr::full_join(). What is the difference of joining on columns time and
# id vs joining only on column id?
xy_full_t_id <- dplyr::full_join(x = join_x, y = join_y, by = c("time", "id"))
xy_full_id <- dplyr::full_join(x = join_x, y = join_y, by = c("id"))
xy_full_t_id
xy_full_id
# 5. Short recap on piping------------
pipedata_v1 <- data.table::fread(here("data/raw/recap-pipes.csv"))
pipedata_v2 <- tidyr::pivot_longer(
data = pipedata_v1,
cols = c("lifeExp", "gdpPercap"),
names_to = "Indicator",
values_to = "Value")
pipedata_v3 <- tidyr::pivot_wider(
data = pipedata_v2,
names_from = "year",
values_from = "Value")
# Piped version:
pipe_data_final <- data.table::fread(here("data/raw/recap-pipes.csv")) %>%
tidyr::pivot_longer(
cols = c("lifeExp", "gdpPercap"),
names_to = "Indicator",
values_to = "Value") %>%
tidyr::pivot_wider(
names_from = "year",
values_from = "Value")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment