Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save graebnerc/8b1eeed370bd3b20be8bf8ff5c1a110f to your computer and use it in GitHub Desktop.
Save graebnerc/8b1eeed370bd3b20be8bf8ff5c1a110f to your computer and use it in GitHub Desktop.
T8: Lecture script and exercise solutions
Contains the script used in the lecture on data wrangling and solutions to the exercises (in the fall semester 2022/23).
For a more extensive and commented version see the lecture notes on the course homepage.
The data is also available via the course homepage.
here::i_am("R/T8-Exercise-1-solution.R") # Adjust to your folder structure
library(here)
library(dplyr)
library(tidyr)
library(tibble)
library(data.table)
file_path <- here::here("data/raw/exercise_1.csv")
ex1_data <- data.table::fread(file = file_path)
ex1_data <- tibble::as_tibble(ex1_data)
ex1_data_filtered <- ex1_data %>%
dplyr::filter(
country %in% c("Germany", "Greece"),
year %in% seq(1995, 2015),
year >= 1995, year <= 2015 # equivalent to row above
)
ex1_data_tidy <- ex1_data_filtered %>%
tidyr::pivot_wider(
names_from = "indicator",
values_from = "values")
tidy_path <- "data/tidy/ex1_solution.csv"
data.table::fwrite(x = ex1_data_tidy, file = tidy_path)
# Alternative formulation combining filtering and pivoting using the pipe:
ex1_data %>%
dplyr::filter(
country %in% c("Germany", "Greece"),
year %in% seq(1995, 2015),
year >= 1995, year <= 2015 # equivalent to row above
) %>%
tidyr::pivot_wider(
names_from = "indicator",
values_from = "values")
here::i_am("R/T8-Exercise-2-Solution.R")
library(here)
library(dplyr)
library(tidyr)
library(data.table)
library(ggplot2) # only for the bonus
# Import the data----------------------
file_path <- here::here("data/raw/exercise_2.csv")
ex2_data <- data.table::fread(file = file_path)
ex2_data <- tibble::as_tibble(ex2_data)
# Wrangle the data---------------------
cols_to_keep <- c("country", "year", "gdp", "share_indus", "co2")
ex2_data_final <- ex2_data %>%
dplyr::select(
dplyr::all_of(cols_to_keep)
# or: -dplyr::all_of(c("unemp"))
) %>%
dplyr::mutate(share_indus=share_indus/100) %>%
dplyr::filter(year>=2010, year<=2018) %>%
pivot_longer(
cols = dplyr::all_of(c("gdp", "share_indus", "co2")),
names_to = "indicator",
values_to = "value") %>%
dplyr::group_by(country, indicator) %>%
dplyr::summarise(
time_avg=mean(value, na.rm=TRUE),
.groups = "drop" # Not strictly necessary, but good practice
)
# Bonus: make a plot from the data-----
ex2_plot <- ex2_data_final %>%
dplyr::filter(indicator=="co2") %>%
ggplot2::ggplot(
data = .,
mapping = aes(x=indicator,
y = time_avg,
color=country,
fill=country)
) +
geom_bar(
stat = "identity",
position = position_dodge(),
alpha=0.75) +
theme_bw() +
labs(
title = "Average CO2 emissions (2010-2018)",
y = "avg. emissions per capita",
caption = "Data: World Bank.") +
scale_y_continuous(expand = expansion()) +
scale_fill_brewer(
palette = "Set1", aesthetics = c("color", "fill")) +
theme(
legend.title = element_blank(),
legend.position = "bottom",
axis.title.x = element_blank(),
axis.ticks.x = element_blank(),
axis.text.x = element_blank()
)
ggsave(plot = ex2_plot,
filename = here("output/T8-Exercise2.pdf"),
width = 4, height = 3)
here::i_am("R/T8-SessionNotes.R")
# Adjust to your directory structure
library(here)
library(dplyr)
library(tidyr)
library(data.table)
# This is the script used during the lecture.
# For a more extensive and commented
# version see the lecture notes on the course homepage.
# Data is available via the course homepage.
# 1. Reshaping data from long to wide------------
data_raw_long <- fread(
file = here("data/raw/wrangling_slides_long.csv"))
data_raw_long <- tibble::as_tibble(data_raw_long)
head(data_raw_long)
# 1.1. Long to wide data-------------------------
data_raw_wide <- tidyr::pivot_wider(
data = data_raw_long,
names_from = "variable",
values_from = "value")
data_raw_wide
# 1.2. Wide to long data-------------------------
data_raw_long_2 <- tidyr::pivot_longer(
data = data_raw_wide,
cols = all_of(c("unemp", "gdp", "gini")),
names_to = "new_variable",
values_to = "new_values")
data_raw_long_2
# Digression: use of tidy selection helpers:
data_raw_long_3 <- tidyr::pivot_longer(
data = data_raw_wide,
cols = starts_with("g"), # <- tidy selector
names_to = "new_variable",
values_to = "new_values")
data_raw_long_3
# More info:
# https://dplyr.tidyverse.org/reference/select.html
# 2. Chaining wrangling tasks using pipes--------
chain_1 <- tidyr::pivot_longer(
data = data_raw_wide,
cols = c("gdp", "gini","unemp"),
names_to = "indicator",
values_to = "val")
chain_2 <- tidyr::pivot_wider(
data = chain_1,
names_from = "year",
values_from = "val")
chain_complete <- pipe_data_raw %>%
tidyr::pivot_longer(
data = .,
cols = c("gdp", "gini", "unemp"),
names_to = "indicator",
values_to = "val") %>%
tidyr::pivot_wider(
data = .,
names_from = "year",
values_from = "val")
chain_complete
# Without dots:
chain_complete <- pipe_data_raw %>%
tidyr::pivot_longer(
cols = c("gdp", "gini", "unemp"),
names_to = "indicator",
values_to = "val") %>%
tidyr::pivot_wider(
names_from = "year",
values_from = "val")
chain_complete
# 3. Filtering rows------------------------------
data_raw_long_ger <- data_raw_long %>%
dplyr::filter(
country == "Germany",
country %in% c("Germany"), # equivalent to row above
variable %in% c("unemp", "gdp"),
variable == "unemp" | variable == "gdp" # equivalent to row above
)
data_raw_long_ger
# 4. Selecting columns---------------------------
data_red <- data_raw_wide %>%
dplyr::select(-all_of(c("unemp", "gini")))
# Equivalent:
data_red <- data_raw_wide %>%
dplyr::select(all_of(c("country", "year", "gini")))
# Note that you can use selection helpers as above
# 5. Creating or manipulating variables----------
data_red <- data_red %>%
dplyr::mutate(gdp_thousands = gdp/1000)
data_red
data_red_2 <- data_red %>%
dplyr::mutate(gdp = gdp/1000)
data_red_2
# 6. Grouping and summarizing data---------------
data_grouped <- data_raw_wide %>%
dplyr::group_by(country)
data_grouped
data_summarized <- data_raw_wide %>%
dplyr::group_by(country) %>%
dplyr::summarise(
unemp_mean = mean(unemp)
) %>%
dplyr::ungroup()
data_summarized
# Shorter version using .groups = "drop":
data_summarized <- data_raw_wide %>%
dplyr::group_by(country) %>%
dplyr::summarise(
unemp_mean = mean(unemp),
.groups = "drop" # to remove grouping
)
data_summarized
# Summarizing several columns using dplyr::across():
data_summarized_full <- data_raw_wide %>%
dplyr::group_by(country) %>%
dplyr::summarise(dplyr::across(
.cols = tidyr::starts_with("g"),
.fns = ~ mean(.x, na.rm=TRUE)),
.groups = "drop" # to remove grouping
)
data_summarized_full
# 7. Merging data sets---------------------------
swiid_join <- data.table::fread(
file = here("data/raw/wrangling_slides_join_gini.csv")) %>%
tibble::as_tibble(.)
swiid_join
gdp_join <- data.table::fread(
file = here("data/raw/wrangling_slides_join_gdp.csv")) %>%
tibble::as_tibble(.)
gdp_join
# left_join()
dplyr::left_join(
x = swiid_join,
y = gdp_join,
by = c("country"="country", "year"="year")
# Alternative: by = c("country", "year")
)
# right_join()
dplyr::right_join(
x = swiid_join,
y = gdp_join,
by = c("country", "year")
)
# full_join()
dplyr::full_join(
x = swiid_join,
y = gdp_join,
by = c("country", "year")
)
# inner_join()
dplyr::inner_join(
x = swiid_join,
y = gdp_join,
by = c("country", "year")
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment