Skip to content

Instantly share code, notes, and snippets.

@graebnerc
Last active May 5, 2022 09:16
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save graebnerc/7a054a52cd5799f0adf66c269b040b46 to your computer and use it in GitHub Desktop.
Save graebnerc/7a054a52cd5799f0adf66c269b040b46 to your computer and use it in GitHub Desktop.
T8: Lecture notes
Lecture notes and solutions to the exercises of session 8 on data wrangling
here::i_am("R/exercise_1_solution.R") # Adjust to your folder structure
library(here)
library(dplyr)
library(tidyr)
library(tibble)
library(data.table)
# Import the raw data for exercise 1 and convert the result of fread()
# into a tibble in a single step.
file_path <- here::here("data/raw/exercise_1.csv")
ex1_data <- tibble::as_tibble(data.table::fread(file = file_path))
# Keep the rows for Germany and Greece in the years 1995 to 2015.
# Every condition handed to filter() must hold for a row to be kept, so the
# two alternative ways of writing the year restriction can stand side by side.
ex1_data_filtered <- dplyr::filter(
  ex1_data,
  country %in% c("Germany", "Greece"),
  year %in% seq(1995, 2015),
  year >= 1995, year <= 2015 # equivalent to row above
)
# Reshape to long format: the columns `gdp` and `co2` are stacked, with their
# names stored in `indicator` and their contents stored in `values`.
ex1_data_filtered_long <- tidyr::pivot_longer(
  data = ex1_data_filtered,
  cols = c("gdp", "co2"),
  names_to = "indicator",
  values_to = "values"
)
here::i_am("R/exercise_2_solution.R")
library(here)
library(dplyr)
library(tidyr)
library(data.table)
library(ggplot2) # only for the bonus
# Import the data----------------------
# Read the raw csv file for exercise 2 and turn it into a tibble right away
file_path <- here::here("data/raw/exercise_2.csv")
ex2_data <- tibble::as_tibble(data.table::fread(file = file_path))
# Wrangle the data---------------------
# Pipeline, step by step:
#   1. keep the five columns of interest
#   2. divide `share_indus` by 100
#   3. keep the years 2010 to 2018
#   4. stack the three indicator columns into long format
#   5. compute the mean of each indicator for each country
ex2_data_final <- ex2_data %>%
  dplyr::select(
    dplyr::all_of(c("country", "year", "gdp", "share_indus", "co2"))
    # or: -dplyr::all_of(c("unemp"))
  ) %>%
  dplyr::mutate(share_indus = share_indus / 100) %>%
  dplyr::filter(year >= 2010, year <= 2018) %>%
  tidyr::pivot_longer(
    cols = dplyr::all_of(c("gdp", "share_indus", "co2")),
    names_to = "indicator",
    values_to = "value"
  ) %>%
  dplyr::group_by(country, indicator) %>%
  dplyr::summarise(
    time_avg = mean(value, na.rm = TRUE),
    .groups = "drop" # Not strictly necessary, but good practice
  )
# Bonus: make a plot from the data-----
# Bar chart of the average CO2 values per country.
# After filtering, `indicator` only contains "co2", so the x axis carries no
# information; its title, ticks and labels are removed in theme() below and
# the countries are told apart by colour alone.
ex2_plot <- ex2_data_final %>%
  dplyr::filter(indicator == "co2") %>%
  ggplot2::ggplot(
    mapping = ggplot2::aes(
      x = indicator,
      y = time_avg,
      color = country,
      fill = country
    )
  ) +
  # geom_col() draws bars whose height is the value in the data; it replaces
  # the more verbose geom_bar(stat = "identity")
  ggplot2::geom_col(
    position = ggplot2::position_dodge(),
    alpha = 0.75
  ) +
  ggplot2::theme_bw() +
  ggplot2::labs(
    title = "Average CO2 emissions (2010-2018)",
    y = "avg. emissions per capita",
    caption = "Data: World Bank."
  ) +
  # expansion() without arguments removes the padding around the y axis
  ggplot2::scale_y_continuous(expand = ggplot2::expansion()) +
  # Apply the same palette to the outline (color) and the filling of the bars
  ggplot2::scale_fill_brewer(
    palette = "Set1", aesthetics = c("color", "fill")
  ) +
  ggplot2::theme(
    legend.title = ggplot2::element_blank(),
    legend.position = "bottom",
    axis.title.x = ggplot2::element_blank(),
    axis.ticks.x = ggplot2::element_blank(),
    axis.text.x = ggplot2::element_blank()
  )
# Save the plot as a PDF file. The target folder is created first (nothing
# happens if it already exists), because ggsave() may fail or ask for
# confirmation when asked to write into a directory that does not exist.
ex2_plot_path <- here::here("output/T8-Exercise2.pdf")
dir.create(dirname(ex2_plot_path), recursive = TRUE, showWarnings = FALSE)
ggplot2::ggsave(
  plot = ex2_plot,
  filename = ex2_plot_path,
  width = 4, height = 3
)
here::i_am("R/T8-Session-Notes.R") # Adjust to your directory structure
library(here)
library(dplyr)
library(tidyr)
library(data.table)
# This is the script used during the lecture. For a more extensive and commented
# version see the lecture notes on the course homepage
# Read in data (all available via the course homepage)
# Each data set lives in its own csv file below data/raw
gini_red <- data.table::fread(file = here::here("data/raw/wrangling_slides_gini.csv"))
data_raw <- data.table::fread(file = here::here("data/raw/wrangling_slides.csv"))
data_raw_long <- data.table::fread(file = here::here("data/raw/wrangling_slides_long.csv"))
data_final_expl <- data.table::fread(file = here::here("data/raw/wrangling_slides_final_expl.csv"))
swiid_join <- data.table::fread(file = here::here("data/raw/wrangling_slides_gini_grc.csv"))
# 1. Reshaping data from long to wide
# 1.1. Long to wide data
# Inspect the long data first
data_raw_long
# The entries of `variable` become the new column names, and the entries of
# `value` fill these new columns.
data_raw_wide <- tidyr::pivot_wider(
  data_raw_long,
  names_from = "variable", values_from = "value"
)
data_raw_wide
# 1.2. Wide to long data
# Variant 1: name the columns to be stacked explicitly
data_raw_long_2 <- tidyr::pivot_longer(
  data_raw_wide,
  cols = dplyr::all_of(c("unemp", "gdp", "gini")),
  names_to = "new_variable", values_to = "new_values"
)
data_raw_long_2
# Variant 2: use a selection helper; only columns whose name starts with "g"
# are stacked
data_raw_long_3 <- tidyr::pivot_longer(
  data_raw_wide,
  cols = dplyr::starts_with("g"),
  names_to = "new_variable", values_to = "new_values"
)
data_raw_long_3
# 2. Chaining wrangling tasks using pipes
# NOTE(review): `pipe_data_raw` is used throughout this section but is never
# created in this script, so the section stops with "object 'pipe_data_raw'
# not found". `data_raw` (imported above and otherwise unused) is presumably
# the intended input - confirm that it contains the columns `year`, `gdp`
# and `unemp`.
pipe_data_raw <- data_raw

# Without pipes: every step stores its result in an intermediate object,
# which is then handed to the next step.
chain_1 <- tidyr::pivot_longer(
  data = pipe_data_raw,
  cols = c("gdp", "unemp"),
  names_to = "indicator",
  values_to = "val"
)
chain_2 <- tidyr::pivot_wider(
  data = chain_1,
  names_from = "year",
  values_from = "val"
)

# With pipes: `%>%` hands the result of the left-hand side to the call on the
# right-hand side; the dot marks explicitly where this result is inserted.
chain_complete <- pipe_data_raw %>%
  tidyr::pivot_longer(
    data = .,
    cols = c("gdp", "unemp"),
    names_to = "indicator",
    values_to = "val"
  ) %>%
  tidyr::pivot_wider(
    data = .,
    names_from = "year",
    values_from = "val"
  )
chain_complete

# Without dots: the piped object is used as the first argument
chain_complete <- pipe_data_raw %>%
  tidyr::pivot_longer(
    cols = c("gdp", "unemp"),
    names_to = "indicator",
    values_to = "val"
  ) %>%
  tidyr::pivot_wider(
    names_from = "year",
    values_from = "val"
  )
chain_complete
# 3. Filtering rows
# Keep the German observations of the variables `unemp` and `gdp`.
# Each restriction is written in two interchangeable ways; since all
# conditions must hold at once, listing both forms does not change the result.
data_raw_long_ger <- dplyr::filter(
  data_raw_long,
  country == "Germany",
  country %in% c("Germany"), # equivalent to row above
  variable %in% c("unemp", "gdp"),
  variable == "unemp" | variable == "gdp" # equivalent to row above
)
data_raw_long_ger
# Exercise 1
# See solution online:
# https://gist.github.com/graebnerc/f635b4e3cfcbcc01b4511f26d0561251
# 4. Creating or manipulating variables
# TBA
# 5. Selecting columns
# TBA
# 6. Grouping and summarising data
# TBA
# Exercise 2
# TBA
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment