Skip to content

Instantly share code, notes, and snippets.

@graebnerc
Last active April 23, 2023 08:49
Show Gist options
  • Save graebnerc/0bb370c67d51de50b8e1b591c2ac96c3 to your computer and use it in GitHub Desktop.
Save graebnerc/0bb370c67d51de50b8e1b591c2ac96c3 to your computer and use it in GitHub Desktop.
Scrips and exercise solutionat accompanying the videos for sessions 8 and 9 on data preparation.
Scrips and exercise solutionat accompanying the videos for sessions 8 and 9 on data preparation.
The data is available via the course homepage.
here::i_am("R/FinalApplication.R")
library(data.table)
library(here)
library(tidyr)
library(dplyr)
data_final_expl <- fread(
file = here("data/wrangling_data_final_expl.csv"))
data_final_expl <- as_tibble(data_final_expl)
# Compute the difference in the country averages of the
# variables for the time periods 2005-2007 and 2010-2013.
# Add grouping variable
solution_step1 <- data_final_expl %>%
mutate(period = ifelse(
test = year %in% 2005:2007,
yes = "Early",
no = ifelse(
test = year %in% 2010:2013,
yes = "Late",
no = "No period")))
# Filter out irrelevant years
solution_step2 <- solution_step1 %>%
filter(period != "No period")
solution_step2
# Summarize according to periods
solution_step3 <- solution_step2 %>%
summarise(
unemp_mean = mean(unemp),
gdp_mean = mean(gdp),
.by = c("country", "period"))
solution_step3
# Reshape the data to get early and late period columns
solution_step4 <- solution_step3 %>%
pivot_longer(
cols = c("unemp_mean", "gdp_mean"),
names_to = "indicator",
values_to = "means") %>%
pivot_wider(
names_from = "period",
values_from = "means")
# Compute the differences
solution_step5 <- solution_step4 %>%
mutate(difference = Late - Early)
solution_step5
# Make it more pretty
solution_step6 <- solution_step5 %>%
select(-c("Early", "Late")) %>%
pivot_wider(
names_from = "indicator",
values_from = "difference") %>%
rename(Unemployment = unemp_mean,
GDP = gdp_mean)
solution_step6
here::i_am("R/Recap-joins.R")
library(here)
library(data.table)
library(dplyr)
join_x <- fread(here("data/join_x.csv")) %>%
as_tibble(.)
join_y <- fread(here("data/join_y.csv")) %>%
as_tibble(.)
# Task 1:
dplyr::left_join(
data_x, data_y,
by=c("time", "id"))
dplyr::right_join(
data_x, data_y,
by=c("time", "id"))
dplyr::full_join(
data_x, data_y,
by=c("time", "id"))
# Task 2:
dplyr::inner_join(
data_x, data_y,
by=c("time", "id"))
# Task 2:
dplyr::full_join(
data_x, data_y,
by=c("time", "id"))
dplyr::full_join(
data_x, data_y,
by=c("id"))
here::i_am("R/Recap-manipulation.R")
library(dplyr)
library(DataScienceExercises)
wine_data_raw <- as_tibble(DataScienceExercises::wine2dine)
wine_data <- wine_data_raw %>%
dplyr::filter(kind == "white") %>%
dplyr::select(-"kind") %>%
dplyr::mutate(
quality = as.double(quality),
alcohol = alcohol / 100,
`residual sugar` = `residual sugar` / 100
)
wine_data_best <- wine_data %>%
dplyr::filter(quality == max(wine_data$quality))
# Recap on piping
here::i_am("R/Recap-pipes.R")
library(DataScienceExercises)
library(tidyr)
pipedata_v1 <- data.table::fread(here("data/recap-pipes.csv"))
pipedata_v2 <- tidyr::pivot_longer(
data = pipedata_v1,
cols = c("lifeExp", "gdpPercap"),
names_to = "Indicator",
values_to = "Value")
pipedata_v3 <- tidyr::pivot_wider(
data = pipedata_v2,
names_from = "year",
values_from = "Value")
# Piped version:
pipe_data_final <- data.table::fread(here("data/recap2.csv")) %>%
tidyr::pivot_longer(
cols = c("lifeExp", "gdpPercap"),
names_to = "Indicator",
values_to = "Value") %>%
tidyr::pivot_wider(
names_from = "year",
values_from = "Value")
here::i_am("R/Recap-reshaping.R")
library(data.table)
library(here)
library(tidyr)
# Task 1:----------
data_raw_long <- fread(file = here("data/wrangling_data_raw_long.csv"))
data_t1 <- pivot_wider(
data = data_raw_long,
names_from = "country",
values_from = "value")
data_t1
# Task 2:----------
gini_join <- fread(file = here("data/wrangling_gini_join.csv"))
data_t2 <- pivot_longer(
data = gini_join,
cols = c("gini"),
names_to = "Indicator",
values_to = "Observation")
data_t2
here::i_am("R/Recap-summarize.R")
library(dplyr)
library(DataScienceExercises)
wine_data_raw <- as_tibble(DataScienceExercises::wine2dine)
# Task 1: Summarise the data by computing the mean alcohol and mean sugar
# of white and red wines
wine_data_summary <- wine_data_raw %>%
summarise(
avg_alc = mean(alcohol),
avg_sugar = mean(`residual sugar`),
avg_quality = mean(quality),
.by = "kind"
)
# Note: you can apply operations to several columns
#. using the function across:
wine_data_raw %>%
summarise(across(
.cols = c("alcohol", "residual sugar", "quality"),
.fns = mean),
.by = "kind"
)
# For more details see:
# https://dplyr.tidyverse.org/reference/across.html
# Task 2: Compute a variable indicating how the quality of each wine deviates
# from the average quality of all wines.
wine_deviation_avg <- wine_data_raw %>%
select("quality", "kind") %>%
mutate(
avg_quality = mean(quality),
quality_deviation = quality - avg_quality)
wine_deviation_avg
here::i_am("R/VideoScript.R")
library(data.table)
library(here)
library(tidyr)
library(dplyr)
# Import example data------------------
# Data can be found in the general course material
data_raw <- fread(file = here("data/wrangling_data_raw.csv"), header = TRUE)
data_raw_long <- fread(file = here("data/wrangling_data_raw_long.csv"))
gini_join <- fread(file = here("data/wrangling_gini_join.csv"))
gdp_join <- fread(file = here("data/wrangling_gdp_join.csv"))
data_final_expl <- fread(file = here("data/wrangling_data_final_expl.csv"))
# Reshaping data ('wrangline')---------
## From wide to long-------------------
data_raw
data_long_t <- tidyr::pivot_longer(
data = data_raw,
cols = c("2017", "2018"),
names_to = "year",
values_to = "observation")
## From long to wide-------------------
data_tidy_t <- tidyr::pivot_wider(
data = data_long_t,
names_from = "indicator",
values_from = "observation")
# Pipes--------------------------------
# Vantage point:
data_raw
# Step 1:
data_long_t <- tidyr::pivot_longer(
data = data_raw,
cols = c("2017", "2018"),
names_to = "year",
values_to = "observation")
# Step 2:
data_tidy_t <- tidyr::pivot_wider(
data = data_long_t,
names_from = "indicator",
values_from = "observation")
# Piped version:
tidy_data_piped <- data_raw %>%
tidyr::pivot_longer(
cols = c("2017", "2018"),
names_to = "year",
values_to = "observation"
) %>%
tidyr::pivot_wider(
names_from = "indicator",
values_from = "observation"
)
# Data manipulation------------------------------
data_raw_long
## Filtering rows--------------------------------
data_filtered <- data_raw_long %>%
dplyr::filter(
country=="Germany",
year >= 2018)
## Selecting columns-----------------------------
cols_to_keep <- c("year", "variable", "value")
data_selected <- data_filtered %>%
dplyr::select(any_of(cols_to_keep))
## Creating and manipulating variables-----------
data_raw_2 <- data_raw %>%
dplyr::mutate(difference_abs = `2018` - `2017`) %>%
dplyr::mutate(difference_abs = difference_abs / 100)
data_raw_2 <- data_raw %>%
dplyr::mutate(
difference_abs = `2018` - `2017`,
difference_abs = difference_abs / 100
)
## Summarizing data------------------------------
data_raw_long_unemp <- data_raw_long %>%
filter(variable=="unemp")
unemp_avg <- data_raw_long_unemp %>%
summarise(unemp_mean = mean(value))
unemp_avg
unemp_avg_v <- data_raw_long_unemp %>%
mutate(
unemp_mean = mean(value),
unemp_dev = value - unemp_mean)
unemp_avg_v
## Grouped operations----------------------------
data_raw_long_unemp %>%
dplyr::group_by(country) %>%
dplyr::summarise(
unemp_avg = mean(value),
unemp_median = median(value),
.groups = "drop")
data_raw_long_unemp %>%
dplyr::group_by(country) %>%
dplyr::mutate(
unemp_avg = mean(value),
unemp_median = median(value)) %>%
dplyr::ungroup()
data_raw_long_unemp %>%
summarise(unemp_avg = mean(value),
.by = "country")
data_raw_long_unemp %>%
mutate(unemp_avg = mean(value),
.by = c("country", "year"))
# Merging data sets------------------------------
gini_join
gdp_join
gini_gdp_leftjoin <- dplyr::left_join(
x = gini_join,
y = gdp_join,
by = c("country"="Country", "year"="Year")
)
gini_gdp_rightjoin <- dplyr::right_join(
x = gini_join,
y = gdp_join,
by = c("country"="Country", "year"="Year")
)
gini_gdp_fulljoin <- dplyr::full_join(
x = gini_join,
y = gdp_join,
by = c("country"="Country", "year"="Year")
)
# inner_join()
# TODO: Add exercise: find out what inner join does as compared to the other three join function we already covered
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment