Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save graebnerc/0b49e7c05049131376eb387a907df00d to your computer and use it in GitHub Desktop.
Save graebnerc/0b49e7c05049131376eb387a907df00d to your computer and use it in GitHub Desktop.
Einführung in R (Frühjahrssemester 2024): Tag 3 - Recap
Notes on the questions you asked during day 3 about the content of day 2.
here::i_am("R/Day3-Questions-on-Day2.R")
library(here)
library(magrittr)
library(dplyr)
library(tidyr)
library(data.table)
# These are the notes taken during the third day, when we discussed questions
# on the topics from the second day.
# Using pipes ----

# Starting point: the example data set `aggGDPlifexp` that ships with the
# DataScienceExercises package.
gdp <- DataScienceExercises::aggGDPlifexp
head(gdp)

## Comparing code with and without pipes ----

# WITHOUT pipes, every intermediate result has to be stored under its own name
# and handed to the next function explicitly.

# Step 1: stack every column except the two identifier columns into
# name/value pairs.
gdp_2 <- tidyr::pivot_longer(
  gdp,
  cols = !c("continent", "year"),
  names_to = "variable",
  values_to = "observation"
)
head(gdp_2)

# Step 2: spread the continents out again, one column per continent.
gdp_3 <- tidyr::pivot_wider(
  gdp_2,
  names_from = "continent",
  values_from = "observation"
)
head(gdp_3)
# The same restructuring WITH the magrittr pipe: each step receives the result
# of the previous one as its first argument, so no intermediate objects are
# needed.
gdp_final <- gdp %>%
  tidyr::pivot_longer(
    cols = !c("continent", "year"),
    names_to = "variable",
    values_to = "observation"
  ) %>%
  tidyr::pivot_wider(names_from = "continent", values_from = "observation")
## Using the new pipe: |> ----

# '|>' is largely equivalent to '%>%'. It is part of base R itself, but only
# in more recent versions (R >= 4.1). For the differences between the two see:
# https://www.tidyverse.org/blog/2023/04/base-vs-magrittr-pipe/
gdp_final <- gdp |>
  tidyr::pivot_longer(
    cols = !c("continent", "year"),
    names_to = "variable",
    values_to = "observation"
  ) |>
  tidyr::pivot_wider(names_from = "continent", values_from = "observation")
## When using the . with the pipe makes sense ----

# The pipe normally inserts the result of the previous step as the FIRST
# argument of the next call. If the result should go into a different
# argument, write . at that position and %>% will put it there instead.
# (The native pipe |> uses _ for this purpose: R >= 4.2, named arguments only.)
# Below is a (somewhat artificial) example using the data from the exercises.
join_x <- fread(here("data/raw/join_x.csv"))
join_y <- fread(here("data/raw/join_y.csv"))

joined_data <- join_y %>%
  filter(id == "b") %>%
  left_join(
    x = join_x,
    y = ., # <- the filtered join_y arrives here, not as the first argument
    by = c("time", "id")
  )

# The same result without the pipe requires a named intermediate object:
join_y_2 <- filter(join_y, id == "b")
joined_data <- left_join(x = join_x, y = join_y_2, by = c("time", "id"))
# Selection helpers ----

beer_data <- DataScienceExercises::beer
head(beer_data)

# Variant 1: spell out every column name by hand.
beer_data_price <- beer_data %>%
  select("price", "price_liquor", "price_other")
head(beer_data_price)

# Variant 2: let a selection helper pick all columns whose name starts
# with "pri".
beer_data_price <- beer_data %>%
  select(starts_with("pri"))
head(beer_data_price)

# Several selection helpers can be combined within c():
beer_data_price_2 <- beer_data %>%
  select(
    c(
      starts_with("pri"),
      ends_with("ion")
    )
  )
head(beer_data_price_2)
beer_data_price_2
# Selection helpers are not limited to select(); many other tidyverse
# functions accept them as well. Here all columns whose name contains "price"
# are stacked into long format:
beer_data %>%
  pivot_longer(
    cols = contains("price"),
    names_to = "price_based",
    values_to = "value" # the default name, spelled out for clarity
  )
# Using selection helpers within the across() function

# The means of the three price-based columns, written out one by one:
beer_data %>%
  summarise(
    p = mean(price),
    p_l = mean(price_liquor),
    p_o = mean(price_other)
  )

# The same computation in one step: across() applies mean() to every column
# picked by the selection helper.
beer_data %>%
  summarise(across(contains("price"), mean))

# More information on across():
# https://dplyr.tidyverse.org/reference/across.html
# General questions ----

## Different column names when joining ----

# Question: can two data sets be joined on key columns that are named
# differently in each of them?

# Download example data (requires an internet connection):
wdi_gdp <- WDI::WDI(
  country = c("DE", "AT", "IT"),
  indicator = c(
    "GDP" = "NY.GDP.PCAP.PP.KD",
    "Oil_Rent" = "NY.GDP.PETR.RT.ZS"
  ),
  start = 2000,
  end = 2020
)

# Build two example tables whose country column is spelled differently:
wdi_gdp_1 <- select(wdi_gdp, country, year, GDP)
wdi_gdp_2 <- wdi_gdp %>%
  select(Country = country, year, Oil_Rent) # select() can rename on the fly

# Answer: yes. In `by`, the name is the key column in x and the value is the
# matching key column in y.
wdi_rejoined <- full_join(
  x = wdi_gdp_1,
  y = wdi_gdp_2,
  by = c("country" = "Country", "year" = "year")
)

# Hint: the countrycode package is very useful when working with country
# codes:
# https://github.com/vincentarelbundock/countrycode
## Selecting many years ----

# Question: what is the best way to filter for many years at once?

# Listing every year and chaining comparisons with | works, but is verbose:
wdi_gdp_filtered <- wdi_gdp %>%
  filter(
    year %in% c(2010, 2011, 2012, 2014, 2015, 2016),
    country == "Italy" | country == "Germany"
  )

# More compact variant with the same result: build the years from ranges and
# use %in% for the countries as well.
wdi_gdp_filtered <- wdi_gdp %>%
  filter(
    year %in% c(2010:2012, 2014:2016),
    country %in% c("Italy", "Germany")
  )
# How %in% works: it checks whether the value on the left occurs in the
# vector on the right.
let <- head(letters, 4)
let
"A" %in% let # FALSE: let only holds lower-case letters
"d" %in% let # TRUE

# Closely related are the set operators:
set_1 <- seq_len(4)
set_2 <- 3:6
union(set_1, set_2)     # 1 2 3 4 5 6: elements in at least one of the sets
intersect(set_1, set_2) # 3 4: elements in both sets
setdiff(set_1, set_2)   # 1 2: elements in set_1 but not in set_2
setdiff(set_2, set_1)   # 5 6: order matters, now set_2 without set_1
setequal(set_1, set_2)  # FALSE: the two sets differ in their elements
setequal(set_1, 1:4)    # TRUE: only the elements count, not the object itself
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment