Created
July 15, 2024 12:27
-
-
Save graebnerc/0b49e7c05049131376eb387a907df00d to your computer and use it in GitHub Desktop.
Einführung in R (Frühjahrssemester 2024): Tag 3 - Recap
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Notes on the questions you asked during day 3 about the content from day 2.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Declare where this script lives relative to the project root, so that the
# here() calls further down resolve paths from that root.
here::i_am("R/Day3-Questions-on-Day2.R")
library(here)
library(magrittr)
library(dplyr)
library(tidyr)
library(data.table)

# These are the notes taken during the third day, when we discussed questions
# on the topics from the second day.
# Using pipes ----

# The data from which to start:
gdp <- DataScienceExercises::aggGDPlifexp
head(gdp)

## Comparing code with and without pipes ----

# Restructure the data WITHOUT pipes. Every step needs its own intermediate
# object (gdp_2, gdp_3), which is then handed to the next call by name.

# Step 1: gather all columns except `continent` and `year` into
# variable/observation pairs (long format).
gdp_2 <- tidyr::pivot_longer(
  data = gdp,
  cols = -c("continent", "year"),
  names_to = "variable",
  values_to = "observation"
)
head(gdp_2)

# Step 2: spread the continents out again, one column per continent.
gdp_3 <- tidyr::pivot_wider(
  data = gdp_2,
  names_from = "continent",
  values_from = "observation"
)
head(gdp_3)
# Restructure the data WITH pipes: the same two steps as above, but the result
# of each step is passed straight into the next call, so no intermediate
# objects are needed.
gdp_final <- gdp %>%
  tidyr::pivot_longer(
    cols = -c("continent", "year"),
    names_to = "variable",
    values_to = "observation"
  ) %>%
  tidyr::pivot_wider(
    names_from = "continent",
    values_from = "observation"
  )
## Using the new pipe: |> ----

# '|>' is largely equivalent to '%>%' and available in base R, but only in more
# recent versions. For more details see:
# https://www.tidyverse.org/blog/2023/04/base-vs-magrittr-pipe/
gdp_final <- gdp |>
  tidyr::pivot_longer(
    cols = -c("continent", "year"),
    names_to = "variable",
    values_to = "observation"
  ) |>
  tidyr::pivot_wider(
    names_from = "continent",
    values_from = "observation"
  )
## When using the . with the pipe makes sense ----

# The pipe passes the result of one step on to the next step, where it is used
# as the first argument by default. If you want to use the result for a
# different argument, you can refer to it with `.` instead. Here is one
# (somewhat artificial) example of when this could be useful, using the data
# from the exercises:
join_x <- fread(here("data/raw/join_x.csv"))
join_y <- fread(here("data/raw/join_y.csv"))

joined_data <- join_y %>%
  filter(id == "b") %>%
  left_join(
    x = join_x,
    y = ., # <- here comes the intermediate output passed by the pipe
    by = c("time", "id")
  )

# Alternative without the pipe: store the filtered data first, then pass it
# explicitly as `y`.
join_y_2 <- filter(join_y, id == "b")
joined_data <- left_join(
  x = join_x,
  y = join_y_2,
  by = c("time", "id")
)
# Selection helpers ----

beer_data <- DataScienceExercises::beer
head(beer_data)

# Select the columns manually, by spelling out every name:
beer_data_price <- beer_data %>%
  select(c("price", "price_liquor", "price_other"))
head(beer_data_price)

# Select the columns using a selection helper, here by their common prefix:
beer_data_price <- beer_data %>%
  select(starts_with("pri"))
head(beer_data_price)

# You can also combine several selection helpers within c():
beer_data_price_2 <- beer_data %>%
  select(c(starts_with("pri"), ends_with("ion")))
head(beer_data_price_2)
beer_data_price_2
# Selection helpers can also be used within many other functions from the
# tidyverse, e.g. to choose the columns that pivot_longer() should gather:
beer_data %>%
  pivot_longer(
    cols = contains("price"),
    names_to = "price_based"
  )
# Using selection helpers within the across() function

# You can compute the means for the three price-based columns manually:
beer_data %>%
  summarise(
    p = mean(price),
    p_l = mean(price_liquor),
    p_o = mean(price_other)
  )

# Or you can use selection helpers within across(), which applies the same
# function to every selected column:
beer_data %>%
  summarise(across(.cols = contains("price"), .fns = mean))

# You can find more information on across() here:
# https://dplyr.tidyverse.org/reference/across.html
# General questions ----

## Different column names when joining ----

# Can I join data sets on columns that have different names in the two data
# sets that should be joined?

# Download example data:
wdi_gdp <- WDI::WDI(
  country = c("DE", "AT", "IT"),
  start = 2000, end = 2020,
  indicator = c(
    "GDP" = "NY.GDP.PCAP.PP.KD",
    "Oil_Rent" = "NY.GDP.PETR.RT.ZS"
  )
)

# Create example data: two tables whose country column is spelled differently
# ("country" vs. "Country").
wdi_gdp_1 <- wdi_gdp %>%
  select(country, year, GDP)

wdi_gdp_2 <- wdi_gdp %>%
  select(country, year, Oil_Rent) %>%
  rename(Country = country)

# In `by`, a named element maps a column of x (the name) to a column of y
# (the value); an unnamed element such as "year" is used for both tables.
wdi_rejoined <- full_join(
  x = wdi_gdp_1, y = wdi_gdp_2,
  by = c("country" = "Country", "year")
)

# Hint: look at the package countrycode, which is very useful when working
# with country codes:
# https://github.com/vincentarelbundock/countrycode
## Selecting many years ----

# How do I best use many years when filtering?

# This works but is quite complex: every year is listed individually and the
# countries are combined with `|`.
wdi_gdp_filtered <- wdi_gdp %>%
  filter(
    year %in% c(2010, 2011, 2012, 2014, 2015, 2016),
    country == "Italy" | country == "Germany"
  )

# This is a more straightforward variant that does the same: year ranges are
# written with `:` and the countries are matched with %in%.
wdi_gdp_filtered <- wdi_gdp %>%
  filter(
    year %in% c(2010:2012, 2014:2016),
    country %in% c("Italy", "Germany")
  )
# Some examples on how the %in% operator works:
let <- letters[1:4]
let
"A" %in% let # Is "A" in the vector let?
"d" %in% let # Is "d" in the vector let?
# Related are the set operators:
set_1 <- 1:4
set_2 <- 3:6
union(set_1, set_2) # All elements that are in at least one of the two sets
intersect(set_1, set_2) # Elements that are in both sets
setdiff(set_1, set_2) # Elements that are in set_1, but not in set_2
setdiff(set_2, set_1) # Order matters: elements that are in set_2, but not in set_1
setequal(set_1, set_2) # Do the two sets contain the same elements?
setequal(set_1, 1:4) # Only the elements are compared, not whether it is the same object!
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment