Created
March 25, 2024 21:53
-
-
Save graebnerc/daef5dee107ee89ca9cd618fa0dde96d to your computer and use it in GitHub Desktop.
Data Science Using R (Spring 2024) - Session 4
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Session Script on Advanced object types (Spring Semester 2024)

# Digression
# A factor is stored as an integer vector that carries additional attributes.
ff <- factor(c("F", "M", "M"), levels = c("F", "M", "D"))
attributes(ff) # See the class attribute 'factor'
typeof(ff) # It still remains an integer type...
class(ff) # ...but the class was changed
# Factors--------------------
## Intro factors-------------
f_1 <- factor(
  c(rep("F", 4), rep("D", 5), rep("M", 3)),
  levels = c("D", "F", "M")
)
f_1

## Mini exercise 1-----------
# What happens if we do not specify levels explicitly?
f_ex1 <- factor(c(rep("F", 2), rep("M", 3), rep("D", 3)))
f_ex1 # Levels are set automatically to all elements that occur at least once
levels(f_ex1) # Alternative to return the levels as a character vector

# What happens if the vector contains elements not pre-specified as levels?
f_ex2 <- factor(
  c(rep("F", 2), rep("M", 3), rep("D", 3)),
  levels = c("F", "M")
)
f_ex2 # Elements that are not among the levels (here "D") are set to NA
## Ordered factors-----------
# With ordered = TRUE the order of the levels carries meaning,
# here: low < mid < high
f_2 <- factor(
  c("high", "high", "low"),
  levels = c("low", "mid", "high"),
  ordered = TRUE
)
f_2
## Using table()-------------
# Gives a frequency table for a factor:
table(f_1)
table(f_ex1)
table(f_ex2) # By default, NA entries are not counted
# Sometimes, there is a category for which no elements are available (or they
# were removed). In this case, it's good practice to set the levels explicitly
# such that these categories do not vanish:
f_3a <- factor(c(rep("M", 2), rep("D", 3)))
table(f_3a) # No females in the vector, and the category is not shown
f_3b <- factor(
  c(rep("M", 2), rep("D", 3)),
  levels = c("D", "F", "M")
)
table(f_3b) # Category is now shown explicitly (with a count of 0)
# Data frames and tibbles----
## Data frames---------------
# Create from scratch
df_1 <- data.frame(
  gender = c(rep("male", 3), rep("female", 2)),
  height = c(189, 175, 180, 166, 150)
)

# Create from list
df_1b <- list(
  gender = c(rep("male", 3), rep("female", 2)),
  height = c(189, 175, 180, 166, 150)
)
df_1b
is.data.frame(df_1b) # FALSE: so far this is a plain list
df_1b <- as.data.frame(df_1b)
df_1b
is.data.frame(df_1b) # Now TRUE

# In any case, data frames are fancy lists:
typeof(df_1)
## Tibbles-------------------
tb_1 <- tibble::as_tibble(df_1)
tb_1

## Extracting columns--------
# What's the difference between using [ and [[?
tb_1["gender"] # [ keeps the container: the result is a one-column tibble
tb_1[["gender"]] # [[ extracts the column itself: the result is a vector

## Useful functions----------
dplyr::glimpse(tb_1)
head(tb_1, n = 2)
# Quick exercises - slide 15
# Create a factor with the levels "still", "medium" and "sparkling",
# and arbitrary instances of the three levels
# The levels are set explicitly so that they appear in the requested order
# instead of the alphabetical default.
f_ex3 <- factor(
  c(rep("still", 4), rep("medium", 5), rep("sparkling", 3)),
  levels = c("still", "medium", "sparkling")
)

# Get the relative frequencies for "medium" of this factor
abs_freqs <- table(f_ex3)
n_elements <- length(f_ex3)
# Absolute freqs divided by total number (expressed in percent):
rel_freqs <- round(abs_freqs / n_elements * 100, 2)
rel_freqs
rel_freqs[["medium"]] # Only the value for "medium"
# Create a data frame with two columns, one called "nb" containing the
# numbers 1 to 5 as double, the other called "char" containing the
# numbers 6 to 10 as character
# Create columns
nb_ <- as.double(seq(1, 5)) # seq(1, 5) alone yields integers, not doubles
char_ <- as.character(seq(6, 10))
# Create data frame
df_s_a <- data.frame(
  nb = nb_,
  char = char_
)
# The same in a single step, without intermediate objects:
df_s <- data.frame(
  nb = as.double(seq(1, 5)),
  char = as.character(seq(6, 10))
)
# Transform this data frame into a tibble!
tb_s <- tibble::as_tibble(df_s)
tb_s
# Extract the second column of this tibble such that you have a vector
tb_s[["char"]]
tb_s["char"] # Would have given a one-column tibble instead
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment