Skip to content

Instantly share code, notes, and snippets.

@graebnerc
Created March 25, 2024 21:53
Show Gist options
  • Save graebnerc/daef5dee107ee89ca9cd618fa0dde96d to your computer and use it in GitHub Desktop.
Save graebnerc/daef5dee107ee89ca9cd618fa0dde96d to your computer and use it in GitHub Desktop.
Data Science Using R (Spring 2024) - Session 4
# Session Script on Advanced object types (Spring Semester 2024)
# Digression: a factor is an integer vector carrying extra attributes
ff <- factor(
  x = rep(c("F", "M"), times = c(1, 2)),
  levels = c("F", "M", "D")
)
# The attributes hold the levels and the class 'factor'
attributes(ff)
# The underlying storage type is still integer...
typeof(ff)
# ...while the class is reported as 'factor'
class(ff)
# Factors--------------------
## Intro factors-------------
# Twelve observations in three categories; the levels are given explicitly
f_1 <- factor(
  x = rep(c("F", "D", "M"), times = c(4, 5, 3)),
  levels = c("D", "F", "M")
)
f_1
## Mini exercise 1-----------
# What happens if we do not specify levels explicitly?
f_ex1 <- factor(rep(c("F", "M", "D"), times = c(2, 3, 3)))
f_ex1 # Levels default to the sorted unique values of the input
levels(f_ex1) # Alternative to return levels as character
# What happens if the vector contains elements not pre-specified as levels?
f_ex2 <- factor(
  x = rep(c("F", "M", "D"), times = c(2, 3, 3)),
  levels = c("F", "M")
)
f_ex2 # Elements that are not among the levels become NA
## Ordered factors-----------
# ordered() is shorthand for factor(..., ordered = TRUE)
f_2 <- ordered(
  x = c("high", "high", "low"),
  levels = c("low", "mid", "high")
)
f_2
## Using table()-------------
# table() returns a frequency table for a factor: one count per level.
table(f_1)
table(f_ex1)
# f_ex2 contains NA entries (the "D" values are not among its levels);
# table() does not count NAs by default.
table(f_ex2)
# Sometimes there is a category for which no elements are available (or they
# were removed). In this case, it's good practice to set the levels explicitly
# so that these categories do not vanish:
f_3a <- factor(rep(c("M", "D"), times = c(2, 3)))
table(f_3a) # No "F" in the vector, so that category is not shown
f_3b <- factor(
  x = rep(c("M", "D"), times = c(2, 3)),
  levels = c("D", "F", "M")
)
table(f_3b) # "F" is now listed explicitly, with a count of zero
# Data frames and tibbles----
## Data frames---------------
# Build a data frame directly from its columns
df_1 <- data.frame(
  gender = rep(c("male", "female"), times = c(3, 2)),
  height = c(189, 175, 180, 166, 150)
)
# Alternatively, start from a named list of equally long vectors...
df_1b <- list(
  gender = rep(c("male", "female"), times = c(3, 2)),
  height = c(189, 175, 180, 166, 150)
)
df_1b
is.data.frame(df_1b) # FALSE: still a plain list
# ...and convert it
df_1b <- as.data.frame(df_1b)
df_1b
is.data.frame(df_1b) # Now TRUE
# Either way, a data frame is a list under the hood:
typeof(df_1)
## Tibbles-------------------
# Convert the data frame df_1 into a tibble
tb_1 <- tibble::as_tibble(df_1)
tb_1
## Extracting columns--------
# What's the difference between using [ and [[?
tb_1["gender"] # [ keeps the container: returns a one-column tibble
tb_1[["gender"]] # [[ extracts the column itself: returns a vector
## Useful functions----------
dplyr::glimpse(tb_1) # Transposed overview: one line per column
head(tb_1, n = 2) # Only the first two rows
# Quick exercises - slide 15
# Create a factor with the levels "still", "medium" and "sparkling",
# and arbitrary instances of the three levels.
# The levels are set explicitly: otherwise R would sort them alphabetically
# ("medium", "sparkling", "still") instead of using the requested order.
f_ex3 <- factor(
  c(rep("still", 4), rep("medium", 5), rep("sparkling", 3)),
  levels = c("still", "medium", "sparkling")
)
# Get the relative frequencies for "medium" of this factor
abs_freqs <- table(f_ex3)
n_elements <- length(f_ex3)
# Absolute freqs divided by total number (in percent, for all levels):
rel_freqs <- round(abs_freqs / n_elements * 100, 2)
rel_freqs
# Relative frequency (in percent) of "medium" only:
rel_freqs[["medium"]]
# Create a data frame with two columns, one called "nb" containing the
# numbers 1 to 5 as double, the other called "char" containing the
# numbers 6 to 10 as character
# Create columns
# seq(1, 5) returns an integer vector, so it is converted explicitly to
# double, as the exercise requires
nb_ <- as.double(seq(1, 5))
char_ <- as.character(seq(6, 10))
# Create data frame from the prepared columns
df_s_a <- data.frame(
  nb = nb_,
  char = char_
)
# Same data frame, built in a single step
df_s <- data.frame(
  nb = as.double(seq(1, 5)),
  char = as.character(seq(6, 10))
)
# Transform this data frame into a tibble!
tb_s <- tibble::as_tibble(df_s)
tb_s
# Extract the second column of this tibble such that you have a vector
tb_s[["char"]] # [[ extracts the column itself: returns a character vector
tb_s["char"] # Would have given a tibble
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment