Skip to content

Instantly share code, notes, and snippets.

@dantalus
Created June 8, 2017 09:34
Show Gist options
  • Save dantalus/cfbfeb35e4c25b292b41ec401e063937 to your computer and use it in GitHub Desktop.
Save dantalus/cfbfeb35e4c25b292b41ec401e063937 to your computer and use it in GitHub Desktop.
# Install the tidyverse only when it is missing, so re-running this script does
# not download and reinstall the packages every time.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}
library(tidyverse) # Provides %>%, ggplot2, dplyr and tibble used below
# Objects ####
# Most of R, from an applied point of view anyway, is the process of creating
# objects and feeding them into functions to make amazing, new objects.
# amazing_new_object <- f(object)
x <- c(3, 4, 5)
y <- mean(x) # y is a new object made by feeding x into mean()
# This is true in the big picture sense as well
# information
# dataframe <- f(information)
# plot <- f(dataframe)
# model <- f(dataframe)
# table <- f(model)
# report <- f(plot, table)
# But before we get to that point...
# The first objects we typically work with can be described as data structures,
# and these can hold different types of data:
typeof(1) # double
typeof("Yes") # character
typeof(FALSE) # logical
# Missing values are represented by NA
c(1, 2, NA, 4)
# Types of data structures:
# Vectors - a one dimensional set of values that must all be of the same type.
c(1, 2, 3, 3.277465) # All numbers
c("Yes", "No", "1") # All characters
c(FALSE, TRUE) # All logical
# T and F are ordinary variables that can be reassigned; TRUE and FALSE cannot.
c(F, T) # All logical (Warning - spell them out)
# Matrices/Arrays - two or more dimensions
m <- matrix(1:9, 3, 3) # 3 rows and 3 columns, filled column by column
dim(m) # 3 3
m
# Vectors, matrices, and arrays can only contain 1 kind of data. This is
# important to understand.
# What happens when we create a vector with multiple types of data?
v <- c(1, TRUE, "Yes")
typeof(v) # character
v
# You can see that the 1 and TRUE were converted to "1" and "TRUE"
# Lists are a special type of vector that allow us to combine different types of
# data.
v <- list(1, TRUE, "Yes")
typeof(v[[1]]) # double
typeof(v[[2]]) # logical
typeof(v[[3]]) # character
typeof(v) # list
class(v) # list
# typeof() reports the internal storage type; class() reports how R treats the
# object. Compare the two on a few numeric vectors:
class(1) # numeric
typeof(c(1, 2.2)) # double
class(c(1, 2.2)) # numeric
typeof(c(1, 2)) # double
class(c(1, 2)) # numeric
typeof(1:3) # integer - the : operator creates integers
typeof(c(1:3)) # integer
typeof(c(1, 2, 3)) # double - plain number literals are doubles
# Dataframes are in turn special types of lists that correspond to the concept
# of a dataset (a rectangular matrix of values, with observations in rows and
# variables in columns, perhaps with some labels or other metadata attached)
# Build a 50-row example with one numeric, one character and one logical column.
data <- data.frame(number = rnorm(50),
char = sample(letters, 50, replace = TRUE),
logic = sample(c(TRUE, FALSE), 50, replace = TRUE))
View(data)
save(data, v, m, x, y, file = "data.RData") # Save an object or a set of objects
# NOTE: this wipes the whole workspace. It is fine for this demonstration, but
# avoid it in scripts that other people will run.
rm(list=ls()) # Remove all the objects in the environment
load("data.RData") # Bring those objects back
# There are many functions to help us better understand objects
mode(data) # list
typeof(data) # list
class(data) # data.frame
class(data) <- c(class(data), "bob") # We can assign classes
str(data) # Structure
reg <- lm(number ~ char, data = data) # A nonsense regression model
reg
summary(reg)
str(reg)
typeof(reg) # list
class(reg) # lm
View(data)
utils::View(data) # When you want an un-constricted view
names(data) # The column names
attributes(data) # Names, class and row names
dim(data) # Rows, then columns: 50 3
length(data) # 3 - the length of a data frame is its number of columns
nrow(data)
ncol(data)
# The is.*() functions test what kind of object something is
is.character("x")
is.numeric(1)
x <- factor(c(1, 2))
is.factor(x)
is.logical(FALSE)
is.na(c(1, 2, NA, 4)) # One TRUE/FALSE per element
# ! will reverse logical values
!is.na(c(1, 2, NA, 4))
# Subsetting ####
# Part of working with R is being able to take apart objects and rearrange the
# parts.
# Indexing
# One-dimension
vec <- sample(0:9, 100, replace = TRUE)
vec[2] # The second element
# Two dimensions
mat <- matrix(c(1, 2, 3, 3, 2, 1), ncol = 2)
# Index the object `mat`, not the function `matrix` (which cannot be subset).
mat[1, 2] # Row 1, column 2
# Lists
# [[ ]] extracts the contents of one element; [ ] returns a smaller list.
x <- list(a = c(1, 2), b = c(4, 4), c = c(6, 8), d = c(9, 11))
x[[1]] # The contents of the first element: a numeric vector
x[[1]][1] # The first value inside the first element
x[1] # A list of length one that still holds the first element
# These give different results
class(x[[1]]) # numeric
str(x[[1]])
class(x[1]) # list
str(x[1])
attributes(x[1]) # The name "a" is kept on the one-element list
# $ for named elements in a list
x$a
class(x$a)
x$a[1]
# Selecting multiple elements
x <- letters
x[c(1, 2, 6)] # By position
x <- sample(c(0:9), 100, replace = TRUE)
x[x < 5] # By condition: keep the elements where the test is TRUE
x < 5 # The logical vector that was used as the index above
# tibble() replaces data_frame(), which is deprecated in the tibble package.
d <- tibble(
  number = sample(0:9, 100, replace = TRUE),
  character = rep(c("a", "b"), 50)
)
lapply(d, class) # The class of each column, returned as a list
# Two equivalent ways to keep only the numeric columns: build one TRUE/FALSE
# per column and use that logical vector to index the columns.
d[unlist(lapply(d, is.numeric))] %>% head()
d[sapply(d, is.numeric)] %>% head()
# You can name elements in data structures besides lists.
x <- c( 1, 4, 6, 9)
str(x)
x <- c(a = 1, b = 4, c = 6, d = 9)
str(x) # Now reported as a named numeric vector
names(x)
attributes(x)
# Any other attribute can be attached with attr()
attr(x, "description") <- "This is a named vector"
attributes(x)
# But $ only works with lists
# The next line is an intentional error: x is still an atomic vector here.
x$a
x <- list(a = 1, b = 4, c = 6, d = 9)
x$a # Works now that x is a list
# Making and combining objects ####
# Open the help pages for the basic constructors
?c
?matrix
?array
?list
# seq_along() returns the positions 1, 2, ..., length(x), whatever x contains
seq_along(c(1:20))
seq_along(c(100:120)) # 1 to 21, not 100 to 120
seq(from = 0, to = 100, by = 10)
seq(0, 100, 10) # Same call with the arguments matched by position
rep(c(1, 2), times = 100) # 1 2 1 2 ... the whole vector repeated
rep(c(1, 2), each = 100) # 1 1 ... 2 2 ... each value repeated in turn
# Combining different data types can be tricky
# Different data types will typically reduce to the type with the lowest level
# of information
x <- c(1, "character")
x
class(x) # character
x <- c(1, TRUE, "character")
x
class(x) # character
x <- c(1, TRUE, FALSE)
x # TRUE and FALSE become 1 and 0
class(x) # numeric
# No problem with a list
x <- list(1, TRUE, "character")
x
class(x) # list
# Vectors can be combined to make matrices, but be careful.
# R will extend a shorter vector to match a longer one, thus creating data you
# might not expect.
?rbind
length(rbind(sample(0:9, 100, replace = TRUE))) # 100
length(c("a", "b")) # 2
# c("a", "b") is recycled to fill the whole second row, and the numbers in the
# first row are converted to character so the matrix holds a single type.
m <- rbind(sample(0:9, 100, replace = TRUE),
           c("a", "b"))
class(m)
View(m)
# Dataframes will prevent you from doing this
m <- rbind(sample(0:9, 100, replace = TRUE),
           c("a", "b")) %>% as.data.frame() # Not this way
# tibble() replaces the deprecated data_frame(). It only recycles vectors of
# length one, so the mismatched lengths are rejected.
m <- tibble(sample(0:9, 100, replace = TRUE),
            c("a", "b")) # Error, which is correct
# We can also combine by columns
?cbind
m <- cbind(sample(0:9, 100, replace = TRUE),
           sample(letters, 100, replace = TRUE))
class(m)
View(m)
# There are other functions to help switch between information types
as.character(1) # "1"
as.numeric("1") # 1
as.numeric("dog") # NA, with a coercion warning
as.factor(1)
# We can put strings together with paste.
# paste() works element by element and recycles the shorter argument.
paste(letters, "hello", sep = "_")
paste(letters, letters, sep = "_")
paste(letters, c("yes", "no"), sep = "_") # "a_yes" "b_no" "c_yes" ...
paste0(letters, "hello") # paste0() joins with no separator
# We can sample and simulate data
sample(letters, size = 100, replace = TRUE)
sample(letters, size = 10, replace = FALSE)
rnorm(10, mean = 0, sd = 1) %>% qplot()
rnorm(10000, 0, 1) %>% qplot()
# Four columns of random digits. tibble() replaces the deprecated data_frame().
d <- tibble(
  A = sample(0:9, size = 100, replace = TRUE),
  B = sample(0:9, size = 100, replace = TRUE),
  C = sample(0:9, size = 100, replace = TRUE),
  D = sample(0:9, size = 100, replace = TRUE)
)
# Row totals and means computed two ways: by hand and with rowSums()/rowMeans().
# Columns 1 to 4 are still A to D after the new columns are appended.
d$total <- d$A + d$B + d$C + d$D
d$total2 <- rowSums(d[1:4])
d$mean <- d$total / 4
d$mean2 <- rowMeans(d[1:4])
ggplot(d, aes(x = mean, y = mean2)) + geom_point() # Points fall on y = x
# Duplicated and unique values
x <- c(1:10, rep(c(1, 2, 3), each = 2))
x[duplicated(x)] # The repeated occurrences: 1 1 2 2 3 3
# unique() already returns the distinct values. Using them as an index, as in
# x[unique(x)], treats the values as positions and is only right by coincidence.
unique(x)
length(x) - length(unique(x)) # Number of repeated elements: 6
# Factors ####
# Factors are a special kind of numeric variable with labels attached to each
# value, signifying categorical (nominal, ordered) data.
# Simulate 100 answers with unequal probabilities
f <- sample(c("Yes", "No", "Maybe"), size = 100, replace = TRUE,
prob = c(0.3, 0.6, 0.1))
f.1 <- factor(f)
# The "levels" are the labels
levels(f.1)
table(f.1)
# Confirm the structure
str(f.1)
# The underlying numbers:
table(as.numeric(f.1))
# The order of the levels matters. By default, they will be in alphabetical
# order
sample(letters[c(1, 5, 8)], size = 100, replace = TRUE) %>%
factor() %>%
levels()
sample(letters[c(5, 8, 1)], size = 100, replace = TRUE) %>%
factor() %>%
class()
# Pass levels = to choose the order yourself
sample(letters[c(5, 8, 1)], size = 100, replace = TRUE) %>%
factor(levels = c("e", "h", "a"))
# ordered = TRUE additionally marks the levels as ranked (e < h < a)
sample(letters[c(5, 8, 1)], size = 100, replace = TRUE) %>%
factor(levels = c("e", "h", "a"), ordered = TRUE)
# Numeric input: levels are sorted as numbers (1, 2, 10, 20, 100)
sample(c(1, 2, 10, 20, 100), size = 100, replace = TRUE) %>%
factor()
# Character input: levels are sorted as text ("1", "10", "100", "2", "20")
sample(as.character(c(1, 2, 10, 20, 100)), size = 100, replace = TRUE) %>%
factor()
# Reordering levels
table(f.1)
# Assigning to levels() renames the existing levels in place; it does not
# reorder them. The counts stay in the same positions under new, wrong labels.
levels(f.1) <- c("Yes", "No", "Maybe")
table(f.1) # Bad!
levels(f.1) <- rev(levels(f.1)) # Switch it back
table(f.1)
# Do it with factor()
# Rebuilding from the raw values f keeps every observation with its own label.
f.1 <- factor(f, levels = rev(levels(f.1)))
table(f.1) # Correct
f.1 <- relevel(f.1, ref = "Maybe") # Move "Maybe" to the first position
table(f.1)
table(as.numeric(f.1)) # Convert to the underlying number
# Do it manually
f.1 <- factor(f, levels = c("Maybe", "Yes", "No"))
table(f.1) # Correct
# You need to use the existing levels
# Values of f that match none of the given levels become NA.
f.1 <- factor(f, levels = c("A", "B", "C")) # Bad
f.1 <- factor(f)
f.2 <- factor(f, labels = c("A", "B", "C")) # Use the labels option
table(f.1, f.2) # Shows which original level received which label
levels(f.2) # The labels become the levels forevermore
# Numbers as factors
f <- sample(c(10, 20, 50, 60, 65, 90), size = 100, replace = TRUE)
f.1 <- factor(f)
levels(f.1)
str(f.1)
# as.numeric() on a factor returns the integer codes (1, 2, 3, ...), not the
# original numbers. Look the codes up in levels() to recover the values.
f.1 %>% as.numeric() %>% table() # No
as.numeric(levels(f.1)[f.1]) %>% table() # Yes
f.1 <- cut(f, 4) # Equally spaced levels
table(f.1)
str(f.1)
f.1 <- cut(f, 4, labels = c("Low", "Med", "High", "Very High"))
table(f.1)
levels(f.1)
as.character(f.1)
# ~ equally sized levels
# The first break is min(f). cut() intervals are open on the left by default,
# so without include.lowest = TRUE the minimum values silently become NA.
f.1 <- cut(f, breaks = quantile(f, 0:4 / 4), include.lowest = TRUE)
table(f.1)
levels(f.1)
# User defined cuts
bmi <- rnorm(100, mean = 26, sd = 4) # Simulated BMI values
qplot(bmi)
# Underweight / normal weight / overweight / obese. The top break is Inf rather
# than max(bmi) so the breaks stay valid even when no value exceeds 30.
bmi <- cut(bmi, c(0, 18.5, 25, 30, Inf),
           labels = c("UW", "NW", "OW", "OB"))
table(bmi)
# Reordering levels based on other values
# tibble() replaces data_frame(), which is deprecated in the tibble package.
data <- tibble(
  number = rnorm(100, 0, 1),
  factor = factor(sample(letters[1:5], 100, replace = TRUE))
)
levels(data$factor)
# Compute the mean of `number` within each level, then join it back so every
# row carries the mean of its own level.
data <- group_by(data, factor) %>%
  summarise(mean = mean(number)) %>%
  full_join(data, by = "factor")
table(data$factor, data$mean)
ggplot(data, aes(x = factor, fill = mean)) +
  geom_bar()
# Order the levels by their mean; the bars in the plot follow the level order.
data$factor <- reorder(data$factor, data$mean)
levels(data$factor)
ggplot(data, aes(x = factor, fill = mean)) +
  geom_bar()
# Same idea, ordering by the number of rows in each level instead.
data <- group_by(data, factor) %>%
  summarise(count = n()) %>%
  full_join(data, by = "factor")
data$factor <- reorder(data$factor, data$count)
ggplot(data, aes(x = factor, fill = mean)) +
  geom_bar()
# Describing data ####
# Simulate 100 BMI values, summarise them, then see how missing values change
# the results.
bmi <- rnorm(100, 26, 4)
mean(bmi)
min(bmi)
max(bmi)
quantile(bmi, 0.50, na.rm = TRUE) # The median
quantile(bmi, seq(0.05, 0.95, by = 0.05), na.rm = TRUE) # Every 5th percentile
# Replace the values between 30 and 32 with NA to create some missing data
bmi[bmi > 30 & bmi < 32 & !is.na(bmi)] <- NA
!is.na(bmi) # TRUE where a value is present
bmi[is.na(bmi)] %>% length() # How many values are missing
mean(bmi) # NA whenever bmi contains a missing value
mean(bmi, na.rm = TRUE) # Drop the missing values before averaging
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment