Skip to content

Instantly share code, notes, and snippets.

@bryangoodrich
Created April 19, 2016 19:00
Show Gist options
  • Save bryangoodrich/0e03efbf7eba66e9e0d0947c441bb681 to your computer and use it in GitHub Desktop.
Save bryangoodrich/0e03efbf7eba66e9e0d0947c441bb681 to your computer and use it in GitHub Desktop.
Adventures in R Training Code
#########################################
# Lesson 1 - R Basics
#
# Learning Objectives
# 1. R Data Types
# 2. Indexing
# 3. Boolean Logic and Filtering
# 4. Importing/Exporting
# 5. The R Environment
#########################################
# The Vector ----------------------------
# Basic ways to build atomic vectors and inspect them. A few lines are meant
# to fail; the error messages are part of the lesson.
c(1, 2, 3) # Combines elements into a vector
seq(1, 3) # Sequence function (from, to)
seq(3) # One-argument version: counts from 1 up to 3
1:3 # Sequence shortcut
1-2:3 # Unexpected consequences: ':' binds tighter than '-', so this is 1 - (2:3)
(1-2):3 # Order of operations: parentheses force the subtraction first (-1, 0, 1, 2, 3)
seq(1-2, 3) # The function form avoids the precedence surprise
sum(1:10000) # Vectorized function: works on the whole vector at once
s # Error "object 's' not found" when s has not been defined yet
sum(s) # Ditto
s <- "sentence" # A string (character vector of length 1)
s
s <- strsplit(s, "")[[1]] # Split into single characters; [[1]] pulls the vector out of the returned list
s
class(s) # Type of object
mode(s) # Storage mode
length(s) # How many elements the vector holds
sum(s) # Error: sum() does not accept character data
##########################################
# Knowledge Check 1
# 1. Make a vector.
# Yes, that's it. Make a vector.
# Any vector. Assign it to a variable.
# Look at its class, mode, and length
##########################################
# Named Vectors and Indexing ---------------------------------------------
# Continues with s, the character vector of single letters built in the
# previous section.
c("Al" = 30, "Beth" = 24, "Charlie" = 40) # Names can be given at construction time
names(s) # No Name Attribute (returns NULL)
NULL # The empty object
NA # The missing-value placeholder
c(1, NA, 3) # NA is Missing: it still occupies a position
c(1, NULL, 3) # NULL is VOID: it is dropped, leaving two elements
1:length(s) # Numeric Integer Sequence (seq_along(s) is the safer form when length may be 0)
1:(length(s) - 2) # Data-driven design: the bound follows the data
LETTERS # Constant - Uppercase Letters
letters # Constant - Lowercase Letters
TRUE # Constant - Boolean True
F # Abbreviated FALSE - an ordinary variable that can be reassigned, unlike FALSE itself
pi # Constant - PI
month.name # Constant - Month Names
month.abb # Constant - Month Abbreviations
month.abb[3] # Numeric Point Indexing
month.abb[1:3] # Range Index
month.abb[c(1, 6, 12)] # Discontinuous Range Index
month.abb[-1] # Negative Index: everything except the first element
names(s) <- month.abb[1:length(s)] # Label each element of s with a month abbreviation
s
names(s)
s["Jan"] # Named Indexing
s[c("Jan", "Not Here")] # A name that does not exist yields NA
s[100] # A position past the end yields NA
########################################################
# Knowledge Check 2
# 1. Create a vector representing your name
# [1] "b" "r" "y" "a" "n"
# 2. Index this vector to return the *last* 3 elements
# [1] "y" "a" "n"
########################################################
# Dimensional Vectors - The Matrix ----------------------------------------
# A matrix is a vector that carries a "dim" attribute. This section reuses the
# name s, replacing the character vector from the earlier sections.
s <- sample(1:100, size = 8) # 8 random integers from 1..100, drawn without replacement
s
dim(s) # NULL: a plain vector has no dim attribute
dim(s) <- c(4, 2) # Adding dim reshapes s into 4 rows x 2 columns (filled by column)
s
class(s)
mode(s)
length(s) # Still 8: the underlying data did not change
nrow(s)
ncol(s)
names(s) <- c("X", "Y") # names() labels individual elements, not columns
s # WHAT HAPPENED?! A per-element "names" attribute was attached (padded with NA)
attributes(s)
names(s)
attr(s, "names")
names(s) <- NULL # Remove the stray names attribute again
s
dimnames(s) # NULL until row/column names are assigned
colnames(s) <- c("X", "Y") # The matrix-aware way to label columns
s
dimnames(s)
rownames(s) <- 1:nrow(s) # Row labels (stored as character strings)
s
dimnames(s)
names(dimnames(s)) # NULL: the dimensions themselves are not named yet
names(dimnames(s)) <- c("A", "B") # Name the row and column dimensions
dimnames(s)
s
######################################################
# Knowledge Check 3 (Homework)
# 1. Create a random 20-element numeric vector
# 2. Coerce it into a 5x4 matrix
# 3. Label the rows Y2001 through Y2005
# 4. Label the columns Q1 through Q4
# 5. Name the dimnames Year and Quarter, respectively
######################################################
# Dimensional Indexing ----------------------------------------------------
# Operates on s, the 4x2 matrix with row names "1".."4" and column names
# "X"/"Y" built in the previous section.
s[1, 1] # Matrix Point index
s[, 1] # Column Index - Returns Vector (lower dim)
s[, 1, drop = FALSE] # Maintain Object Structure (stays a one-column matrix)
s[2:3, ] # Row Ranged Index
s[, "Y"] # Named Column Index
s["4", ] # Named Row Index (a character row name, not position 4)
s[4, ,drop = FALSE] # Row Index with Structure
s[, -2, drop = FALSE] # Negative Index: every column except the second
s[1, 1] <- 999 # Assignment through a matrix index
s
s[length(s)] <- 222 # Assignment through a plain vector index: the last element
s
#######################################
# Knowledge Check 4
# 1. Use negative indexing to print the
# matrix without the first 2 rows.
# 2. Replace the bottom row (using nrow)
# to assign new values to each column
#######################################
# Too-Many-Dimensions Vectors - Arrays -----------------------------------
# The same idea as a matrix, with a third dimension: 5 rows x 2 columns x 3
# slices. The name s is reused again.
s = 1:30
class(s)
mode(s)
dim(s) <- c(5, 2, 3) # Reshape the 30 values into a 5x2x3 array
s
class(s)
mode(s)
length(s)
s[20] <- 100 # Vector Point Index Assignment
s
s[5, 2, 2] # Array Point Index: the same cell as s[20]
s[5, 2, 1:2] # Mixed Indexing
s[5, 2, 1:2, drop = FALSE] # Keep Structure
nrow(s) # Same as dim(s)[1]
ncol(s) # Same as dim(s)[2]
dim(s)[3] # No more helpers
attr(s, "dim")[3] # For the hardcore programmer
dimnames(s) # NULL until names are assigned
# One named entry per dimension; the "Fields" labels are two random letters.
dimnames(s) <- list(
"Rows" = 1:5,
"Fields" = sample(LETTERS, 2),
"Group" = c("Ones", "Tens", "Twenties"))
s
s[, , "Ones"] # Named Index
s[2:3, 2, c(1, 3)] # Mixed Index
class(s[2:3, 2, c(1, 3)]) # The single-column dimension is dropped from the result
s[2:3, 2, c(1, 3), drop = FALSE] # drop = FALSE keeps all three dimensions
##############################################
# Knowledge Check 5 (Homework)
# 1. Read the ?matrix help documentation
# (Recommend also ?vector and ?array)
# 2. Explore creating matrices from a vector
# setting the byrow parameter both to
# TRUE and then to FALSE.
# 3. Execute x <- rnorm(20)^2 * 100 to
# represent a random time series data set
# of quarterly product earnings over 2001
# through 2005. Use matrix(x, ...) to
# create a 5x4 matrix representing the
# years per row and quarter per column
# 4. Manipulate the dimnames attribute
# appropriately to give context
##############################################
# Lists and Data Frames ------------------------------------
# d is the dimnames list of the array s from the previous section. The second
# half contrasts a plain list (x) with a data frame (y) of the same layout.
# Two lines near the end are meant to fail.
d = dimnames(s)
d
class(d)
mode(d)
length(d)
d[1] # List Point Index (Returns List)
d["Rows"] # Named List Point Index
d$Rows # Named Accessor (Access List Data)
d[[1]] # Index Accessor
d[c(1, 3)] # Returns List
d[[c(1, 3)]] # DON'T DO THIS (unless you know what you're doing): recursive index, same as d[[1]][[3]]
x <- list("A" = 1:6, "B" = rnorm(6), "C" = gl(2, 3))
y <- data.frame(A = 1:6, B = rnorm(6), C = gl(2, 3)) # B is drawn separately, so its values differ from x$B
x
y
x$C
y$C
class(x) # List
class(y) # Data Frame
mode(x) # List
mode(y) # List!!
as.data.frame(x) # DF = "Named List with Equal Length Elements"
print.data.frame # Class Dispatching (method).(class)
attributes(x) # Simple Object
attributes(y) # Complex Object - No Dimensions!
dim(y) # But it has dimensions
dim.data.frame # Special dim function for data frames
.row_names_info # Hidden function
.row_names_info(y, 2L) # data frame row count
nrow(y) # Uses dim function; gets dispatched!
length(y) # Number of columns (list elements), not rows
dim(x) <- dim(y) # Can you? ... No: x has 3 elements, dim(y) asks for 6 x 3
x[1] # List Index
y[1] # Column Index
x[[1]] # List Accessor
y[[1]] # Column Accessor
y[, 1] # Dimensional Index IS Accessor
x[, 1] # Nonsense! A plain list has no dimensions, so this errors
y[1:3, 1] # First three rows of column 1, returned as a vector
y[1:3, c("A", "B")] # Several columns: stays a data frame
y[1:3, 1, drop = FALSE] # One column, kept as a data frame
y[, 1][1:3] # Extract the column first, then index the vector
############################################
# Knowledge Check 6
# Make a Data Frame. Make a List. Go crazy.
# Any Questions?
############################################
# Conditional Indexing and Filtering (Subsetting) -------------------------
# Boolean logic, followed by several equivalent ways to filter a data frame.
# Logical constants are spelled out as TRUE/FALSE: T and F are ordinary
# variables that can be reassigned, so they are not safe to rely on.
x <- ChickWeight
head(ChickWeight)
!TRUE # Negation
TRUE & FALSE # Element-wise AND
any(c(TRUE, TRUE, FALSE)) # TRUE when at least one element is TRUE
all(c(TRUE, TRUE, FALSE)) # TRUE only when every element is TRUE
subset(x, Chick == 1) # Rows for chick 1, all columns
x$Chick == 1 # Logical mask, one value per row
which(x$Chick == 1) # Row positions where the mask is TRUE
x[x$Chick == 1, "weight", drop = FALSE] # Mask as row index; stays a data frame
subset(x, Chick == 1, select = weight) # The same selection through subset()
subset(x, Chick == 1, select = weight, drop = TRUE) # Drops to a plain vector
# Import and Exporting Data -----------------------------------------------
# Writes mtcars to a tab-separated file in the working directory, reads a
# user-chosen file back in, and searches its row labels with grep().
# This section is interactive (file.choose) and clears the workspace.
library(help = "datasets") # List the data sets shipped in the datasets package
ls()
rm(list = ls()) # Removes every object from the workspace - demo only, avoid in real scripts
ls()
data(mtcars) # Bring Package data sets to environment
ls()
class(mtcars)
dimnames(mtcars)
str(mtcars) # Structure of object
mtcars
write.table(mtcars, file = "mtcars.tsv",
sep = "\t", row.names = TRUE)
list.files() # The new file should now be listed
getwd() # ... in this directory
(infile <- file.choose()) # Interactive picker; presumably pick the mtcars.tsv just written. Parentheses print the path
x <- read.delim(infile, header = TRUE)
str(x)
head(x)
tail(x)
# Re-read with automatic row numbers; the lines below rely on the labels then
# being available as the column x$row.names.
x <- read.delim(infile, row.names = NULL)
x
idx <- grep("merc", x$row.names, ignore.case = TRUE) # Positions of matching rows
grepl("Merc", x$row.names) # Logical mask instead of positions
x[idx, ]
(idx <- grep("Merc", x$row.names, value = TRUE)) # value = TRUE returns the matching strings themselves
x$row.names %in% idx # This in That
subset(x, !row.names %in% idx) # Everything BUT those ...
############################################################
# Knowledge Check 7 (Homework)
# 1. Import spreadsheet table using read_excel (readxl)
# 2. Import spreadsheet table using read.delim("clipboard")
# 3. (Advanced) Import/Export using xlsx package
# - Requires some setup. See
# http://www.r-statistics.com/2012/08/how-to-load-the-rjava-package-after-the-error-java_home-cannot-be-determined-from-the-registry/
############################################################
# The R Environment -------------------------------------------------------
# Where objects live, how names are looked up, and how to find help and
# load, unload or install packages.
ls() # The Workspace (Environment)
search() # The R "Path" (How Expressions are resolved)
help(package = "utils") # Package documentation
help("read.table") # Function documentation
?plot # Generic Function
??plot # Search documentation
library(splines) # Load Another Package
search() # Changed Search Path
detach(package:splines) # Why would they do this to us?!
search() # splines is gone from the search path again
# Consider install.packages("pacman")
# pacman::p_load(MASS, splines, dplyr)
# pacman::p_unload(MASS, splines, dplyr)
# Open another R Session
install.packages(c("dplyr", "ggplot2", "reshape2")) # Packages needed by the later lessons; requires network access
# RECAP -------------------------------------------------------------------
# Objectives
# 1. R Data Types
# 2. Indexing
# 3. Boolean Logic and Filtering
# 4. Importing/Exporting
# 5. The R Environment
#
# Functions Used
#
# Constructors: c, list, data.frame
# Coercion: as.data.frame
# sequences: seq, :
# vectorized: sum
# Assignment: <-, =
# Object: class, mode, length, str, attributes, attr
# Dimensions: dim, nrow, ncol
# Names: names, dimnames, rownames, colnames
# Random: sample
# Logical: any, all, &, |, %in%
# Filtering: subset, which,
# Package: library, install.packages
# Environment: ls, rm, data, getwd, search, detach
# File: file.choose, read.delim, write.table, list.files
# Summary: head, tail
# Patterns: grep, grepl
# Help: help, ?, ??
###########################
# Lesson 3 - Data Viz
#
# 1. R (Studio) Projects
# 2. Base Graphics
# 3. Grammar of Graphics (ggplot)
###########################
# RStudio Projects --------------------------------------------------------
# Demo Only
# Base Graphics -----------------------------------------------------------
# A must read: http://www.statmethods.net/advgraphs/parameters.html
# High-level calls (hist, boxplot, plot) start a new plot; low-level calls
# (title, abline, points, lines) draw on top of the current one.
x <- airquality
hist(x$Ozone, main = "Ozone Distribution", xlab = "Ozone")
boxplot(x$Solar.R, ylab = "Solar Radiation", sub = "Subtitle")
summary(x$Solar.R) # Numeric summary of the same variable
boxplot(x[1:4]) # One box per column for the first four columns
title("Air Quality Boxplot") # Add a title to the existing plot
plot(rnorm(100), pch = "+", col = "steelblue") # 100 random normal values against their index
abline(h = 0, col = 'indianred') # Horizontal reference line at 0
points( # Overlay 100 jittered random points
jitter(sample(20:60, 100, TRUE)),
jitter(sample(-2:2, 100, TRUE)),
pch = 20)
plot(Ozone ~ jitter(Temp), x, col = "gray40") # Scatter plot via formula; jitter spreads tied temperatures
lm1 <- lm(Ozone ~ Temp, x) # Linear Regression Model
lo1 <- loess(Ozone ~ Temp, x, span = 0.5) # Loess Model
abline(lm1, col = 'steelblue', lwd=2, lty=2) # Draw the fitted regression line
s <- do.call(seq, as.list(range(x$Temp))) # Same as seq(min(Temp), max(Temp)): a grid to predict on
print(s)
predict(lm1, data.frame(Temp = s)) # lm predictions need a data frame with the predictor's name
predict(lo1, s) # The loess model is given the bare vector here
lines(s, predict(lo1, s), col = 'indianred', lwd=2) # Add the loess curve to the scatter plot
hist(x$Ozone, freq = FALSE, main = "Ozone Density", xlab = "") # freq = FALSE: density scale instead of counts
lines(density(na.omit(x$Ozone)), col = 'indianred', lwd=2) # Kernel density overlay; NAs removed first
# Base Graphics Hard ------------------------------------------------------
# Time-series plots of the `economics` data with customised axes.
#
# `economics` ships with ggplot2, which this script does not attach until the
# Grammar of Graphics section. It is therefore taken straight from the
# package namespace so that this section runs when executed in order.
x <- ggplot2::economics

# Two stand-alone line plots; las = 1 keeps the axis labels horizontal.
plot(unemploy ~ date, x, type = "l", las = 1,
     xlab = "Time", ylab = "Count", main = "Unemployment")
plot(psavert ~ date, x, type = "l", las = 1,
     xlab = "Time", ylab = "Rate (%)", main = "Personal Savings")

# The same two plots side by side. par() returns the previous settings; they
# are saved so the layout and margins can be restored afterwards instead of
# leaking into later plots.
old_par <- par(mfrow = c(1, 2), mar = c(5, 5, 4, 1) + 0.1)

# Left panel: suppress the default y axis (yaxt = "n") and x label, then draw
# a y axis with comma-formatted tick labels. mgp pushes the y label outwards
# to make room for the wider labels.
plot(unemploy ~ date, x, type = "l", las = 1,
     ylab = "Count", main = "Unemployment",
     yaxt = "n", xlab = "", mgp = c(4, 1, 0))
axis(side = 2, at = axTicks(2), las = 1,
     labels = format(axTicks(2), big.mark = ","))
mtext("Time", side = 1, line = 3) # Put the x label back by hand

# Right panel: default axes.
plot(psavert ~ date, x, type = "l", las = 1,
     xlab = "Time", ylab = "Rate (%)", main = "Personal Savings")

par(old_par) # Back to the previous layout and margins
# Grammar of Graphics -----------------------------------------------------
# Rebuilds the time-series plots with ggplot2. x is still the economics data
# assigned in the Base Graphics Hard section, until melt() reshapes it below.
library(ggplot2)
library(reshape2)
ggplot(x) + aes(date, unemploy) + geom_line() + theme_bw()
p <- ggplot(x) + aes(date) + theme_bw() # Reusable base: data, x aesthetic and theme
p + geom_line(aes(y=unemploy)) + ylab("Count") # Supply y per layer
p + geom_line(aes(y=psavert)) + ylab("Rate (%)")
x <- melt(x, id.vars = "date") # Wide to long: one row per date/variable pair
head(x)
head(dcast(x, date ~ variable, value.var = "value")) # Long back to wide
# One panel per variable, all sharing the same y scale.
ggplot(x) + aes(date, value) + geom_line() +
facet_wrap(~ variable) + theme_bw()
# Only two of the variables, each panel with its own y scale.
ggplot(subset(x, variable %in% c("psavert", "unemploy"))) +
aes(date, value) + geom_line() + theme_bw() +
facet_wrap(~ variable, scales = "free_y")
# See Also
# dplyr, tidyr
# RECAP
#
# Objectives
# 1. R (Studio) Projects
# 2. Base Graphics
# 3. Grammar of Graphics (ggplot)
#
# Functions used
#
# plot, hist, boxplot
# points, lines, abline
# par, title, mtext, axis, axTicks
# lm, loess, density
# ggplot, aes, geom_line, theme_bw, facet_wrap, ylab
# melt, dcast
#####################################
# Lesson 2 - Data Wrangling
#
# Learning Objectives
# 1. Iterations
# 2. User-Defined Functions (UDFs)
# 3. Data Profiling
# 4. Multiple Assignments
# 5. Missing Values
#####################################
# Iterative Processing ----------------------------------------------------
# Draw ten integers from 1..10 with replacement; the outer parentheses make
# the assignment print its value.
(v <- sample(1:10, 10, TRUE))

# Loop version: fill a pre-allocated numeric vector one position at a time,
# halving the even values of v and squaring the odd ones.
x <- numeric(10)
for (n in seq_along(v)) {
  x[n] <- if (v[n] %% 2 == 0) v[n] / 2 else v[n]^2
}

# The two transformations as small named helpers.
square <- function(x) x^2
halve <- function(x) x / 2

# Vectorized version: ifelse() picks between the two results element by
# element, giving the same values as the loop result printed after it.
ifelse(v %% 2 == 0, halve(v), square(v))
x
# Build a vector of n standard-normal draws the slow way: pre-allocate the
# result, then fill it with one rnorm(1) call per position. Shown next to the
# vectorized call rnorm(10) for comparison.
#
# n: number of draws (default 10).
# Returns a numeric vector of length n.
f <- function(n = 10) {
  draws <- numeric(n)
  for (k in seq_along(draws)) {
    draws[k] <- rnorm(1)
  }
  draws
}
f()
rnorm(10)
#################################################
# Knowledge Check 1
# 1. Define 2 functions that manipulate integers
# 2. Use ifelse logic to apply 1 function to
# each even *position* in the vector x and
# use the other function to each odd position
#################################################
# Data Profiling ----------------------------------------------------------
# Quick looks at the shape, contents and missing values of a data set,
# followed by column-wise transformations with the apply family.
x <- mtcars
y <- CO2
head(x)
str(x)
head(y)
str(y)
summary(x)
summary(y)
table(x$cyl) # Counts per distinct value
table(y$Plant)
table(x$cyl) / nrow(x) # Counts turned into proportions by hand
prop.table(table(x$cyl)) # The same proportions from a helper
# More vectorization
apply(x, 2, summary) # MARGIN = 2: apply summary to each column
apply(x, 1, mean) # MARGIN = 1: apply mean to each row
is.na(x) # Logical matrix, one cell per value
any(is.na(x)) # Is anything missing at all?
lapply(airquality, function(x) any(is.na(x))) # Per-column answer, returned as a list
any_na = function(x) any(is.na(x)) # The same check as a named helper
sapply(airquality, any_na) # sapply simplifies the list to a named vector
x <- scale(airquality) # scale() returns a matrix, so the data frame structure is lost
head(x)
str(x)
x <- airquality
x[] <- scale(airquality) # Assigning into x[] keeps x a data frame and replaces only the values
head(x)
str(x)
x <- mtcars
x[] <- lapply(x, function(x) x - mean(x, na.rm = TRUE)) # Center every column by hand
head(x)
head(scale(mtcars, TRUE, FALSE)) # Center only (no scaling) via scale(), for comparison
################################################
# Knowledge Check 2 (Homework)
# 1. Create a function to count distinct values of a
# vector and apply it to all columns it is relevant to
# 2. Explore tapply and by for doing group-wise *apply
# operations
# 3. (Challenge) Create a summary function and apply it
# to a column or columns by some group. Feel free to
# use what you do in 1 and 2 directly
################################################
# Factors and Relabeling --------------------------------------------------
# For more, see the recode or Recode (car package)
# More user-friendly string functions see stringr package
# Builds a mock survey (20 questions x 10 subjects, responses 1..7), collapses
# the responses into Low/Neutral/High, then relabels values by pattern.
paste("Q", 1:5, sep = ".")
x <- data.frame(
Question = rep(paste0("Q", 1:20), each = 10),
Response = sample(1:7, 20*10, replace = TRUE)
)
head(x)
x$Subject <- rep(1:10, length.out = 20*10) # Subjects 1..10 repeat within each question
head(x)
with(x, table(Subject, Question)) # Check: one response per subject and question
str(x)
x <- transform(x, Response_f = factor(Response)) # Add a factor copy of Response
str(x)
levels(x$Response_f)
# Assigning repeated labels merges levels: 1-3 become Low, 4 Neutral, 5-7 High.
# NOTE(review): this supplies seven labels, so it assumes all seven response
# values occurred in the random sample.
levels(x$Response_f) <- c(rep("Low", 3), "Neutral", rep("High", 3))
levels(x$Response_f)
with(x, table(Response, Response_f)) # Cross-check the old values against the new labels
x$Response <- as.character(x$Response_f) # Overwrite the numeric column with the labels
table(x$Response)
x <- mtcars
x$row.names <- rownames(x) # Copy the row labels into an ordinary column
x[grepl("Merc", x$row.names), 'cyl'] <- 99 # Overwrite cyl for rows whose label contains "Merc"
x[grep("\\d", x$row.names), 'row.names'] <- 'NUMBER' # Relabel rows whose label contains a digit
# Also grep("[0-9]", ...) works, too
View(x) # Opens the interactive data viewer
# Missing Values ----------------------------------------------------------
# Keep an untouched copy of the data (x_full), then blank out 6 random values
# in each column so the imputation error can be measured afterwards.
x <- trees
N <- nrow(x)
x_full <- x
x$Girth[sample(1:N, 6)] <- NA
x$Height[sample(1:N, 6)] <- NA
x$Volume[sample(1:N, 6)] <- NA
x
# Solution 1 - Averaging
means <- lapply(x, mean, na.rm= TRUE) # colMeans(x, na.rm = TRUE) is the alternative
x$Girth[is.na(x$Girth)] # The missing entries, selected with a logical mask
x$Girth[is.na(x$Girth)] <- means$Girth # Replace them with the mean of the observed values
print(x)
cat("Error: ", sum((x$Girth - x_full$Girth)^2)) # Sum of squared differences from the true values
############################################
# Knowledge Check 3 (Challenging)
# 1. Define a function that
# a. Computes the avg of a vector
# b. Indexes the missing values of a vector
# c. Replaces the missing values with the avg
# 2. *Apply* your function to each column of x
# 3. (Extra) Compute the error for each column and overall
############################################
# Solution 2 - Imputation
# Start again from the untouched copy (x_full and N come from the Missing
# Values section), blank out 6 Girth values, and predict them from the other
# two columns with a linear model.
x <- x_full
x$Girth[sample(1:N, 6)] <- NA
fit <- lm(Girth ~ Height + Volume, x) # Fit a linear model on this data
summary(fit)
coef(fit)
# Rows with a missing Girth, without column 1 (Girth itself).
# NOTE(review): this variable shares its name with the base function missing().
missing <- x[is.na(x$Girth), -1]
predict(fit, missing)
x[is.na(x$Girth), 1] <- predict(fit, missing) # Fill the gaps with the model predictions
cat("Error: ", sum((x$Girth - x_full$Girth)^2)) # Sum of squared differences from the true values
##########################################
# Knowledge Check 4 (Homework)
# Create a function that takes in a data frame
# and a formula, imputes the missing values
# of the dependent (LHS) variable using the
# indicated predictors (RHS) variables
##########################################
# RECAP
# Objectives
# Learning Objectives
# 1. Iterations
# 2. User-Defined Functions (UDFs)
# 3. Data Profiling
# 4. Multiple Assignments
# 5. Missing Values
#
# Functions Used
#
# Construction: vector, rep, seq_along
# Mathematical: %%, /, ^
# Control Flow: function, for, if else
# Vectorized: summary, scale, paste, is.na , ifelse
# Tabulation: table, prop.table
# Iterators: apply, lapply, sapply
# Manipulation: transform, as.character, with
# Factors: factor, levels
# Models: lm, summary.lm, coef, predict
# Other: cat, rnorm
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment