# bryangoodrich/LessonOne.R

## LessonOne.R
#########################################
# Lesson 1 - R Basics
#
# Learning Objectives
# 1. R Data Types
# 2. Indexing
# 3. Boolean Logic and Filtering
# 4. Importing/Exporting
# 5. The R Environment
#########################################


# The Vector ----------------------------

# Several ways to build a numeric vector, followed by a character vector
# made by splitting a string. Three lines here are expected to fail when
# run in a fresh session: `s` and `sum(s)` before `s` is assigned, and the
# final `sum(s)` on a character vector.
c(1, 2, 3)       # Combines elements into a Vector
seq(1, 3)        # Sequence function
seq(3)           # 1 parameter version
1:3              # sequence shortcut
1-2:3            # Unexpected consequences: `:` is evaluated before `-`
(1-2):3          # Order of Operations: parentheses force the subtraction first
seq(1-2, 3)      # May be easier to use function
sum(1:10000)     # Vectorized functions

s                # Error: object 's' not found
sum(s)           # Ditto
s <- "sentence"  # A string
s
s <- strsplit(s, "")[[1]]   # Split it apart, one element per character
s

class(s)  # Type of Object
mode(s)   # Type of Storage Mode
length(s) # How big is this Vector
sum(s)    # Errors when incompatible types (class)

##########################################
# Knowledge Check 1
# 1. Make a vector.
#    Yes, that's it. Make a vector.
#    Any vector. Assign it to a variable.
#    Look at its class, mode, and length
##########################################


# Named Vectors and Indexing ---------------------------------------------

# Named vectors, the NA/NULL placeholders, R's built-in constants, and the
# basic ways to index a vector. `s` is the character vector created in the
# previous section.
c("Al" = 30, "Beth" = 24, "Charlie" = 40)  # Elements given as name = value
names(s)                    # No Name Attribute
NULL
NA
c(1, NA, 3)                 # NA is Missing
c(1, NULL, 3)               # NULL is VOID
1:length(s)                 # Numeric Integer Sequence
1:(length(s) - 2)           # Data-driven design
LETTERS                     # Constant - Uppercase Letters
letters                     # Constant - Lowercase Letters
TRUE                        # Constant - Boolean True
F                           # Constant - Abbreviated FALSE
pi                          # Constant - PI
month.name                  # Constant - Month Names
month.abb                   # Constant - Month Abbreviations
month.abb[3]                # Numeric Point Indexing
month.abb[1:3]              # Range Index
month.abb[c(1, 6, 12)]      # Discontinuous Range Index
month.abb[-1]               # Negative Index (everything except position 1)
names(s) <- month.abb[1:length(s)]  # Label each element with a month abbreviation
s
names(s)
s["Jan"]                    # Named Indexing
s[c("Jan", "Not Here")]     # A name with no match gives NA
s[100]                      # A position past the end gives NA

########################################################
# Knowledge Check 2
# 1. Create a vector representing your name
#    [1] "b" "r" "y" "a" "n"
# 2. Index this vector to return the *last* 3 elements
#    [1] "y" "a" "n"
########################################################


# Dimensional Vectors - The Matrix ----------------------------------------

# Turn a plain vector into a matrix by giving it a `dim` attribute, then
# compare the `names` attribute with row/column names (`dimnames`).
s <- sample(1:100, size = 8)  # Eight random integers between 1 and 100
s
dim(s)                    # NULL: a plain vector has no dimensions
dim(s) <- c(4, 2)         # Same eight values, now 4 rows by 2 columns
s
class(s)
mode(s)
length(s)                 # Still the total number of elements
nrow(s)
ncol(s)

# names() sets a per-element attribute on the underlying vector; it does
# not label the matrix columns.
names(s) <- c("X", "Y")
s                         # WHAT HAPPENED?!

attributes(s)             # Shows both `dim` and the stray `names`
names(s)
attr(s, "names")          # Same attribute, accessed by its name
names(s) <- NULL          # Remove it again
s

dimnames(s)               # NULL until row or column names are set
colnames(s) <- c("X", "Y")
s
dimnames(s)
rownames(s) <- 1:nrow(s)
s
dimnames(s)
names(dimnames(s))        # The dimensions themselves can carry names
names(dimnames(s)) <- c("A", "B")
dimnames(s)
s

######################################################
# Knowledge Check 3 (Homework)
# 1. Create a random 20-element numeric vector
# 2. Coerce it into a 5x4 matrix
# 3. Label the rows Y2001 through Y2005
# 4. Label the columns Q1 through Q4
# 5. Name the dimnames Year and Quarter, respectively
######################################################


# Dimensional Indexing ----------------------------------------------------

# Index the 4x2 matrix `s` from the previous section by row and column,
# by name, and by plain vector position.
s[1, 1]   # Matrix Point index
s[, 1]    # Column Index - Returns Vector (lower dim)
s[, 1, drop = FALSE] # Maintain Object Structure
s[2:3, ]  # Row Ranged Index
s[, "Y"]  # Named Column Index
s["4", ]  # Named Row Index
s[4, ,drop = FALSE] # Row Index with Structure
s[, -2, drop = FALSE] # Negative Index
s[1, 1] <- 999        # Assign through a (row, column) index
s
s[length(s)] <- 222   # Assign through a vector position: the last element
s

#######################################
# Knowledge Check 4
# 1. Use negative indexing to print the
#    matrix without the first 2 rows.
# 2. Replace the bottom row (using nrow)
#    to assign new values to each column
#######################################


# Too-Many-Dimensions Vectors - Arrays -----------------------------------

# Give a 30-element vector three dimensions (5 x 2 x 3) and index it by
# position, by coordinates, and by dimension names.
s = 1:30
class(s)
mode(s)
dim(s) <- c(5, 2, 3)  # 5 rows, 2 columns, 3 layers
s
class(s)
mode(s)
length(s)
s[20] <- 100 # Vector Point Index Assignment
s
s[5, 2, 2]   # Array Point Index
s[5, 2, 1:2] # Mixed Indexing
s[5, 2, 1:2, drop = FALSE] # Keep Structure
nrow(s)            # Same as dim(s)[1]
ncol(s)            # Same as dim(s)[2]
dim(s)[3]          # No more helpers
attr(s, "dim")[3]  # For the hardcore programmer

dimnames(s)
# One named component per dimension; the two field labels are random letters.
dimnames(s) <- list(
    "Rows" = 1:5,
    "Fields" = sample(LETTERS, 2),
    "Group" = c("Ones", "Tens", "Twenties"))
s
s[, , "Ones"] # Named Index
s[2:3, 2, c(1, 3)] # Mixed Index
class(s[2:3, 2, c(1, 3)])  # Class of the result once the single column is dropped
s[2:3, 2, c(1, 3), drop = FALSE]  # Keep all three dimensions

##############################################
# Knowledge Check 5 (Homework)
# 1. Read the ?matrix help documentation
#    (Recommend also ?vector and ?array)
# 2. Explore creating matrices from a vector
#    setting the byrow parameter both to
#    TRUE and then to FALSE.
# 3. Execute x <- rnorm(20)^2 * 100 to
#    represent a random time series data set
#    of quarterly product earnings over 2001
#    through 2005. Use matrix(x, ...) to
#    create a 5x4 matrix representing the
#    years per row and quarter per column
# 4. Manipulate the dimnames attribute
#    appropriately to give context
##############################################


# Lists and Data Frames ------------------------------------
# `d` is the dimnames list of the array from the previous section. `x` is a
# plain list and `y` a data frame built from the same three kinds of
# component, so the two can be compared side by side. Two lines below are
# expected to fail: `dim(x) <- dim(y)` and `x[, 1]`.
d = dimnames(s)
d
class(d)
mode(d)
length(d)

d[1]          # List Point Index (Returns List)
d["Rows"]     # Named List Point Index
d$Rows        # Named Accessor (Access List Data)
d[[1]]        # Index Accessor
d[c(1, 3)]    # Returns List
d[[c(1, 3)]]  # DON'T DO THIS (unless you know what you're doing)
              # A vector inside [[ ]] indexes recursively: d[[1]][[3]]

x <- list("A" = 1:6, "B" = rnorm(6), "C" = gl(2, 3))
y <- data.frame(A = 1:6, B = rnorm(6), C = gl(2, 3))
x
y
x$C
y$C
class(x)  # List
class(y)  # Data Frame
mode(x)   # List
mode(y)   # List!!

as.data.frame(x)  # DF = "Named List with Equal Length Elements"
print.data.frame  # Class Dispatching (method).(class)
attributes(x) # Simple Object
attributes(y) # Complex Object - No Dimensions!
dim(y)  # But it has dimensions
dim.data.frame  # Special dim function for data frames
.row_names_info # Hidden function
.row_names_info(y, 2L)  # data frame row count
nrow(y)  # Uses dim function; gets dispatched!
length(y)  # Number of columns (list elements), not rows
dim(x) <- dim(y)  # Can you? ... (no: x has 3 elements, dim(y) asks for 6 x 3)

x[1]    # List Index
y[1]    # Column Index
x[[1]]  # List Accessor
y[[1]]  # Column Accessor

y[, 1]  # Dimensional Index IS Accessor
x[, 1]  # Nonsense!

y[1:3, 1]                # First three rows of column 1, as a vector
y[1:3, c("A", "B")]      # First three rows of two named columns
y[1:3, 1, drop = FALSE]  # Same rows of column 1, kept as a data frame
y[, 1][1:3]              # Extract the column, then index the vector

############################################
# Knowledge Check 6
# Make a Data Frame. Make a List. Go crazy.
# Any Questions?
############################################


# Conditional Indexing and Filtering (Subsetting) -------------------------

# Logical values and operators, then three equivalent-looking ways to pull
# the rows for one chick out of the ChickWeight data set.
x <- ChickWeight
head(ChickWeight)

!TRUE                # Negation
TRUE & F             # Logical AND
any(c(T, T, F))      # TRUE if at least one element is TRUE
all(c(T, T, F))      # TRUE only if every element is TRUE
subset(x, Chick == 1)        # Rows where the condition holds
x$Chick == 1                 # The logical vector behind that filter
which(x$Chick == 1)          # Positions of the TRUE values
x[x$Chick == 1, 'weight', drop = FALSE]              # Filter rows, keep one column
subset(x, Chick == 1, select = weight)               # Same idea with subset()
subset(x, Chick == 1, select = weight, drop = TRUE)  # drop = TRUE gives a vector


# Import and Exporting Data -----------------------------------------------
# Write mtcars to a tab-separated file, read it back, and search the car
# names with grep/grepl. This section is interactive and order-dependent:
# rm(list = ls()) empties the workspace and file.choose() opens a file
# picker (choose the mtcars.tsv written below).
library(help = "datasets")   # List what the datasets package contains
ls()
rm(list = ls())              # Remove every object from the workspace
ls()

data(mtcars)  # Bring Package data sets to environment
ls()
class(mtcars)
dimnames(mtcars)
str(mtcars)   # Structure of object
mtcars
write.table(mtcars, file = "mtcars.tsv",
            sep = "\t", row.names = TRUE)

list.files()                 # Files in the working directory
getwd()                      # The working directory itself
(infile <- file.choose())    # Outer parentheses print the chosen path
x <- read.delim(infile, header = TRUE)
str(x)
head(x)
tail(x)
# row.names = NULL: do not turn a file column into row names. The code
# below reads the car names from the resulting `row.names` column.
x <- read.delim(infile, row.names = NULL)
x

idx <- grep("merc", x$row.names, ignore.case = TRUE)  # Matching positions
grepl("Merc", x$row.names)                            # Logical, one per row
x[idx, ]
(idx <- grep("Merc", x$row.names, value = TRUE))      # Matching strings instead
x$row.names %in% idx  # This in That
subset(x, !row.names %in% idx) # Everything BUT those ...

############################################################
# Knowledge Check 7 (Homework)
# 1. Import spreadsheet table using read_excel (readxl)
# 2. Import spreadsheet table using read.delim("clipboard")
# 3. (Advanced) Import/Export using xlsx package
# - Requires some setup. See
# http://www.r-statistics.com/2012/08/how-to-load-the-rjava-package-after-the-error-java_home-cannot-be-determined-from-the-registry/
############################################################


# The R Environment -------------------------------------------------------
# Look at the workspace and search path, open help pages, and attach and
# detach a package to see the search path change.
ls()                      # The Workspace (Environment)
search()                  # The R "Path" (How Expressions are resolved)
help(package = "utils")   # Package documentation
help("read.table")        # Function documentation
?plot                     # Generic Function
??plot                    # Search documentation

library(splines)          # Load Another Package
search()                  # Changed Search Path
detach(package:splines)   # Why would they do this to us?!
search()
# Consider install.packages("pacman")
# pacman::p_load(MASS, splines, dplyr)
# pacman::p_unload(MASS, splines, dplyr)

# Open another R Session
# Installs the packages used in the later lessons.
install.packages(c("dplyr", "ggplot2", "reshape2"))


# RECAP -------------------------------------------------------------------

# Objectives
# 1. R Data Types
# 2. Indexing
# 3. Boolean Logic and Filtering
# 4. Importing/Exporting
# 5. The R Environment
#
# Functions Used
#
# Constructors: c, list, data.frame
# Coercion: as.data.frame
# sequences: seq, :
# vectorized: sum
# Assignment: <-, =
# Object: class, mode, length, str, attributes, attr
# Dimensions: dim, nrow, ncol
# Names: names, dimnames, rownames, colnames
# Random: sample
# Logical: any, all, &, |, %in%
# Filtering: subset, which,
# Package: library, install.packages
# Environment: ls, rm, data, getwd, search, detach
# File: file.choose, read.delim, write.table, list.files
# Summary: head, tail
# Patterns: grep, grepl
# Help: help, ?, ??

## LessonThr.R
###########################
# Lesson Three - Data Viz
#
# 1. R (Studio) Projects
# 2. Base Graphics
# 3. Grammar of Graphics (ggplot)
###########################


# RStudio Projects --------------------------------------------------------

# Demo Only


# Base Graphics -----------------------------------------------------------
# A must read: http://www.statmethods.net/advgraphs/parameters.html

# Base-graphics tour on the airquality data set: histogram, boxplots, a
# scatter plot with extra points, and a scatter plot with two fitted
# models drawn over it.
x <- airquality
hist(x$Ozone, main = "Ozone Distribution", xlab = "Ozone")

boxplot(x$Solar.R, ylab = "Solar Radiation", sub = "Subtitle")
summary(x$Solar.R)

boxplot(x[1:4])               # One box per column for the first four columns
title("Air Quality Boxplot")  # Added to the plot that is already drawn

plot(rnorm(100), pch = "+", col = "steelblue")
abline(h = 0, col = 'indianred')  # Horizontal reference line at 0
# Overlay 100 jittered random points on the existing plot.
points(
    jitter(sample(20:60, 100, TRUE)),
    jitter(sample(-2:2, 100, TRUE)),
    pch = 20)

plot(Ozone ~ jitter(Temp), x, col = "gray40")
lm1 <- lm(Ozone ~ Temp, x)     # Linear Regression Model
lo1 <- loess(Ozone ~ Temp, x, span = 0.5)  # Loess Model
abline(lm1, col = 'steelblue', lwd=2, lty=2)  # Dashed regression line
# Call seq(min, max) on the observed Temp range to get a grid of
# temperatures to predict at.
s <- do.call(seq, as.list(range(x$Temp)))
print(s)
predict(lm1, data.frame(Temp = s))
predict(lo1, s)
lines(s, predict(lo1, s), col = 'indianred', lwd=2)  # Loess curve

# freq = FALSE puts the histogram on the density scale so the density
# estimate can be drawn over it; NAs are removed before estimating.
hist(x$Ozone, freq = FALSE, main = "Ozone Density", xlab = "")
lines(density(na.omit(x$Ozone)), col = 'indianred', lwd=2)


# Base Graphics Hard ------------------------------------------------------

# `economics` is a data set that ships with ggplot2, and ggplot2 is not
# attached until the next section, so refer to it through the namespace.
x <- ggplot2::economics

# One full-size plot per series.
plot(unemploy ~ date, x, type = "l", las = 1,
     xlab = "Time", ylab = "Count", main = "Unemployment")
plot(psavert ~ date, x, type = "l", las = 1,
     xlab = "Time", ylab = "Rate (%)", main = "Personal Savings")


# Same two plots side by side, with a wider left margin for the y labels.
# par() returns the previous settings, kept so they can be restored below.
op <- par(mfrow = c(1, 2), mar = c(5, 5, 4, 1) + 0.1)

# Suppress the default y axis (yaxt = "n") and x label so both can be drawn
# by hand: thousands separators on the ticks, "Time" placed with mtext().
plot(unemploy ~ date, x, type = "l", las = 1,
     ylab = "Count", main = "Unemployment",
     yaxt = "n", xlab = "", mgp = c(4, 1, 0))
axis(side = 2, at = axTicks(2), las = 1,
     labels = format(axTicks(2), big.mark = ","))
mtext("Time", side = 1, line = 3)

plot(psavert ~ date, x, type = "l", las = 1,
     xlab = "Time", ylab = "Rate (%)", main = "Personal Savings")

par(op)  # Back to the previous layout and margins


# Grammar of Graphics -----------------------------------------------------

# ggplot2 versions of the line plots, then a reshape to long format so all
# series can be drawn as facets. `x` is the economics data from the
# previous section.
library(ggplot2)
library(reshape2)

ggplot(x) + aes(date, unemploy) + geom_line() + theme_bw()
p <- ggplot(x) + aes(date) + theme_bw()  # Shared base; y is chosen per layer
p + geom_line(aes(y=unemploy)) + ylab("Count")
p + geom_line(aes(y=psavert)) + ylab("Rate (%)")

# Wide to long, keeping `date` as the identifier column.
x <- melt(x, id.vars = "date")
head(x)

# And back to wide again, one column per `variable`.
head(dcast(x, date ~ variable, value.var = "value"))

ggplot(x) + aes(date, value) + geom_line() +
    facet_wrap(~ variable) + theme_bw()

# Only two of the series, each panel with its own y scale.
ggplot(subset(x, variable %in% c("psavert", "unemploy"))) +
    aes(date, value) + geom_line() + theme_bw() +
    facet_wrap(~ variable, scales = "free_y")


# See Also
# dplyr, tidyr


# RECAP
#
# Objectives
# 1. R (Studio) Projects
# 2. Base Graphics
# 3. Grammar of Graphics (ggplot)
#
# Functions used
#
# plot, hist, boxplot
# points, lines, abline
# par, title, mtext, axis, axTicks
# lm, loess, density
# ggplot, aes, geom_line, theme_bw, facet_wrap, ylab
# melt, dcast

## LessonTwo.R
#####################################
# Lesson 2 - Data Wrangling
#
# Learning Objectives
# 1. Iterations
# 2. User-Defined Functions (UDFs)
# 3. Data Profiling
# 4. Multiple Assignments
# 5. Missing Values
#####################################


# Iterative Processing ----------------------------------------------------

# Ten random integers between 1 and 10 (outer parentheses print them).
(v <- sample(1:10, 10, TRUE))
x <- numeric(10)  # Result vector, allocated before the loop
for (n in seq_along(v)) {
    # Even values are halved, odd values are squared.
    x[n] <- if (v[n] %% 2 == 0) v[n] / 2 else v[n]^2
}

# Square every element of x.
square <- function(x) {
    x^2
}

# Divide every element of x by two.
halve <- function(x) {
    x / 2
}

ifelse(v %% 2 == 0, halve(v), square(v))  # Vectorized
x

# Fill a length-n numeric vector with one rnorm(1) draw per element.
#   n: number of values to draw (default 10).
# Returns the numeric vector of draws.
f <- function(n = 10) {
    draws <- numeric(n)
    for (i in seq_along(draws)) {
        draws[i] <- rnorm(1)
    }
    draws
}

f()
rnorm(10)

#################################################
# Knowledge Check 1
# 1. Define 2 functions that manipulate integers
# 2. Use ifelse logic to apply 1 function to
#    each even *position* in the vector x and
#    use the other function to each odd position
#################################################


# Data Profiling ----------------------------------------------------------

# First look at two built-in data sets: structure, summaries, frequency
# tables, and apply() over rows and columns.
x <- mtcars
y <- CO2

head(x)
str(x)

head(y)
str(y)

summary(x)
summary(y)

table(x$cyl)     # Count of rows per cylinder value
table(y$Plant)

table(x$cyl) / nrow(x)    # Counts turned into proportions by hand
prop.table(table(x$cyl))  # Same proportions from the helper

# More vectorization
apply(x, 2, summary)  # MARGIN = 2: one summary per column
apply(x, 1, mean)     # MARGIN = 1: one mean per row
is.na(x)              # Logical value for every cell
any(is.na(x))         # Is anything missing at all?

# Check each airquality column for missing values, first with an inline
# anonymous function, then with the same logic as a named helper.
lapply(airquality, function(column) any(is.na(column)))

# TRUE if x contains at least one NA.
any_na <- function(x) {
    any(is.na(x))
}
sapply(airquality, any_na)

# scale() returns a matrix. Assigning into `x[]` replaces the values while
# keeping `x` a data frame.
x <- scale(airquality)
head(x)
str(x)

x <- airquality
x[] <- scale(airquality)
head(x)
str(x)

# Centre every column by hand, then compare with scale(center = TRUE,
# scale = FALSE).
x <- mtcars
x[] <- lapply(x, function(x) x - mean(x, na.rm = TRUE))
head(x)
head(scale(mtcars, TRUE, FALSE))

################################################
# Knowledge Check 2 (Homework)
# 1. Create a function to count distinct values of a
#    vector and apply it to all columns it's relevant to
# 2. Explore tapply and by for doing group-wise *apply
#    operations
# 3. (Challenge) Create a summary function and apply it
#    to a column or columns by some group. Feel free to
#    use what you do in 1 and 2 directly
################################################


# Factors and Relabeling --------------------------------------------------
# For more, see the recode or Recode (car package)
# More user-friendly string functions see stringr package

# Build a fake survey (20 questions, 10 responses each, answers 1-7),
# recode the answers through factor levels, then edit mtcars values
# selected by pattern matching on the car names.
paste("Q", 1:5, sep = ".")   # Labels joined with "." between the parts
x <- data.frame(
    Question = rep(paste0("Q", 1:20), each = 10),
    Response = sample(1:7, 20*10, replace = TRUE)
)
head(x)

x$Subject <- rep(1:10, length.out = 20*10)  # Subject ids 1..10, recycled
head(x)
with(x, table(Subject, Question))
str(x)

x <- transform(x, Response_f = factor(Response))
str(x)
levels(x$Response_f)
# Assigning repeated labels merges levels: the first three become "Low",
# the fourth "Neutral", the last three "High". Assumes all seven answers
# occurred in the sample, so that there are seven levels to relabel.
levels(x$Response_f) <- c(rep("Low", 3), "Neutral", rep("High", 3))
levels(x$Response_f)
with(x, table(Response, Response_f))  # Check the old-to-new mapping
x$Response <- as.character(x$Response_f)  # Overwrite the numbers with the labels
table(x$Response)

x <- mtcars
x$row.names <- rownames(x)  # Copy the car names into an ordinary column
x[grepl("Merc", x$row.names), 'cyl'] <- 99  # Rows whose name contains "Merc"
x[grep("\\d", x$row.names), 'row.names'] <- 'NUMBER'  # Names containing a digit
# Also grep("[0-9]", ...) works, too
View(x)  # Opens the data viewer (interactive sessions)


# Missing Values ----------------------------------------------------------

# Blank out six random values in each column of `trees`, keeping an
# untouched copy so the quality of a fill-in method can be measured.
x <- trees
N <- nrow(x)
x_full <- x      # Reference copy without any NA
x$Girth[sample(1:N, 6)] <- NA
x$Height[sample(1:N, 6)] <- NA
x$Volume[sample(1:N, 6)] <- NA
x

# Solution 1 - Averaging
means <- lapply(x, mean, na.rm= TRUE) # colMeans
x$Girth[is.na(x$Girth)]   # The entries about to be replaced
x$Girth[is.na(x$Girth)] <- means$Girth
print(x)
# Sum of squared differences between the filled-in and the true Girth.
cat("Error: ", sum((x$Girth - x_full$Girth)^2))

############################################
# Knowledge Check 3 (Challenging)
# 1. Define a function that
#    a. Computes the avg of a vector
#    b. Indexes the missing values of a vector
#    c. Replaces the missing values with the avg
# 2. *Apply* your function to each column of x
# 3. (Extra) Compute the error for each column and overall
############################################


# Solution 2 - Imputation
# Blank out six random Girth values, fit Girth on the other two columns,
# and fill the gaps with the model's predictions. Relies on `x_full` and
# `N` from the start of this section.
x <- x_full
x$Girth[sample(seq_len(N), 6)] <- NA
fit <- lm(Girth ~ Height + Volume, x)
summary(fit)
coef(fit)
is_missing <- is.na(x$Girth)
# Predictor values for the rows to fill in. Columns are picked by name
# rather than by position, and the object is not called `missing`, so
# base::missing() is not shadowed.
to_impute <- x[is_missing, c("Height", "Volume")]
predict(fit, to_impute)
x$Girth[is_missing] <- predict(fit, to_impute)
# Sum of squared differences between the imputed and the true Girth.
cat("Error: ", sum((x$Girth - x_full$Girth)^2), "\n")

##########################################
# Knowledge Check 4 (Homework)
# Create a function that takes in a data frame
# and a formula, imputes the missing values
# of the dependent (LHS) variable using the
# indicated predictors (RHS) variables
##########################################


# RECAP
# Objectives
# Learning Objectives
# 1. Iterations
# 2. User-Defined Functions (UDFs)
# 3. Data Profiling
# 4. Multiple Assignments
# 5. Missing Values
#
# Functions Used
#
# Construction: vector, rep, seq_along
# Mathematical: %%, /, ^
# Control Flow: function, for, if else
# Vectorized: summary, scale, paste, is.na , ifelse
# Tabulation: table, prop.table
# Iterators: apply, lapply, sapply
# Manipulation: transform, as.character, with
# Factors: factor, levels
# Models: lm, summary.lm, coef, predict
# Other: cat, rnorm
	#########################################
	# Lesson 1 - R Basics
	#
	# Learning Objectives
	# 1. R Data Types
	# 2. Indexing
	# 3. Boolean Logic and Filtering
	# 4. Importing/Exporting
	# 5. The R Environment
	#########################################



	# The Vector ----------------------------

	c(1, 2, 3) # Combines elements into a Vector
	seq(1, 3) # Sequence function
	seq(3) # 1 parameter version
	1:3 # sequence shortcut
	1-2:3 # Unexpected consequences
	(1-2):3 # Order of Operations
	seq(1-2, 3) # May be easier to use function
	sum(1:10000) # Vectorized functions

	s # Error: object 's' not found
	sum(s) # Ditto
	s <- "sentence" # A string
	s
	s <- strsplit(s, "")[[1]] # Split it apart
	s

	class(s) # Type of Object
	mode(s) # Type of Storage Mode
	length(s) # How big is this Vector
	sum(s) # Errors when incompatible types (class)

	##########################################
	# Knowledge Check 1
	# 1. Make a vector.
	# Yes, that's it. Make a vector.
	# Any vector. Assign it to a variable.
	# Look at its class, mode, and length
	##########################################



	# Named Vectors and Indexing ---------------------------------------------

	c("Al" = 30, "Beth" = 24, "Charlie" = 40)
	names(s) # No Name Attribute
	NULL
	NA
	c(1, NA, 3) # NA is Missing
	c(1, NULL, 3) # NULL is VOID
	1:length(s) # Numeric Integer Sequence
	1:(length(s) - 2) # Data-driven design
	LETTERS # Constant - Uppercase Letters
	letters # Constant - Lowercase Letters
	TRUE # Constant - Boolean True
	F # Constant - Abbreviated FALSE
	pi # Constant - PI
	month.name # Constant - Month Names
	month.abb # Constant - Month Abbreviations
	month.abb[3] # Numeric Point Indexing
	month.abb[1:3] # Range Index
	month.abb[c(1, 6, 12)] # Discontinuous Range Index
	month.abb[-1] # Negative Index
	names(s) <- month.abb[1:length(s)]
	s
	names(s)
	s["Jan"] # Named Indexing
	s[c("Jan", "Not Here")]
	s[100]

	########################################################
	# Knowledge Check 2
	# 1. Create a vector representing your name
	# [1] "b" "r" "y" "a" "n"
	# 2. Index this vector to return the last 3 elements
	# [1] "y" "a" "n"
	########################################################



	# Dimensional Vectors - The Matrix ----------------------------------------

	s <- sample(1:100, size = 8)
	s
	dim(s)
	dim(s) <- c(4, 2)
	s
	class(s)
	mode(s)
	length(s)
	nrow(s)
	ncol(s)

	names(s) <- c("X", "Y")
	s # WHAT HAPPENED?!

	attributes(s)
	names(s)
	attr(s, "names")
	names(s) <- NULL
	s

	dimnames(s)
	colnames(s) <- c("X", "Y")
	s
	dimnames(s)
	rownames(s) <- 1:nrow(s)
	s
	dimnames(s)
	names(dimnames(s))
	names(dimnames(s)) <- c("A", "B")
	dimnames(s)
	s

	######################################################
	# Knowledge Check 3 (Homework)
	# 1. Create a random 20-element numeric vector
	# 2. Coerce it into a 5x4 matrix
	# 3. Label the rows Y2001 through Y2005
	# 4. Label the columns Q1 through Q4
	# 5. Name the dimnames Year and Quarter, respectively
	######################################################



	# Dimensional Indexing ----------------------------------------------------

	s[1, 1] # Matrix Point index
	s[, 1] # Column Index - Returns Vector (lower dim)
	s[, 1, drop = FALSE] # Maintain Object Structure
	s[2:3, ] # Row Ranged Index
	s[, "Y"] # Named Column Index
	s["4", ] # Named Row Index
	s[4, ,drop = FALSE] # Row Index with Structure
	s[, -2, drop = FALSE] # Negative Index
	s[1, 1] <- 999
	s
	s[length(s)] <- 222
	s

	#######################################
	# Knowledge Check 4
	# 1. Use negative indexing to print the
	# matrix without the first 2 rows.
	# 2. Replace the bottom row (using nrow)
	# to assign new values to each column
	#######################################



	# Too-Many-Dimensions Vectors - Arrays -----------------------------------

	s = 1:30
	class(s)
	mode(s)
	dim(s) <- c(5, 2, 3)
	s
	class(s)
	mode(s)
	length(s)
	s[20] <- 100 # Vector Point Index Assignment
	s
	s[5, 2, 2] # Array Point Index
	s[5, 2, 1:2] # Mixed Indexing
	s[5, 2, 1:2, drop = FALSE] # Keep Structure
	nrow(s) # Same as dim(s)[1]
	ncol(s) # Same as dim(s)[2]
	dim(s)[3] # No more helpers
	attr(s, "dim")[3] # For the hardcore programmer

	dimnames(s)
	dimnames(s) <- list(
	"Rows" = 1:5,
	"Fields" = sample(LETTERS, 2),
	"Group" = c("Ones", "Tens", "Twenties"))
	s
	s[, , "Ones"] # Named Index
	s[2:3, 2, c(1, 3)] # Mixed Index
	class(s[2:3, 2, c(1, 3)])
	s[2:3, 2, c(1, 3), drop = FALSE]

	##############################################
	# Knowledge Check 5 (Homework)
	# 1. Read the ?matrix help documentation
	# (Recommend also ?vector and ?array)
	# 2. Explore creating matrices from a vector
	# setting the byrow parameter both to
	# TRUE and then to FALSE.
	# 3. Execute x <- rnorm(20)^2 * 100 to
	# represent a random time series data set
	# of quarterly product earnings over 2001
	# through 2005. Use matrix(x, ...) to
	# create a 5x4 matrix representing the
	# years per row and quarter per column
	# 4. Manipulate the dimnames attribute
	# appropriately to give context
	##############################################



	# Lists and Data Frames ------------------------------------
	d = dimnames(s)
	d
	class(d)
	mode(d)
	length(d)

	d[1] # List Point Index (Returns List)
	d["Rows"] # Named List Point Index
	d$Rows # Named Accessor (Access List Data)
	d[[1]] # Index Accessor
	d[c(1, 3)] # Returns List
	d[[c(1, 3)]] # DON'T DO THIS (unless you know what you're doing)

	x <- list("A" = 1:6, "B" = rnorm(6), "C" = gl(2, 3))
	y <- data.frame(A = 1:6, B = rnorm(6), C = gl(2, 3))
	x
	y
	x$C
	y$C
	class(x) # List
	class(y) # Data Frame
	mode(x) # List
	mode(y) # List!!

	as.data.frame(x) # DF = "Named List with Equal Length Elements"
	print.data.frame # Class Dispatching (method).(class)
	attributes(x) # Simple Object
	attributes(y) # Complex Object - No Dimensions!
	dim(y) # But it has dimensions
	dim.data.frame # Special dim function for data frames
	.row_names_info # Hidden function
	.row_names_info(y, 2L) # data frame row count
	nrow(y) # Uses dim function; gets dispatched!
	length(y)
	dim(x) <- dim(y) # Can you? ...

	x[1] # List Index
	y[1] # Column Index
	x[[1]] # List Accessor
	y[[1]] # Column Accessor

	y[, 1] # Dimensional Index IS Accessor
	x[, 1] # Nonsense!

	y[1:3, 1]
	y[1:3, c("A", "B")]
	y[1:3, 1, drop = FALSE]
	y[, 1][1:3]

	############################################
	# Knowledge Check 6
	# Make a Data Frame. Make a List. Go crazy.
	# Any Questions?
	############################################



	# Conditional Indexing and Filtering (Subsetting) -------------------------

	x <- ChickWeight
	head(ChickWeight)

	!TRUE
	TRUE & F
	any(c(T, T, F))
	all(c(T, T, F))
	subset(x, Chick == 1)
	x$Chick == 1
	which(x$Chick == 1)
	x[x$Chick == 1, 'weight', drop = FALSE]
	subset(x, Chick == 1, select = weight)
	subset(x, Chick == 1, select = weight, drop = TRUE)



	# Import and Exporting Data -----------------------------------------------
	library(help = "datasets")
	ls()
	rm(list = ls())
	ls()

	data(mtcars) # Bring Package data sets to environment
	ls()
	class(mtcars)
	dimnames(mtcars)
	str(mtcars) # Structure of object
	mtcars
	write.table(mtcars, file = "mtcars.tsv",
	sep = "\t", row.names = TRUE)

	list.files()
	getwd()
	(infile <- file.choose())
	x <- read.delim(infile, header = TRUE)
	str(x)
	head(x)
	tail(x)
	x <- read.delim(infile, row.names = NULL)
	x

	idx <- grep("merc", x$row.names, ignore.case = TRUE)
	grepl("Merc", x$row.names)
	x[idx, ]
	(idx <- grep("Merc", x$row.names, value = TRUE))
	x$row.names %in% idx # This in That
	subset(x, !row.names %in% idx) # Everything BUT those ...

	############################################################
	# Knowledge Check 7 (Homework)
	# 1. Import spreadsheet table using read_excel (readxl)
	# 2. Import spreadsheet table using read.delim("clipboard")
	# 3. (Advanced) Import/Export using xlsx package
	# - Requires some setup. See
	# http://www.r-statistics.com/2012/08/how-to-load-the-rjava-package-after-the-error-java_home-cannot-be-determined-from-the-registry/
	############################################################



	# The R Environment -------------------------------------------------------
	ls() # The Workspace (Environment)
	search() # The R "Path" (How Expressions are resolved)
	help(package = "utils") # Package documentation
	help("read.table") # Function documentation
	?plot # Generic Function
	??plot # Search documentation

	library(splines) # Load Another Package
	search() # Changed Search Path
	detach(package:splines) # Why would they do this to us?!
	search()
	# Consider install.packages("pacman")
	# pacman::p_load(MASS, splines, dplyr)
	# pacman::p_unload(MASS, splines, dplyr)

	# Open another R Session
	install.packages(c("dplyr", "ggplot2", "reshape2"))



	# RECAP -------------------------------------------------------------------

	# Objectives
	# 1. R Data Types
	# 2. Indexing
	# 3. Boolean Logic and Filtering
	# 4. Importing/Exporting
	# 5. The R Environment
	#
	# Functions Used
	#
	# Constructors: c, list, data.frame
	# Coercion: as.data.frame
	# sequences: seq, :
	# vectorized: sum
	# Assignment: <-, =
	# Object: class, mode, length, str, attributes, attr
	# Dimensions: dim, nrow, ncol
	# Names: names, dimnames, rownames, colnames
	# Random: sample
	# Logical: any, all, &, |, %in%
	# Filtering: subset, which,
	# Package: library, install.packages
	# Environment: ls, rm, data, getwd, search, detach
	# File: file.choose, read.delim, write.table, list.files
	# Summary: head, tail
	# Patterns: grep, grepl
	# Help: help, ?, ??
	###########################
	# Lesson Three - Data Viz
	#
	# 1. R (Studio) Projects
	# 2. Base Graphics
	# 3. Grammar of Graphics (ggplot)
	###########################


	# RStudio Projects --------------------------------------------------------

	# Demo Only



	# Base Graphics -----------------------------------------------------------
	# A must read: http://www.statmethods.net/advgraphs/parameters.html

	x <- airquality
	hist(x$Ozone, main = "Ozone Distribution", xlab = "Ozone")

	boxplot(x$Solar.R, ylab = "Solar Radiation", sub = "Subtitle")
	summary(x$Solar.R)

	boxplot(x[1:4])
	title("Air Quality Boxplot")

	plot(rnorm(100), pch = "+", col = "steelblue")
	abline(h = 0, col = 'indianred')
	points(
	jitter(sample(20:60, 100, TRUE)),
	jitter(sample(-2:2, 100, TRUE)),
	pch = 20)

	plot(Ozone ~ jitter(Temp), x, col = "gray40")
	lm1 <- lm(Ozone ~ Temp, x) # Linear Regression Model
	lo1 <- loess(Ozone ~ Temp, x, span = 0.5) # Loess Model
	abline(lm1, col = 'steelblue', lwd=2, lty=2)
	s <- do.call(seq, as.list(range(x$Temp)))
	print(s)
	predict(lm1, data.frame(Temp = s))
	predict(lo1, s)
	lines(s, predict(lo1, s), col = 'indianred', lwd=2)

	hist(x$Ozone, freq = FALSE, main = "Ozone Density", xlab = "")
	lines(density(na.omit(x$Ozone)), col = 'indianred', lwd=2)



	# Base Graphics Hard ------------------------------------------------------

	# `economics` is a data set that ships with ggplot2, and ggplot2 is not
	# attached until the next section, so refer to it through the namespace.
	x <- ggplot2::economics

	# One full-size plot per series.
	plot(unemploy ~ date, x, type = "l", las = 1,
	     xlab = "Time", ylab = "Count", main = "Unemployment")
	plot(psavert ~ date, x, type = "l", las = 1,
	     xlab = "Time", ylab = "Rate (%)", main = "Personal Savings")


	# Same two plots side by side, with a wider left margin for the y labels.
	# par() returns the previous settings, kept so they can be restored below.
	op <- par(mfrow = c(1, 2), mar = c(5, 5, 4, 1) + 0.1)

	# Suppress the default y axis (yaxt = "n") and x label so both can be
	# drawn by hand: thousands separators on the ticks, "Time" via mtext().
	plot(unemploy ~ date, x, type = "l", las = 1,
	     ylab = "Count", main = "Unemployment",
	     yaxt = "n", xlab = "", mgp = c(4, 1, 0))
	axis(side = 2, at = axTicks(2), las = 1,
	     labels = format(axTicks(2), big.mark = ","))
	mtext("Time", side = 1, line = 3)

	plot(psavert ~ date, x, type = "l", las = 1,
	     xlab = "Time", ylab = "Rate (%)", main = "Personal Savings")

	par(op)  # Back to the previous layout and margins



	# Grammar of Graphics -----------------------------------------------------

	library(ggplot2)
	library(reshape2)

	ggplot(x) + aes(date, unemploy) + geom_line() + theme_bw()
	p <- ggplot(x) + aes(date) + theme_bw()
	p + geom_line(aes(y=unemploy)) + ylab("Count")
	p + geom_line(aes(y=psavert)) + ylab("Rate (%)")

	x <- melt(x, id.vars = "date")
	head(x)

	head(dcast(x, date ~ variable, value.var = "value"))

	ggplot(x) + aes(date, value) + geom_line() +
	facet_wrap(~ variable) + theme_bw()

	ggplot(subset(x, variable %in% c("psavert", "unemploy"))) +
	aes(date, value) + geom_line() + theme_bw() +
	facet_wrap(~ variable, scales = "free_y")


	# See Also
	# dplyr, tidyr


	# RECAP
	#
	# Objectives
	# 1. R (Studio) Projects
	# 2. Base Graphics
	# 3. Grammar of Graphics (ggplot)
	#
	# Functions used
	#
	# plot, hist, boxplot
	# points, lines, abline
	# par, title, mtext, axis, axTicks
	# lm, loess, density
	# ggplot, aes, geom_line, theme_bw, facet_wrap, ylab
	# melt, dcast
	#####################################
	# Lesson 2 - Data Wrangling
	#
	# Learning Objectives
	# 1. Iterations
	# 2. User-Defined Functions (UDFs)
	# 3. Data Profiling
	# 4. Multiple Assignments
	# 5. Missing Values
	#####################################



	# Iterative Processing ----------------------------------------------------

	# Ten random integers between 1 and 10 (outer parentheses print them).
	(v <- sample(1:10, 10, TRUE))
	x <- numeric(10)  # Result vector, allocated before the loop
	for (n in seq_along(v)) {
	    # Even values are halved, odd values are squared.
	    x[n] <- if (v[n] %% 2 == 0) v[n] / 2 else v[n]^2
	}

	# Square every element of x.
	square <- function(x) {
	    x^2
	}

	# Divide every element of x by two.
	halve <- function(x) {
	    x / 2
	}

	ifelse(v %% 2 == 0, halve(v), square(v)) # Vectorized
	x

	# Fill a length-n numeric vector with one rnorm(1) draw per element.
	#   n: number of values to draw (default 10).
	# Returns the numeric vector of draws.
	f <- function(n = 10) {
	    draws <- numeric(n)
	    for (i in seq_along(draws)) {
	        draws[i] <- rnorm(1)
	    }
	    draws
	}

	f()
	rnorm(10)

	#################################################
	# Knowledge Check 1
	# 1. Define 2 functions that manipulate integers
	# 2. Use ifelse logic to apply 1 function to
	# each even position in the vector x and
	# use the other function to each odd position
	#################################################



	# Data Profiling ----------------------------------------------------------

	x <- mtcars
	y <- CO2

	head(x)
	str(x)

	head(y)
	str(y)

	summary(x)
	summary(y)

	table(x$cyl)
	table(y$Plant)

	table(x$cyl) / nrow(x)
	prop.table(table(x$cyl))

	# More vectorization
	apply(x, 2, summary)
	apply(x, 1, mean)
	is.na(x)
	any(is.na(x))

	# Check each airquality column for missing values, first with an inline
	# anonymous function, then with the same logic as a named helper.
	lapply(airquality, function(column) any(is.na(column)))

	# TRUE if x contains at least one NA.
	any_na <- function(x) {
	    any(is.na(x))
	}
	sapply(airquality, any_na)

	x <- scale(airquality)
	head(x)
	str(x)

	x <- airquality
	x[] <- scale(airquality)
	head(x)
	str(x)

	x <- mtcars
	x[] <- lapply(x, function(x) x - mean(x, na.rm = TRUE))
	head(x)
	head(scale(mtcars, TRUE, FALSE))

	################################################
	# Knowledge Check 2 (Homework)
	# 1. Create a function to count distinct values of a
	# vector and apply it to all columns it's relevant to
	# 2. Explore tapply and by for doing group-wise *apply
	# operations
	# 3. (Challenge) Create a summary function and apply it
	# to a column or columns by some group. Feel free to
	# use what you do in 1 and 2 directly
	################################################




	# Factors and Relabeling --------------------------------------------------
	# For more, see the recode or Recode (car package)
	# More user-friendly string functions see stringr package

	paste("Q", 1:5, sep = ".")
	x <- data.frame(
	Question = rep(paste0("Q", 1:20), each = 10),
	Response = sample(1:7, 20*10, replace = TRUE)
	)
	head(x)

	x$Subject <- rep(1:10, length.out = 20*10)
	head(x)
	with(x, table(Subject, Question))
	str(x)

	x <- transform(x, Response_f = factor(Response))
	str(x)
	levels(x$Response_f)
	levels(x$Response_f) <- c(rep("Low", 3), "Neutral", rep("High", 3))
	levels(x$Response_f)
	with(x, table(Response, Response_f))
	x$Response <- as.character(x$Response_f)
	table(x$Response)

	x <- mtcars
	x$row.names <- rownames(x)
	x[grepl("Merc", x$row.names), 'cyl'] <- 99
	x[grep("\\d", x$row.names), 'row.names'] <- 'NUMBER'
	# Also grep("[0-9]", ...) works, too
	View(x)



	# Missing Values ----------------------------------------------------------

	x <- trees
	N <- nrow(x)
	x_full <- x
	x$Girth[sample(1:N, 6)] <- NA
	x$Height[sample(1:N, 6)] <- NA
	x$Volume[sample(1:N, 6)] <- NA
	x

	# Solution 1 - Averaging
	means <- lapply(x, mean, na.rm= TRUE) # colMeans
	x$Girth[is.na(x$Girth)]
	x$Girth[is.na(x$Girth)] <- means$Girth
	print(x)
	cat("Error: ", sum((x$Girth - x_full$Girth)^2))

	############################################
	# Knowledge Check 3 (Challenging)
	# 1. Define a function that
	# a. Computes the avg of a vector
	# b. Indexes the missing values of a vector
	# c. Replaces the missing values with the avg
	# 2. Apply your function to each column of x
	# 3. (Extra) Compute the error for each column and overall
	############################################



	# Solution 2 - Imputation
	# Blank out six random Girth values, fit Girth on the other two columns,
	# and fill the gaps with the model's predictions. Relies on `x_full` and
	# `N` from the start of this section.
	x <- x_full
	x$Girth[sample(seq_len(N), 6)] <- NA
	fit <- lm(Girth ~ Height + Volume, x)
	summary(fit)
	coef(fit)
	is_missing <- is.na(x$Girth)
	# Predictor values for the rows to fill in. Columns are picked by name
	# rather than by position, and the object is not called `missing`, so
	# base::missing() is not shadowed.
	to_impute <- x[is_missing, c("Height", "Volume")]
	predict(fit, to_impute)
	x$Girth[is_missing] <- predict(fit, to_impute)
	# Sum of squared differences between the imputed and the true Girth.
	cat("Error: ", sum((x$Girth - x_full$Girth)^2), "\n")

	##########################################
	# Knowledge Check 4 (Homework)
	# Create a function that takes in a data frame
	# and a formula, imputes the missing values
	# of the dependent (LHS) variable using the
	# indicated predictors (RHS) variables
	##########################################



	# RECAP
	# Objectives
	# Learning Objectives
	# 1. Iterations
	# 2. User-Defined Functions (UDFs)
	# 3. Data Profiling
	# 4. Multiple Assignments
	# 5. Missing Values
	#
	# Functions Used
	#
	# Construction: vector, rep, seq_along
	# Mathematical: %%, /, ^
	# Control Flow: function, for, if else
	# Vectorized: summary, scale, paste, is.na , ifelse
	# Tabulation: table, prop.table
	# Iterators: apply, lapply, sapply
	# Manipulation: transform, as.character, with
	# Factors: factor, levels
	# Models: lm, summary.lm, coef, predict
	# Other: cat, rnorm