# dantalus/Morning1

## Morning1

  # Install the tidyverse only when it is missing, so re-running the script
  # does not download it again every time; then attach it.
  if (!requireNamespace("tidyverse", quietly = TRUE)) {
    install.packages("tidyverse")
  }
  library(tidyverse)

# Objects ####

# Most of R, from an applied point of view anyway, is the process of creating
# objects and feeding them into functions to make amazing, new objects.

# amazing_new_object <- f(object)

  x <- c(3, 4, 5) # A numeric vector
  y <- mean(x)    # A new object made by feeding x into a function

# This is true in the big picture sense as well:
# information
# dataframe <- f(information)
# plot      <- f(dataframe)
# model     <- f(dataframe)
# table     <- f(model)
# report    <- f(plot, table)

# But before we get to that point...
# The first objects we typically work with can be described as data structures,
# and these can hold different types of data:

  typeof(1)      # double
  typeof("Yes")  # character
  typeof(FALSE)  # logical

# Missing values are represented by NA

  c(1, 2, NA, 4)

# Types of data structures:

# Vectors - a one-dimensional set of values that must all be of the same type.
  c(1, 2, 3, 3.277465) # All numbers
  c("Yes", "No", "1")  # All characters
  c(FALSE, TRUE)       # All logical
  c(F, T)              # All logical (Warning - T and F can be reassigned,
                       # so spell out TRUE and FALSE)


# Matrices/Arrays - two or more dimensions

  m <- matrix(1:9, 3, 3) # 3 rows and 3 columns

  dim(m)

  m

# Vectors, matrices, and arrays can only contain 1 kind of data. This is
# important to understand.

# What happens when we create a vector with multiple types of data?
  v <- c(1, TRUE, "Yes")
  typeof(v) # character
  v

# You can see that the 1 and TRUE were converted to "1" and "TRUE"

# Lists are a special type of vector that allow us to combine different types of
# data.

  v <- list(1, TRUE, "Yes")
  typeof(v[[1]]) # double
  typeof(v[[2]]) # logical
  typeof(v[[3]]) # character
  typeof(v)      # list
  class(v)       # list
  class(1)       # numeric

# typeof() reports how the values are stored; class() reports the object's class

  typeof(c(1, 2.2)) # double
  class(c(1, 2.2))  # numeric

  typeof(c(1, 2))   # double
  class(c(1, 2))    # numeric

# The : operator creates integers; c() with plain numbers creates doubles

  typeof(1:3)        # integer
  typeof(c(1:3))     # integer
  typeof(c(1, 2, 3)) # double


# Dataframes are in turn special types of lists that correspond to the concept
# of a dataset (a rectangular matrix of values, with observations in rows and
# variables in columns, perhaps with some labels or other metadata attached)

  data <- data.frame(number = rnorm(50),
                     char   = sample(letters,        50, replace = TRUE),
                     logic  = sample(c(TRUE, FALSE), 50, replace = TRUE))

  View(data)

# Objects can be written to disk, removed from the environment, and restored.
# Only the objects that were just saved are removed, so anything else in the
# workspace is left alone.

  save(data, v, m, x, y, file = "data.RData") # Save an object or set of objects
  rm(data, v, m, x, y)                        # Remove those objects
  load("data.RData")                          # Bring those objects back

# There are many functions to help us better understand objects

  mode(data)
  typeof(data)
  class(data)

  class(data) <- c(class(data), "bob") # We can assign classes

  str(data) # Structure

  reg <- lm(number ~ char, data = data) # A nonsense regression model
  reg
  summary(reg)
  str(reg)
  typeof(reg)
  class(reg)

  View(data)
  utils::View(data) # When you want an un-constricted view

  names(data)
  attributes(data)
  dim(data)
  length(data) # Number of columns, because a dataframe is a list of columns
  nrow(data)
  ncol(data)

  is.character("x")
  is.numeric(1)
  x <- factor(c(1, 2))
  is.factor(x)
  is.logical(FALSE)

  is.na(c(1, 2, NA, 4))


# ! will reverse logical values

  !is.na(c(1, 2, NA, 4))


# Subsetting ####

# Part of working with R is being able to take apart objects and rearrange the
# parts.

# Indexing

# One dimension
  vec <- sample(c(0:9), 100, replace = TRUE)
  vec[2]

# Two dimensions: [row, column]
  mat <- matrix(c(1, 2, 3, 3, 2, 1), ncol = 2)
  mat[1, 2] # Index the object `mat`, not the function `matrix`

# Lists
  x <- list(a = c(1, 2), b = c(4, 4), c = c(6, 8), d = c(9, 11))

  x[[1]]    # The first element itself (a numeric vector)
  x[[1]][1] # The first value of that vector
  x[1]      # A list of length one that contains the first element

# These give different results
  class(x[[1]])
  str(x[[1]])

  class(x[1])
  str(x[1])
  attributes(x[1])

# $ for named elements in a list
  x$a
  class(x$a)
  x$a[1]


# Selecting multiple elements
  x <- letters
  x[c(1, 2, 6)]

  x <- sample(c(0:9), 100, replace = TRUE)
  x[x < 5] # Keep the values where the condition is TRUE
  x < 5    # The logical vector doing the selecting

# tibble() replaces the deprecated data_frame()
  d <- tibble(number    = sample(0:9, 100, replace = TRUE),
              character = rep(c("a", "b"), 50))

  lapply(d, class)

# Select columns with a logical vector that has one value per column
  d[unlist(lapply(d, is.numeric))] %>% head()
  d[       sapply(d, is.numeric)]  %>% head()

# You can name elements in data structures besides lists.
  x <- c(    1,     4,     6,     9)
  str(x)
  x <- c(a = 1, b = 4, c = 6, d = 9)
  str(x)
  names(x)
  attributes(x)
  attr(x, "description") <- "This is a named vector"
  attributes(x)

# But $ only works with lists
  try(x$a) # Error for an atomic vector; try() lets the script keep running
  x["a"]   # Use [ ] (or [[ ]]) with the name instead
  x <- list(a = 1, b = 4, c = 6, d = 9)
  x$a


# Making and combining objects ####

  ?c
  ?matrix
  ?array
  ?list

  seq_along(c(1:20))
  seq_along(c(100:120))

  seq(from = 0, to = 100, by = 10)
  seq(0, 100, 10)

  rep(c(1, 2), times = 100)
  rep(c(1, 2), each = 100)


# Combining different data types can be tricky

# Different data types will typically reduce to the type with the lowest level
# of information

  x <- c(1, "character")
  x
  class(x)

  x <- c(1, TRUE, "character")
  x
  class(x)

  x <- c(1, TRUE, FALSE)
  x
  class(x)


# No problem with a list
  x <- list(1, TRUE, "character")
  x
  class(x)

# Vectors can be combined to make matrices, but be careful
# R will extend a shorter vector to match a longer one, thus creating data
# you might not expect.

  ?rbind

  length(rbind(sample(0:9, 100, replace = TRUE)))
  length(c("a", "b"))

  m <- rbind(sample(0:9, 100, replace = TRUE),
             c("a", "b"))
  class(m)
  View(m)

# Dataframes will prevent you from doing this

  m <- rbind(sample(0:9, 100, replace = TRUE),
             c("a", "b")) %>% as.data.frame() # Not this way

# tibble() replaces the deprecated data_frame(). The call is expected to fail,
# so it is wrapped in try() to let the rest of the script keep running.
  try(m <- tibble(sample(0:9, 100, replace = TRUE),
                  c("a", "b"))) # Error, which is correct

# We can also combine by columns
  ?cbind

  m <- cbind(sample(0:9,     100, replace = TRUE),
             sample(letters, 100, replace = TRUE))
  class(m)
  View(m)

# There are other functions to help switch between information types
  as.character(1)
  as.numeric("1")
  as.numeric("dog") # NA, with a warning
  as.factor(1)


# We can put strings together with paste.

  paste(letters, "hello", sep = "_")
  paste(letters, letters, sep = "_")
  paste(letters, c("yes", "no"), sep = "_")

  paste0(letters, "hello")

# We can sample and simulate data

  sample(letters, size = 100, replace = TRUE)
  sample(letters, size = 10,  replace = FALSE)

  rnorm(10,    mean = 0, sd = 1) %>% qplot()

  rnorm(10000,        0,      1) %>% qplot()

  d <- tibble(A = sample(c(0:9), size = 100, replace = TRUE),
              B = sample(c(0:9), size = 100, replace = TRUE),
              C = sample(c(0:9), size = 100, replace = TRUE),
              D = sample(c(0:9), size = 100, replace = TRUE))

  d$total <- d$A + d$B + d$C + d$D

  d$total2 <- rowSums(d[c(1:4)])
  d$mean  <- d$total / 4
  d$mean2 <- rowMeans(d[c(1:4)])

  ggplot(d, aes(x = mean, y = mean2)) + geom_point()

  x <- c(c(1:10), rep(c(1, 2, 3), each = 2))
  x[duplicated(x)] # The values that repeat an earlier value

# unique() already returns the distinct values, so it is not used as an index
  x <- c(c(1:10), rep(c(1, 2, 3), each = 2))
  unique(x)
  length(x) - length(unique(x)) # How many values are repeats


# Factors ####

# Factors are a special kind of numeric variable with labels attached to each
# value, signifying categorical (nominal, ordered) data.

  f <- sample(c("Yes", "No", "Maybe"), size = 100, replace = TRUE,
              prob = c(0.3, 0.6, 0.1))

  f.1 <- factor(f)

# The "levels" are the labels

  levels(f.1)
  table(f.1)

# Confirm the structure
  str(f.1)

# The underlying numbers:
  table(as.numeric(f.1))

# The order of the levels matters. By default, they will be in alphabetical
# order
  sample(letters[c(1, 5, 8)], size = 100, replace = TRUE) %>%
    factor() %>%
    levels()

  sample(letters[c(5, 8, 1)], size = 100, replace = TRUE) %>%
    factor() %>%
    class()

# Set the order of the levels explicitly
  sample(letters[c(5, 8, 1)], size = 100, replace = TRUE) %>%
    factor(levels = c("e", "h", "a"))

# ordered = TRUE makes the levels ranked (e < h < a)
  sample(letters[c(5, 8, 1)], size = 100, replace = TRUE) %>%
    factor(levels = c("e", "h", "a"), ordered = TRUE)

# Numeric values are sorted numerically...
  sample(c(1, 2, 10, 20, 100), size = 100, replace = TRUE) %>%
    factor()

# ...but the same values stored as characters are sorted as text
  sample(as.character(c(1, 2, 10, 20, 100)), size = 100, replace = TRUE) %>%
    factor()

# Reordering levels
  table(f.1)
  levels(f.1) <- c("Yes", "No", "Maybe")
  table(f.1) # Bad! This renames the levels, so the counts now carry the
             # wrong labels

  levels(f.1) <- rev(levels(f.1)) # Switch it back
  table(f.1)

# Do it with factor()
  f.1 <- factor(f, levels = rev(levels(f.1)))
  table(f.1) # Correct

  f.1 <- relevel(f.1, ref = "Maybe") # Move "Maybe" to the first level
  table(f.1)

  table(as.numeric(f.1)) # Convert to the underlying number

# Do it manually
  f.1 <- factor(f, levels = c("Maybe", "Yes", "No"))
  table(f.1) # Correct

# You need to use the existing levels
  f.1 <- factor(f, levels = c("A", "B", "C")) # Bad - every value becomes NA

  f.1 <- factor(f)
  f.2 <- factor(f, labels = c("A", "B", "C")) # Use the labels option

  table(f.1, f.2)

  levels(f.2) # The labels become the levels forevermore

# Numbers as factors

  f <- sample(c(10, 20, 50, 60, 65, 90), size = 100, replace = TRUE)

  f.1 <- factor(f)
  levels(f.1)
  str(f.1)
  f.1 %>% as.numeric() %>% table() # No - these are the level codes (1, 2, ...)

  as.numeric(levels(f.1)[f.1]) %>% table() # Yes - the original values

  f.1 <- cut(f, 4) # Equally spaced levels
  table(f.1)
  str(f.1)

  f.1 <- cut(f, 4, labels = c("Low", "Med", "High", "Very High"))
  table(f.1)
  levels(f.1)
  as.character(f.1)

# ~ equally sized levels
# include.lowest = TRUE keeps the minimum value in the first interval; without
# it the smallest observations would be turned into NA.
  f.1 <- cut(f, breaks = quantile(f, 0:4/4), include.lowest = TRUE)
  table(f.1)
  levels(f.1)

# User defined cuts
  bmi <- rnorm(100, 26, 4) # mean 26, sd 4

  qplot(bmi)

# Inf as the top break keeps the breaks increasing whatever the largest
# simulated value turns out to be.
  bmi <- cut(bmi, c(0, 18.5, 25, 30, Inf),
             labels = c("UW", "NW", "OW", "OB"))

  table(bmi)

# Reordering levels based on other values

# tibble() replaces the deprecated data_frame()
  data <- tibble(number = rnorm(100, 0, 1),
                 factor = factor(sample(letters[1:5], 100, replace = TRUE)))


  levels(data$factor)

# Add each level's mean of `number` as a new column
  data <- group_by(data, factor) %>%
    summarise(mean = mean(number)) %>%
    full_join(data, by = "factor")

  table(data$factor, data$mean)

  ggplot(data, aes(x = factor, fill = mean)) +
    geom_bar()

# Order the levels by their mean
  data$factor <- reorder(data$factor, data$mean)

  levels(data$factor)

  ggplot(data, aes(x = factor, fill = mean)) +
    geom_bar()

# Add each level's count, then order the levels by it
  data <- group_by(data, factor) %>%
    summarise(count = n()) %>%
    full_join(data, by = "factor")

  data$factor <- reorder(data$factor, data$count)

  ggplot(data, aes(x = factor, fill = mean)) +
    geom_bar()

# Describing data ####

  bmi <- rnorm(100, 26, 4)

  mean(bmi)

  min(bmi)

  max(bmi)

  quantile(bmi, 0.50, na.rm = TRUE) # The median

  quantile(bmi, seq(0.05, 0.95, by = 0.05), na.rm = TRUE)

# Set the values between 30 and 32 to missing
  bmi[bmi > 30 & bmi < 32 & !is.na(bmi)] <- NA

  !is.na(bmi)

  sum(is.na(bmi)) # Count the missing values

  mean(bmi)               # NA if any value is missing
  mean(bmi, na.rm = TRUE) # Drop the missing values first

	# Install the tidyverse only when it is missing, so re-running the script
	# does not download it again every time; then attach it.
	if (!requireNamespace("tidyverse", quietly = TRUE)) {
	  install.packages("tidyverse")
	}
	library(tidyverse)

	# Objects ####

	# Most of R, from an applied point of view anyway, is the process of creating
	# objects and feeding them into functions to make amazing, new objects.

	# amazing_new_object <- f(object)

	x <- c(3, 4, 5) # A numeric vector
	y <- mean(x) # A new object made by feeding x into a function

	# This is true in the big picture sense as well:
	# information
	# dataframe <- f(information)
	# plot <- f(dataframe)
	# model <- f(dataframe)
	# table <- f(model)
	# report <- f(plot, table)

	# But before we get to that point...
	# The first objects we typically work with can be described as data structures,
	# and these can hold different types of data:

	typeof(1) # double
	typeof("Yes") # character
	typeof(FALSE) # logical

	# Missing values are represented by NA

	c(1, 2, NA, 4)

	# Types of data structures:

	# Vectors - a one-dimensional set of values that must all be of the same type.
	c(1, 2, 3, 3.277465) # All numbers
	c("Yes", "No", "1") # All characters
	c(FALSE, TRUE) # All logical
	c(F, T) # All logical (Warning - T and F can be reassigned, so spell out
	# TRUE and FALSE)


	# Matrices/Arrays - two or more dimensions

	m <- matrix(1:9, 3, 3) # 3 rows and 3 columns

	dim(m)

	m

	# Vectors, matrices, and arrays can only contain 1 kind of data. This is
	# important to understand.

	# What happens when we create a vector with multiple types of data?
	v <- c(1, TRUE, "Yes")
	typeof(v) # character
	v

	# You can see that the 1 and TRUE were converted to "1" and "TRUE"

	# Lists are a special type of vector that allow us to combine different types of
	# data.

	v <- list(1, TRUE, "Yes")
	typeof(v[[1]]) # double
	typeof(v[[2]]) # logical
	typeof(v[[3]]) # character
	typeof(v) # list
	class(v) # list
	class(1) # numeric

	# typeof() reports how the values are stored; class() reports the object's
	# class

	typeof(c(1, 2.2)) # double
	class(c(1, 2.2)) # numeric

	typeof(c(1, 2)) # double
	class(c(1, 2)) # numeric

	# The : operator creates integers; c() with plain numbers creates doubles

	typeof(1:3) # integer
	typeof(c(1:3)) # integer
	typeof(c(1, 2, 3)) # double


	# Dataframes are in turn special types of lists that correspond to the concept
	# of a dataset (a rectangular matrix of values, with observations in rows and
	# variables in columns, perhaps with some labels or other metadata attached)

	data <- data.frame(number = rnorm(50),
	                   char   = sample(letters,        50, replace = TRUE),
	                   logic  = sample(c(TRUE, FALSE), 50, replace = TRUE))

	View(data)

	# Objects can be written to disk, removed from the environment, and restored.
	# Only the objects that were just saved are removed, so anything else in the
	# workspace is left alone.

	save(data, v, m, x, y, file = "data.RData") # Save an object or set of objects
	rm(data, v, m, x, y)                        # Remove those objects
	load("data.RData")                          # Bring those objects back

	# There are many functions to help us better understand objects

	mode(data)
	typeof(data)
	class(data)

	class(data) <- c(class(data), "bob") # We can assign classes

	str(data) # Structure

	reg <- lm(number ~ char, data = data) # A nonsense regression model
	reg
	summary(reg)
	str(reg)
	typeof(reg)
	class(reg)

	View(data)
	utils::View(data) # When you want an un-constricted view

	names(data)
	attributes(data)
	dim(data)
	length(data) # Number of columns, because a dataframe is a list of columns
	nrow(data)
	ncol(data)

	is.character("x")
	is.numeric(1)
	x <- factor(c(1, 2))
	is.factor(x)
	is.logical(FALSE)

	is.na(c(1, 2, NA, 4))


	# ! will reverse logical values

	!is.na(c(1, 2, NA, 4))


	# Subsetting ####

	# Part of working with R is being able to take apart objects and rearrange the
	# parts.

	# Indexing

	# One dimension
	vec <- sample(c(0:9), 100, replace = TRUE)
	vec[2]

	# Two dimensions: [row, column]
	mat <- matrix(c(1, 2, 3, 3, 2, 1), ncol = 2)
	mat[1, 2] # Index the object `mat`, not the function `matrix`

	# Lists
	x <- list(a = c(1, 2), b = c(4, 4), c = c(6, 8), d = c(9, 11))

	x[[1]]    # The first element itself (a numeric vector)
	x[[1]][1] # The first value of that vector
	x[1]      # A list of length one that contains the first element

	# These give different results
	class(x[[1]])
	str(x[[1]])

	class(x[1])
	str(x[1])
	attributes(x[1])

	# $ for named elements in a list
	x$a
	class(x$a)
	x$a[1]


	# Selecting multiple elements
	x <- letters
	x[c(1, 2, 6)]

	x <- sample(c(0:9), 100, replace = TRUE)
	x[x < 5] # Keep the values where the condition is TRUE
	x < 5    # The logical vector doing the selecting

	# tibble() replaces the deprecated data_frame()
	d <- tibble(number    = sample(0:9, 100, replace = TRUE),
	            character = rep(c("a", "b"), 50))

	lapply(d, class)

	# Select columns with a logical vector that has one value per column
	d[unlist(lapply(d, is.numeric))] %>% head()
	d[       sapply(d, is.numeric)]  %>% head()

	# You can name elements in data structures besides lists.
	x <- c(    1,     4,     6,     9)
	str(x)
	x <- c(a = 1, b = 4, c = 6, d = 9)
	str(x)
	names(x)
	attributes(x)
	attr(x, "description") <- "This is a named vector"
	attributes(x)

	# But $ only works with lists
	try(x$a) # Error for an atomic vector; try() lets the script keep running
	x["a"]   # Use [ ] (or [[ ]]) with the name instead
	x <- list(a = 1, b = 4, c = 6, d = 9)
	x$a


	# Making and combining objects ####

	?c
	?matrix
	?array
	?list

	seq_along(c(1:20))
	seq_along(c(100:120))

	seq(from = 0, to = 100, by = 10)
	seq(0, 100, 10)

	rep(c(1, 2), times = 100)
	rep(c(1, 2), each = 100)


	# Combining different data types can be tricky

	# Different data types will typically reduce to the type with the lowest level
	# of information

	x <- c(1, "character")
	x
	class(x)

	x <- c(1, TRUE, "character")
	x
	class(x)

	x <- c(1, TRUE, FALSE)
	x
	class(x)


	# No problem with a list
	x <- list(1, TRUE, "character")
	x
	class(x)

	# Vectors can be combined to make matrices, but be careful
	# R will extend a shorter vector to match a longer one, thus creating data
	# you might not expect.

	?rbind

	length(rbind(sample(0:9, 100, replace = TRUE)))
	length(c("a", "b"))

	m <- rbind(sample(0:9, 100, replace = TRUE),
	           c("a", "b"))
	class(m)
	View(m)

	# Dataframes will prevent you from doing this

	m <- rbind(sample(0:9, 100, replace = TRUE),
	           c("a", "b")) %>% as.data.frame() # Not this way

	# tibble() replaces the deprecated data_frame(). The call is expected to fail,
	# so it is wrapped in try() to let the rest of the script keep running.
	try(m <- tibble(sample(0:9, 100, replace = TRUE),
	                c("a", "b"))) # Error, which is correct

	# We can also combine by columns
	?cbind

	m <- cbind(sample(0:9,     100, replace = TRUE),
	           sample(letters, 100, replace = TRUE))
	class(m)
	View(m)

	# There are other functions to help switch between information types
	as.character(1)
	as.numeric("1")
	as.numeric("dog") # NA, with a warning
	as.factor(1)


	# We can put strings together with paste.

	paste(letters, "hello", sep = "_")
	paste(letters, letters, sep = "_")
	paste(letters, c("yes", "no"), sep = "_")

	paste0(letters, "hello")

	# We can sample and simulate data

	sample(letters, size = 100, replace = TRUE)
	sample(letters, size = 10,  replace = FALSE)

	rnorm(10,    mean = 0, sd = 1) %>% qplot()

	rnorm(10000,        0,      1) %>% qplot()

	d <- tibble(A = sample(c(0:9), size = 100, replace = TRUE),
	            B = sample(c(0:9), size = 100, replace = TRUE),
	            C = sample(c(0:9), size = 100, replace = TRUE),
	            D = sample(c(0:9), size = 100, replace = TRUE))

	d$total <- d$A + d$B + d$C + d$D

	d$total2 <- rowSums(d[c(1:4)])
	d$mean  <- d$total / 4
	d$mean2 <- rowMeans(d[c(1:4)])

	ggplot(d, aes(x = mean, y = mean2)) + geom_point()

	x <- c(c(1:10), rep(c(1, 2, 3), each = 2))
	x[duplicated(x)] # The values that repeat an earlier value

	# unique() already returns the distinct values, so it is not used as an index
	x <- c(c(1:10), rep(c(1, 2, 3), each = 2))
	unique(x)
	length(x) - length(unique(x)) # How many values are repeats


	# Factors ####

	# Factors are a special kind of numeric variable with labels attached to each
	# value, signifying categorical (nominal, ordered) data.

	f <- sample(c("Yes", "No", "Maybe"), size = 100, replace = TRUE,
	prob = c(0.3, 0.6, 0.1))

	f.1 <- factor(f)

	# The "levels" are the labels

	levels(f.1)
	table(f.1)

	# Confirm the structure
	str(f.1)

	# The underlying numbers:
	table(as.numeric(f.1))

	# The order of the levels matters. By default, they will be in alphabetical
	# order
	sample(letters[c(1, 5, 8)], size = 100, replace = TRUE) %>%
	factor() %>%
	levels()

	sample(letters[c(5, 8, 1)], size = 100, replace = TRUE) %>%
	factor() %>%
	class()

	# Set the order of the levels explicitly
	sample(letters[c(5, 8, 1)], size = 100, replace = TRUE) %>%
	factor(levels = c("e", "h", "a"))

	# ordered = TRUE makes the levels ranked (e < h < a)
	sample(letters[c(5, 8, 1)], size = 100, replace = TRUE) %>%
	factor(levels = c("e", "h", "a"), ordered = TRUE)

	# Numeric values are sorted numerically...
	sample(c(1, 2, 10, 20, 100), size = 100, replace = TRUE) %>%
	factor()

	# ...but the same values stored as characters are sorted as text
	sample(as.character(c(1, 2, 10, 20, 100)), size = 100, replace = TRUE) %>%
	factor()

	# Reordering levels
	table(f.1)
	levels(f.1) <- c("Yes", "No", "Maybe")
	table(f.1) # Bad! This renames the levels, so the counts now carry the
	# wrong labels

	levels(f.1) <- rev(levels(f.1)) # Switch it back
	table(f.1)

	# Do it with factor()
	f.1 <- factor(f, levels = rev(levels(f.1)))
	table(f.1) # Correct

	f.1 <- relevel(f.1, ref = "Maybe") # Move "Maybe" to the first level
	table(f.1)

	table(as.numeric(f.1)) # Convert to the underlying number

	# Do it manually
	f.1 <- factor(f, levels = c("Maybe", "Yes", "No"))
	table(f.1) # Correct

	# You need to use the existing levels
	f.1 <- factor(f, levels = c("A", "B", "C")) # Bad - every value becomes NA

	f.1 <- factor(f)
	f.2 <- factor(f, labels = c("A", "B", "C")) # Use the labels option

	table(f.1, f.2)

	levels(f.2) # The labels become the levels forevermore

	# Numbers as factors

	f <- sample(c(10, 20, 50, 60, 65, 90), size = 100, replace = TRUE)

	f.1 <- factor(f)
	levels(f.1)
	str(f.1)
	f.1 %>% as.numeric() %>% table() # No - these are the level codes (1, 2, ...)

	as.numeric(levels(f.1)[f.1]) %>% table() # Yes - the original values

	f.1 <- cut(f, 4) # Equally spaced levels
	table(f.1)
	str(f.1)

	f.1 <- cut(f, 4, labels = c("Low", "Med", "High", "Very High"))
	table(f.1)
	levels(f.1)
	as.character(f.1)

	# ~ equally sized levels
	# include.lowest = TRUE keeps the minimum value in the first interval; without
	# it the smallest observations would be turned into NA.
	f.1 <- cut(f, breaks = quantile(f, 0:4/4), include.lowest = TRUE)
	table(f.1)
	levels(f.1)

	# User defined cuts
	bmi <- rnorm(100, 26, 4) # mean 26, sd 4

	qplot(bmi)

	# Inf as the top break keeps the breaks increasing whatever the largest
	# simulated value turns out to be.
	bmi <- cut(bmi, c(0, 18.5, 25, 30, Inf),
	           labels = c("UW", "NW", "OW", "OB"))

	table(bmi)

	# Reordering levels based on other values

	# tibble() replaces the deprecated data_frame()
	data <- tibble(number = rnorm(100, 0, 1),
	               factor = factor(sample(letters[1:5], 100, replace = TRUE)))


	levels(data$factor)

	# Add each level's mean of `number` as a new column
	data <- group_by(data, factor) %>%
	  summarise(mean = mean(number)) %>%
	  full_join(data, by = "factor")

	table(data$factor, data$mean)

	ggplot(data, aes(x = factor, fill = mean)) +
	  geom_bar()

	# Order the levels by their mean
	data$factor <- reorder(data$factor, data$mean)

	levels(data$factor)

	ggplot(data, aes(x = factor, fill = mean)) +
	  geom_bar()

	# Add each level's count, then order the levels by it
	data <- group_by(data, factor) %>%
	  summarise(count = n()) %>%
	  full_join(data, by = "factor")

	data$factor <- reorder(data$factor, data$count)

	ggplot(data, aes(x = factor, fill = mean)) +
	  geom_bar()

	# Describing data ####

	bmi <- rnorm(100, 26, 4)

	mean(bmi)

	min(bmi)

	max(bmi)

	quantile(bmi, 0.50, na.rm = TRUE) # The median

	quantile(bmi, seq(0.05, 0.95, by = 0.05), na.rm = TRUE)

	# Set the values between 30 and 32 to missing
	bmi[bmi > 30 & bmi < 32 & !is.na(bmi)] <- NA

	!is.na(bmi)

	sum(is.na(bmi)) # Count the missing values

	mean(bmi)               # NA if any value is missing
	mean(bmi, na.rm = TRUE) # Drop the missing values first