# dantalus/Afternoon2

## Afternoon2
# Apply functions ####

# There are cases where you want to "do something" to each element in a given
# data structure. For example, we might want to calculate the
# mean for each variable (column) in a dataframe.

# Looping is a common way to do this.

# Data

# library(tidyverse)

  # Use the built-in iris data set as the example data.
  data <- iris

  View(data)

# Print the mean for the first 4 columns of data

  for (i in 1:4) {
    print(mean(data[[i]], na.rm = TRUE))
  }

# Or something slightly more complicated
# Create a new dataframe made up of the standardized values for first 4 columns
# of data.

  # Preallocate one slot per column instead of growing an empty list.
  results <- vector("list", 4)

  for (i in 1:4) {
    # Use names that do not mask the sd() function being called.
    col_mean <- mean(data[[i]], na.rm = TRUE)
    col_sd   <- sd(data[[i]], na.rm = TRUE)

    results[[i]] <- (data[[i]] - col_mean) / col_sd # Put results in the list
  }

  # Label each element with its source column. Without names,
  # do.call(cbind.data.frame, ...) builds the column names by deparsing the
  # values themselves, and those unreadable names become the facet labels below.
  names(results) <- names(data)[1:4]

  results <- do.call(cbind.data.frame, results) # Convert the list to dataframe

  library(dplyr)
  library(tidyr)
  library(ggplot2) # Provides ggplot(), aes(), geom_density() and facet_wrap()

# Plot the original data
  gather(data[, 1:4], var, value) %>%
    ggplot(aes(x = value, color = var, fill = var)) +
    geom_density() +
    facet_wrap(~var)

# Plot the standardized values
  gather(results, var, value) %>%
    ggplot(aes(x = value, color = var, fill = var)) +
    geom_density() +
    facet_wrap(~var)

# The argument against loops - Just google "Why shouldn't I use for loops r"
# for a deluge of reasons. I use for loops all the time, and you probably
# will/should too, but the basic arguments against them are speed and clarity of
# code.

# One of the strengths of R is vectorization.
# For example, if I want to divide each value of a numeric vector by 2, I don't
# need a for loop that goes through each element of the vector, doing the
# calculation as I go.

  # Element-by-element version, shown only for contrast with the one-liner below.
  v <- NULL

  for (k in seq_along(data[["Sepal.Length"]])) {
    v[k] <- data[["Sepal.Length"]][k] / 2
  }

  v

# I just do this:

  data[["Sepal.Length"]] / 2

# Apply functions are basically tools that take advantage of this vectorization
# to (sometimes) produce faster calculations.
# They are also usually more concise to write. As the name suggests, they apply
# a function to each element of an object.

# The trick to apply functions is to know which type of object goes in and what
# comes out.

  help("apply")

  m <- matrix(1:9, nrow = 3, ncol = 3)
  m
  apply(m, MARGIN = 2, FUN = mean) # one mean per column
  apply(m, MARGIN = 1, FUN = mean) # one mean per row

  apply(data[1:4], MARGIN = 2, FUN = mean) # Ok this works

  apply(data, MARGIN = 2, FUN = class) # The result here doesn't make sense. What happened?

  as.matrix(data)
# apply() works on arrays/matrices, whose elements must all be of the same type.
# The dataframe, however, mixes numeric columns with a non-numeric (factor)
# column, so apply() first converted it to a character matrix - which is why
# every column reports the class "character".

# Dataframes are a special kind of list, so for them we use lapply
# ("l"apply = list apply).

  x <- lapply(data, FUN = class)
  x
  class(x)
  table(x)
  table(unlist(x))

# lapply() always returns a list, so a function like table() does not give good
# results on its output.

# So instead you can use sapply. It's just like lapply, but it "S"implifies the
# output.

  x <- sapply(data, FUN = class)
  x
  class(x)
  table(x)

# I often use sapply to identify columns in a dataframe based on some
# characteristics of the data.

# One way to subset a dataframe by columns is to index with a logical vector:
# column i is kept when element i of the logical vector is TRUE.

# For example, if I want the first 4 columns, but not the fifth, I could do this:

  data[c(rep(TRUE, 4), FALSE)] %>% View()

# That's obviously very tedious. This is better:
  sapply(data, FUN = is.numeric)
  data[sapply(data, FUN = is.numeric)] %>% View()

# You can also build the logical vector from a condition, like this:

  data[sapply(data, FUN = class) == "numeric"]

# Exercise: Take the following dataframe, and concisely convert the c(1, 2)
# variables to a factor with the labels 1 = "No" and 2 = "Yes".

# It has 1000 variables, and those with data = c(1, 2) are randomly scattered
# throughout.

# Generate the data

  # Preallocate the 1000 columns instead of growing the list inside the loop.
  fake <- vector("list", 1000)

  for (i in seq_along(fake)) {
    # Pick one of four column types at random.
    flip <- sample(1:4, 1)

    # switch() on a number evaluates only the alternative at that position.
    fake[[i]] <- switch(
      flip,
      rnorm(50),                                  # 1: continuous numeric
      sample(c(1, 2), 50, replace = TRUE),        # 2: the 1/2 coded variable
      sample(letters, 50, replace = TRUE),        # 3: character
      sample(c(TRUE, FALSE), 50, replace = TRUE)  # 4: logical
    )
  }

  # Name the columns up front so the dataframe gets V1 ... V1000 directly.
  names(fake) <- paste0("V", seq_along(fake))

  data <- do.call(cbind.data.frame, fake)

# A column is one of the 1/2 coded variables when it is numeric, holds nothing
# but the values 1 and 2, and both values actually occur. Testing is.numeric()
# first means as.numeric() is never called on a character column of letters,
# which would return only NA and raise a coercion warning.
  is_one_two <- function(x) {
    is.numeric(x) && all(x %in% c(1, 2)) && length(unique(x)) == 2
  }

# Here are the columns that pass that check
  data[sapply(data, is_one_two)] %>% View()

# How did this work? Break it down.

  sapply(data, is.numeric)

  sapply(data, function(x) all(x %in% c(1, 2)))

  sapply(data, function(x) length(unique(x)))
  sapply(data, function(x) length(unique(x))) == 2

# Now replace those data with the factors

# Create the logical vector (vapply guarantees exactly one TRUE/FALSE per column)

  these <- vapply(data, is_one_two, logical(1))

# Use lapply to turn those into factors

  data[these] <- lapply(data[these],
                        factor,
                        levels = c(1, 2), labels = c("No", "Yes"))

  View(data)

  y <- rnorm(100, mean = 3, sd = 4)

  # Standardize a numeric vector: subtract its mean and divide by its standard
  # deviation, both computed with missing values removed.
  #   x    numeric vector to standardize
  #   ...  accepted but not used by the body
  # Returns a vector the same length as x; NA elements stay NA.
  normalize <- function(x, ...) {
    centered <- x - mean(x, na.rm = TRUE)
    centered / sd(x, na.rm = TRUE)
  }

  normalize(y)


# Regular expressions ####

  # Strip leading and trailing whitespace
  x <- sample(c("    test  ", "test"), size = 1000, replace = TRUE)
  table(x)
  x <- gsub(pattern = "^\\s+|\\s+$", replacement = "", x = x)

  # Remove punctuation, then turn spaces into dots
  x <- sample(c("??test", "!>test", "test&*"), size = 1000, replace = TRUE)
  x <- gsub(pattern = "[[:punct:]]", replacement = "", x = x)
  x <- gsub(pattern = " ", replacement = ".", x = x)

  # Drop a single digit at the start of the string
  x <- gsub(pattern = "^[[:digit:]]", replacement = "", x = x)

  # Spaces and slashes become dots; commas and question marks are removed
  x <- gsub(pattern = " ", replacement = ".", x = x)
  x <- gsub(pattern = "\\/", replacement = ".", x = x)
  x <- gsub(pattern = "\\,", replacement = "", x = x)
  x <- gsub(pattern = "\\?", replacement = "", x = x)

  # Show the values with leading/trailing whitespace removed, then turn them
  # into unique, syntactically valid names
  as.character(gsub(pattern = "^\\s+|\\s+$", replacement = "", x = x))
  x <- make.names(names = x, unique = TRUE)

  x <- c("9as", "0bn")

  # Move digit from front to end
  sub(pattern = "^([0-9])(.+)", replacement = "\\2\\1", x = x)

  # Overwrite every element that contains "bob"
  x <- c("bob45", "bob56", "sarah67")
  x <- replace(x, grepl(pattern = "bob", x = x), "John")
  x

# This captures the first run of digits (\\d+) in the () as \\1, and then puts
# a . in front of what was captured.
  x <- sub(pattern = "(\\d+)", replacement = "\\.\\1", x = x)

# Remove . at end of string
  x <- gsub(pattern = "\\.$", replacement = "", x = x)

# keep only the digit from a string
  # NOTE(review): the snippets below operate on a column `x` of `data`; no such
  # column is created in the code shown here, so `data$x` is presumably a
  # placeholder for a character column of your own - TODO confirm.
  # Locate every run of digits, extract the matches, flatten the resulting list
  # and convert the pieces to numbers.
  gregexpr("[[:digit:]]+", data$x) %>%
    regmatches(data$x, .) %>%
    unlist() %>%
    as.numeric()

# select values that match string in column
  # Every element containing a lowercase "x" is overwritten with "X".
  data$x[grepl("x",  data$x)]    <- "X"

# Keep only the first digit

  # Replaces the first digit and everything after it with that digit alone;
  # any text in front of the first digit is left in place.
  data$x <- sub("([0-9]{1}).*", "\\1", data$x)

# Keep a match
# Keep everything after a given character

  # [^XXX] is a character class meaning "any character except X", so this keeps
  # the trailing run of characters that contains no "X". Presumably XXX stands
  # in for the character of interest - confirm.
  data$x <-  regmatches(data$x, regexpr("([^XXX]*$)", data$x))
	# Apply functions ####

	# There are cases where you want to "do something" to each element in a given
	# data structure. For example, we might want to calculate the
	# mean for each variable (column) in a dataframe.

	# Looping is a common way to do this.

	# Data

	# library(tidyverse)

	# Use the built-in iris data set as the example data.
	data <- iris

	View(data)

	# Print the mean for the first 4 columns of data

	for (i in 1:4) {
	  print(mean(data[[i]], na.rm = TRUE))
	}

	# Or something slightly more complicated
	# Create a new dataframe made up of the standardized values for first 4 columns
	# of data.

	# Preallocate one slot per column instead of growing an empty list.
	results <- vector("list", 4)

	for (i in 1:4) {
	  # Use names that do not mask the sd() function being called.
	  col_mean <- mean(data[[i]], na.rm = TRUE)
	  col_sd   <- sd(data[[i]], na.rm = TRUE)

	  results[[i]] <- (data[[i]] - col_mean) / col_sd # Put results in the list
	}

	# Label each element with its source column. Without names,
	# do.call(cbind.data.frame, ...) builds the column names by deparsing the
	# values themselves, and those unreadable names become the facet labels below.
	names(results) <- names(data)[1:4]

	results <- do.call(cbind.data.frame, results) # Convert the list to dataframe

	library(dplyr)
	library(tidyr)
	library(ggplot2) # Provides ggplot(), aes(), geom_density() and facet_wrap()

	# Plot the original data
	gather(data[, 1:4], var, value) %>%
	  ggplot(aes(x = value, color = var, fill = var)) +
	  geom_density() +
	  facet_wrap(~var)

	# Plot the standardized values
	gather(results, var, value) %>%
	  ggplot(aes(x = value, color = var, fill = var)) +
	  geom_density() +
	  facet_wrap(~var)

	# The argument against loops - Just google "Why shouldn't I use for loops r"
	# for a deluge of reasons. I use for loops all the time, and you probably
	# will/should too, but the basic arguments against them are speed and clarity of
	# code.

	# One of the strengths of R is vectorization.
	# For example, if I want to divide each value of a numeric vector by 2, I don't
	# need a for loop that goes through each element of the vector, doing the
	# calculation as I go.

	# Element-by-element version, shown only for contrast with the one-liner below.
	v <- NULL

	for (k in seq_along(data[["Sepal.Length"]])) {
	  v[k] <- data[["Sepal.Length"]][k] / 2
	}

	v

	# I just do this:

	data[["Sepal.Length"]] / 2

	# Apply functions are basically tools that take advantage of this vectorization
	# to (sometimes) produce faster calculations.
	# They are also usually more concise to write. As the name suggests, they apply
	# a function to each element of an object.

	# The trick to apply functions is to know which type of object goes in and what
	# comes out.

	help("apply")

	m <- matrix(1:9, nrow = 3, ncol = 3)
	m
	apply(m, MARGIN = 2, FUN = mean) # one mean per column
	apply(m, MARGIN = 1, FUN = mean) # one mean per row

	apply(data[1:4], MARGIN = 2, FUN = mean) # Ok this works

	apply(data, MARGIN = 2, FUN = class) # The result here doesn't make sense. What happened?

	as.matrix(data)
	# apply() works on arrays/matrices, whose elements must all be of the same type.
	# The dataframe, however, mixes numeric columns with a non-numeric (factor)
	# column, so apply() first converted it to a character matrix - which is why
	# every column reports the class "character".

	# Dataframes are a special kind of list, so for them we use lapply
	# ("l"apply = list apply).

	x <- lapply(data, FUN = class)
	x
	class(x)
	table(x)
	table(unlist(x))

	# lapply() always returns a list, so a function like table() does not give good
	# results on its output.

	# So instead you can use sapply. It's just like lapply, but it "S"implifies the
	# output.

	x <- sapply(data, FUN = class)
	x
	class(x)
	table(x)

	# I often use sapply to identify columns in a dataframe based on some
	# characteristics of the data.

	# One way to subset a dataframe by columns is to index with a logical vector:
	# column i is kept when element i of the logical vector is TRUE.

	# For example, if I want the first 4 columns, but not the fifth, I could do this:

	data[c(rep(TRUE, 4), FALSE)] %>% View()

	# That's obviously very tedious. This is better:
	sapply(data, FUN = is.numeric)
	data[sapply(data, FUN = is.numeric)] %>% View()

	# You can also build the logical vector from a condition, like this:

	data[sapply(data, FUN = class) == "numeric"]

	# Exercise: Take the following dataframe, and concisely convert the c(1, 2)
	# variables to a factor with the labels 1 = "No" and 2 = "Yes".

	# It has 1000 variables, and those with data = c(1, 2) are randomly scattered
	# throughout.

	# Generate the data

	# Preallocate the 1000 columns instead of growing the list inside the loop.
	fake <- vector("list", 1000)

	for (i in seq_along(fake)) {
	  # Pick one of four column types at random.
	  flip <- sample(1:4, 1)

	  # switch() on a number evaluates only the alternative at that position.
	  fake[[i]] <- switch(
	    flip,
	    rnorm(50),                                  # 1: continuous numeric
	    sample(c(1, 2), 50, replace = TRUE),        # 2: the 1/2 coded variable
	    sample(letters, 50, replace = TRUE),        # 3: character
	    sample(c(TRUE, FALSE), 50, replace = TRUE)  # 4: logical
	  )
	}

	# Name the columns up front so the dataframe gets V1 ... V1000 directly.
	names(fake) <- paste0("V", seq_along(fake))

	data <- do.call(cbind.data.frame, fake)

	# A column is one of the 1/2 coded variables when it is numeric, holds nothing
	# but the values 1 and 2, and both values actually occur. Testing is.numeric()
	# first means as.numeric() is never called on a character column of letters,
	# which would return only NA and raise a coercion warning.
	is_one_two <- function(x) {
	  is.numeric(x) && all(x %in% c(1, 2)) && length(unique(x)) == 2
	}

	# Here are the columns that pass that check
	data[sapply(data, is_one_two)] %>% View()

	# How did this work? Break it down.

	sapply(data, is.numeric)

	sapply(data, function(x) all(x %in% c(1, 2)))

	sapply(data, function(x) length(unique(x)))
	sapply(data, function(x) length(unique(x))) == 2

	# Now replace those data with the factors

	# Create the logical vector (vapply guarantees exactly one TRUE/FALSE per column)

	these <- vapply(data, is_one_two, logical(1))

	# Use lapply to turn those into factors

	data[these] <- lapply(data[these],
	                      factor,
	                      levels = c(1, 2), labels = c("No", "Yes"))

	View(data)

	y <- rnorm(100, mean = 3, sd = 4)

	# Standardize a numeric vector: subtract its mean and divide by its standard
	# deviation, both computed with missing values removed.
	#   x    numeric vector to standardize
	#   ...  accepted but not used by the body
	# Returns a vector the same length as x; NA elements stay NA.
	normalize <- function(x, ...) {
	  centered <- x - mean(x, na.rm = TRUE)
	  centered / sd(x, na.rm = TRUE)
	}

	normalize(y)




	# Regular expressions ####

	# Strip leading and trailing whitespace.
	# The alternation must be written as a plain | inside the string: "\|" is not
	# a valid escape in an R string literal and stops the file from parsing.
	x <- sample(c(" test ", "test"), 1000, replace = TRUE)
	table(x)
	x <- gsub("^\\s+|\\s+$", "", x) # trailing/leading space

	# Remove punctuation, then turn spaces into dots
	x <- sample(c("??test", "!>test", "test&*"), 1000, replace = TRUE)
	x <- gsub("[[:punct:]]", "", x)
	x <- gsub(" ", ".", x)

	# Drop a single digit at the start of the string
	x <- gsub("^[[:digit:]]", "", x)

	# Spaces and slashes become dots; commas and question marks are removed
	x <- gsub(" ", ".", x)
	x <- gsub("\\/", ".", x)
	x <- gsub("\\,", "", x)
	x <- gsub("\\?", "", x)

	# Show the values with leading/trailing whitespace removed, then turn them
	# into unique, syntactically valid names
	as.character(gsub("^\\s+|\\s+$", "", x)) # lead, trailing white space
	x <- make.names(x, unique = TRUE)

	x <- c("9as", "0bn")

	sub("^([0-9])(.+)", "\\2\\1", x) # Move digit from front to end

	# Overwrite every element that contains "bob"
	x <- c("bob45", "bob56", "sarah67")
	x[grepl("bob", x)] <- "John"
	x

	# This captures the first run of digits (\\d+) in the () as \\1, and then puts
	# a . in front of what was captured.
	x <- sub("(\\d+)", "\\.\\1", x)

	# Remove . at end of string
	x <- gsub("\\.$", "", x)

	# keep only the digit from a string
	# NOTE(review): the snippets below operate on a column `x` of `data`; no such
	# column is created in the code shown here, so `data$x` is presumably a
	# placeholder for a character column of your own - TODO confirm.
	# Locate every run of digits, extract the matches, flatten the resulting list
	# and convert the pieces to numbers.
	gregexpr("[[:digit:]]+", data$x) %>%
	regmatches(data$x, .) %>%
	unlist() %>%
	as.numeric()

	# select values that match string in column
	# Every element containing a lowercase "x" is overwritten with "X".
	data$x[grepl("x", data$x)] <- "X"

	# Keep only the first digit

	# Replaces the first digit and everything after it with that digit alone;
	# any text in front of the first digit is left in place.
	data$x <- sub("([0-9]{1}).*", "\\1", data$x)

	# Keep a match
	# Keep everything after a given character

	# [^XXX] is a character class meaning "any character except X", so this keeps
	# the trailing run of characters that contains no "X". Presumably XXX stands
	# in for the character of interest - confirm.
	data$x <- regmatches(data$x, regexpr("([^XXX]*$)", data$x))