Skip to content

Instantly share code, notes, and snippets.

@dantalus
Created June 9, 2017 14:36
Show Gist options
  • Save dantalus/3aa72448c0678be9adb277330ccc1e94 to your computer and use it in GitHub Desktop.
Save dantalus/3aa72448c0678be9adb277330ccc1e94 to your computer and use it in GitHub Desktop.
# Apply functions ####
# There are cases where you want to "do something" to each element in a given
# data structure. For example, we might want to calculate the
# mean for each variable (column) in a dataframe.
# Looping is a common way to do this.
# Data
# library(tidyverse)
data <- iris
View(data)
# Print the mean of each of the first 4 columns of data.
# Looping over a dataframe visits its columns one at a time.
for (column in data[1:4]) {
  print(mean(column, na.rm = TRUE))
}
# Or something slightly more complicated
# Create a new dataframe made up of the standardized values for first 4 columns
# of data.
# Standardize (z-score) each of the first 4 columns of data.
# The list is created at its final length and named after the source columns,
# so the dataframe built from it keeps readable column names (an unnamed list
# passed through do.call() ends up with deparsed values as column names).
cols <- names(data)[1:4]
results <- vector("list", length(cols))
names(results) <- cols
for (col in cols) {
  # Use distinct names for the statistics so the sd() function is not shadowed
  col_mean <- mean(data[[col]], na.rm = TRUE)
  col_sd <- sd(data[[col]], na.rm = TRUE)
  results[[col]] <- (data[[col]] - col_mean) / col_sd # Put results in the list
}
results <- do.call(cbind.data.frame, results) # Convert the list to dataframe
library(dplyr)
library(tidyr)
# ggplot(), aes(), geom_density() and facet_wrap() live in ggplot2, which is
# attached by neither dplyr nor tidyr, so it has to be loaded explicitly.
library(ggplot2)
# Plot the original data: reshape the 4 measurement columns to long format
# (one row per value, labelled by its variable) and draw one density per panel.
gather(data[, 1:4], var, value) %>%
  ggplot(aes(x = value, color = var, fill = var)) +
  geom_density() +
  facet_wrap(~var)
# Plot the standardized values the same way
gather(results, var, value) %>%
  ggplot(aes(x = value, color = var, fill = var)) +
  geom_density() +
  facet_wrap(~var)
# The argument against loops - Just google "Why shouldn't I use for loops r"
# for a deluge of reasons. I use for loops all the time, and you probably
# will/should too, but the basic arguments against them are speed and clarity of
# code.
# One of the strengths of R is vectorization.
# For example, if I want to divide each value of a numeric vector by 2, I don't
# need a for loop that goes through each element of the vector, doing the
# calculation as I go.
# The loop version: halve every element one at a time.
# The result vector is allocated at its final length up front rather than
# being grown by one element on every iteration.
v <- numeric(length(data$Sepal.Length))
for (i in seq_along(data$Sepal.Length)) {
  v[i] <- data$Sepal.Length[i] / 2
}
v
# I just do this:
data$Sepal.Length / 2
# Apply functions are basically tools that take advantage of this vectorization
# to (sometimes) produce faster calculations.
# They are also usually more concise to write. As the name suggests, they apply
# a function to each element of an object.
# The trick to apply functions is to know which type of object goes in and what
# comes out.
help("apply")
# A 3 x 3 matrix filled column by column with 1..9
m <- matrix(1:9, nrow = 3, ncol = 3)
m
apply(m, MARGIN = 2, FUN = mean) # MARGIN = 2: one mean per column
apply(m, MARGIN = 1, FUN = mean) # MARGIN = 1: one mean per row
apply(data[1:4], MARGIN = 2, FUN = mean) # Ok this works
apply(data, MARGIN = 2, FUN = class) # The result here doesn't make sense. What happened?
as.matrix(data) # This is the matrix that apply() actually worked on
# Apply works on arrays/matrices, which must all contain the same kind of data.
# The dataframe however includes a mixture of double and factor columns.
# So apply converted your dataframe into a character matrix, so the class for
# each column is also character.
# For dataframes, which are a special kind of list, we use lapply
# ("l"apply = list apply).
# lapply visits each column and returns one list element per column
x <- lapply(X = data, FUN = class)
x
class(x)
table(x)
table(unlist(x))
# The output of lapply is also a list, so using a function like table on it
# won't give good results unless it is flattened first.
# So instead you can use sapply. It's just like lapply, but it "S"implifies the
# output.
x <- sapply(X = data, FUN = class)
x
class(x)
table(x)
# I often use sapply to identify columns in a dataframe based on some
# characteristics of the data.
# One way to subset a dataframe by columns is to index with a logical vector,
# where you keep the columns i that correspond to i = TRUE in a logical vector.
# For example, if I want the first 4 columns, but not the fifth, I could do this:
# Keep columns 1-4 and drop column 5 by spelling the logical vector out
data[rep(c(TRUE, FALSE), times = c(4, 1))] %>% View()
# That's obviously very tedious. This is better: let sapply build the
# logical vector, one TRUE/FALSE per column.
numeric_cols <- sapply(data, is.numeric)
numeric_cols
data[numeric_cols] %>% View()
# You can also set up conditional statements resulting in a logical vector like this:
data[sapply(data, class) == "numeric"]
# Exercise: Take the following dataframe, and concisely convert the c(1, 2)
# variables to a factor with the labels 1 = "No" and 2 = "Yes".
# It has 1000 variables, and those with data = c(1, 2) are randomly scattered
# throughout.
# Generate the data
# Build 1000 columns of 50 values each. Every column is randomly one of four
# kinds: standard normal draws, 1/2 codes, lowercase letters, or logicals.
n_vars <- 1000
fake <- vector("list", n_vars) # Allocate the full list once, then fill it
for (i in seq_len(n_vars)) {
  flip <- sample(1:4, 1)
  # switch() evaluates only the branch picked by flip
  fake[[i]] <- switch(flip,
    rnorm(50),
    sample(c(1, 2), 50, replace = TRUE),
    sample(letters, 50, replace = TRUE),
    sample(c(TRUE, FALSE), 50, replace = TRUE)
  )
}
data <- do.call(cbind.data.frame, fake)
# Name the columns V1, V2, ..., one name per column actually present
names(data) <- paste0("V", seq_along(data))
# Summarise every column once and reuse the summaries below, instead of
# re-scanning all the columns for every expression: the minimum, the maximum
# and the number of distinct values of each column after as.numeric().
col_min <- sapply(data, function(x) min(as.numeric(x)))
col_max <- sapply(data, function(x) max(as.numeric(x)))
n_values <- sapply(data, function(x) length(table(as.numeric(x))))
# Here are the columns with min = 1, max = 2 and number of responses = 2
data[col_min == 1 & col_max == 2 & n_values == 2] %>% View()
# How did this work? Break it down: each summary, then the test applied to it.
col_min
col_min == 1
col_max
col_max == 2
n_values
n_values == 2
# Now replace those data with the factors
# Create the logical vector (TRUE only where all three tests hold)
these <- col_min == 1 & col_max == 2 & n_values == 2
# Use lapply to turn those into factors; the extra arguments are passed on
# to factor() for every selected column
data[these] <- lapply(
  data[these],
  factor,
  levels = c(1, 2), labels = c("No", "Yes")
)
View(data)
# 100 draws from a normal distribution with mean 3 and standard deviation 4
y <- rnorm(100, mean = 3, sd = 4)
# Standardize a numeric vector: subtract its mean and divide by its standard
# deviation, ignoring NAs when computing both statistics.
#   x    numeric vector
#   ...  accepted but not used
# Returns a vector the same length as x.
normalize <- function(x, ...) {
  centered <- x - mean(x, na.rm = TRUE)
  centered / sd(x, na.rm = TRUE)
}
normalize(y)
# Regular expressions ####
x <- sample(c(" test ", "test"), size = 1000, replace = TRUE)
table(x)
x <- gsub("^\\s+|\\s+$", "", x) # trailing/leading space
x <- sample(c("??test", "!>test", "test&*"), size = 1000, replace = TRUE)
# One pattern/replacement pair per row, applied to x from top to bottom.
cleanup_steps <- rbind(
  c("[[:punct:]]", ""),  # drop punctuation
  c(" ", "."),           # spaces become dots
  c("^[[:digit:]]", ""), # drop one leading digit
  c(" ", "."),           # spaces become dots
  c("\\/", "."),         # slashes become dots
  c("\\,", ""),          # drop commas
  c("\\?", "")           # drop question marks
)
for (step in seq_len(nrow(cleanup_steps))) {
  x <- gsub(cleanup_steps[step, 1], cleanup_steps[step, 2], x)
}
as.character(gsub("^\\s+|\\s+$", "", x)) # lead, trailing white space
# Turn the strings into syntactically valid, de-duplicated names
x <- make.names(x, unique = TRUE)
x <- c("9as", "0bn")
sub("^([0-9])(.+)", "\\2\\1", x) # Move digit from front to end
x <- c("bob45", "bob56", "sarah67")
# Overwrite every element that contains "bob"
x <- replace(x, grepl("bob", x), "John")
x
# Two steps in one expression:
#   inner sub:  capture the first run of digits (\\d+) as \\1 and put a . in
#               front of what was captured
#   outer gsub: remove a . left at the very end of the string
x <- gsub("\\.$", "", sub("(\\d+)", "\\.\\1", x))
# NOTE(review): the snippets below all operate on data$x, but no column named
# x is created on data anywhere in the visible script - they look like generic
# recipes to adapt; confirm the column exists before running them.
#
# Keep only the digits from a string: gregexpr() locates every run of digits
# in each element, regmatches() pulls those runs out, unlist() flattens them
# into one vector and as.numeric() converts them to numbers.
gregexpr("[[:digit:]]+", data$x) %>%
regmatches(data$x, .) %>%
unlist() %>%
as.numeric()
# Overwrite with "X" every value in the column that contains a lowercase "x"
data$x[grepl("x", data$x)] <- "X"
# Cut each value off after its first digit. The pattern is not anchored at
# the start, so any text before that first digit is kept as well.
data$x <- sub("([0-9]{1}).*", "\\1", data$x)
# Keep a match
# Keep everything after a given character. [^XXX] is a negated character
# class, so repeating X inside it changes nothing: the pattern captures the
# trailing run of characters that are not "X", i.e. whatever follows the last
# "X". Presumably XXX is a placeholder for the delimiter - TODO confirm.
data$x <- regmatches(data$x, regexpr("([^XXX]*$)", data$x))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment