# DomBennett/dplyr.R

## dplyr.R
# http://genomicsclass.github.io/book/pages/dplyr_tutorial.html
# Libs ----
library(dplyr)

# Data ----
# Download the msleep data set once and reuse the local copy afterwards.
url <- "https://raw.githubusercontent.com/genomicsclass/dagdata/master/inst/extdata/msleep_ggplot2.csv"
filename <- "msleep_ggplot2.csv"
if (!file.exists(filename)) {
  downloader::download(url, filename)
}
# Forget head()! Just make sure it is a "tibble".
# Read via `filename` so the local path is defined in exactly one place.
msleep <- as_tibble(read.csv(filename))
(msleep)

# Select ----
# Pick columns by name, or with a selection helper.
select(msleep, name, sleep_total)
select(msleep, starts_with("sl"))
# Other selection helpers:
# ends_with() = select columns that end with a character string
# contains() = select columns that contain a character string
# matches() = select columns that match a regular expression
# one_of() = select columns whose names come from a group of names

# Filter ----
# Keep only the rows that satisfy every condition given.
filter(msleep, sleep_total >= 16)
filter(msleep, sleep_total >= 16, bodywt >= 1)
filter(msleep, order %in% c("Perissodactyla", "Primates"))

# Arrange ----
# Sort the rows by the values of the `order` column.
arrange(msleep, order)

# Mutate ----
# Add new columns computed from existing ones.
mutate(msleep, rem_proportion = sleep_rem / sleep_total)
mutate(msleep,
       rem_proportion = sleep_rem / sleep_total,
       bodywt_grams = bodywt * 1000)

# Summarise ----
# Collapse the whole table into a single row of summary statistics.
summarise(msleep,
          avg_sleep = mean(sleep_total),
          min_sleep = min(sleep_total),
          max_sleep = max(sleep_total),
          total = n())

# Group_by and intro to pipe ----
# Same summary as above, but computed once per taxonomic order; `%>%` passes
# the grouped table on as the first argument of summarise().
group_by(msleep, order) %>%
  summarise(avg_sleep = mean(sleep_total),
            min_sleep = min(sleep_total),
            max_sleep = max(sleep_total),
            total = n())

## purrr.R
# copied from https://jennybc.github.io/purrr-tutorial/ls01_map-name-position-shortcuts.html
# Libs
library(purrr)
library(repurrrsive)

# Recap
# - explicit for loops are slow!
# - base R ships its own vectorisation helpers: apply, tapply, sapply, ...
# - ... but these can be a bit awkward to use
# - purrr is a newer (yes, yet another) approach to vectorisation

# The data
# got_chars is not defined in this script (presumably supplied by repurrrsive);
# look at its class, its length and its first element.
is(got_chars)
got_chars %>% length()
got_chars[[1L]]

# The map function
# Pattern: map(YOUR_LIST, YOUR_FUNCTION)
got_chars[seq_len(4)] %>% map(length)
got_chars[seq_len(4)] %>% map(function(char) char[["name"]])
# shortcut: a string stands for function(x) x[["TEXT"]]
got_chars[seq_len(4)] %>% map("name")
# shortcut: a number stands for function(x) x[[i]]
got_chars[seq_len(4)] %>% map(3)

# For loop equivalent
# Pre-allocate the result, then fill it in one element at a time.
res <- rep(NA, times = 4)
for (i in seq_len(4)) {
  res[i] <- got_chars[[i]][["name"]]
}

# Combining with pipe
# Either count the characters inside the mapped function ...
map_int(got_chars[seq_len(4)], ~ nchar(.x[["name"]]))
# ... or extract the names first and pipe the result into nchar().
got_chars[seq_len(4)] %>%
  map("name") %>%
  nchar()

# Example with trees and intro to map2!
library(ape)
# 100 random trees with 100 tips each, with branch lengths computed by ape
trees <- map(seq_len(100), ~ compute.brlen(rtree(100)))
# for every tree, draw 10 of its tip labels at random
spp <- map(trees, function(tree) sample(x = tree$tip.label, size = 10))
# map2() walks over trees and spp in parallel: drop every tip that was not
# sampled, then sum the branch lengths that remain in each pruned tree
pds <- map2(
  .x = trees,
  .y = spp,
  .f = function(tree, sampled) {
    to_drop <- tree$tip.label[!(tree$tip.label %in% sampled)]
    drop.tip(tree, tip = to_drop)
  }
) %>%
  map_dbl(function(pruned) sum(pruned$edge.length))

## tidyr.R
# based on: https://blog.rstudio.com/2014/07/22/introducing-tidyr/
library(tidyr)

# 1. separate what is observed from what varies per observation
# generate messy data
messy <- data.frame(
  name = c("Wilbur", "Petunia", "Gregory"),
  a = c(67, 80, 64),
  b = c(56, 90, 50)
)
# messy because the observations are split across both columns and rows
(messy)
# we can use the function gather() to tidy the data
# NOTE: newer tidyr versions supersede gather() with pivot_longer(); gather()
# is kept here to follow the tutorial this script is based on
(gather(data = messy, key = drug, value = heartrate, a:b))
# now the observations (person and drug) are separated from the variable
# (heartrate)

# 2. more complex scenario
# fix the seed so that sample() and runif() give the same table on every run
set.seed(10)
messy <- data.frame(
  id = 1:4,
  trt = sample(rep(c("control", "treatment"), each = 2)),
  work.T1 = runif(4),
  home.T1 = runif(4),
  work.T2 = runif(4),
  home.T2 = runif(4)
)
# this is messy data because the timings (the actual values of interest)
# are spread across multiple columns
(messy)
# we can reformat this data frame by gathering the timings into one `time`
# column, with the old column names stored in `key`
(tidier <- gather(data = messy, key = key, value = time, -id, -trt))
# functionally equivalent to ....
(tidier <- gather(data = messy, key = key, value = time, work.T1, home.T1,
                  work.T2, home.T2))
# split keys such as "work.T1" at the dot into a location and a time point;
# the second piece is called `timepoint` because `time` already holds the
# measured values, and reusing that name would clash with that column
(separate(data = tidier, col = key, into = c("location", "timepoint"),
          sep = "\\."))
	# http://genomicsclass.github.io/book/pages/dplyr_tutorial.html
	# Libs ----
	library(dplyr)

	# Data ----
	# Download the msleep data set once and reuse the local copy afterwards.
	url <- "https://raw.githubusercontent.com/genomicsclass/dagdata/master/inst/extdata/msleep_ggplot2.csv"
	filename <- "msleep_ggplot2.csv"
	if (!file.exists(filename)) {
	  downloader::download(url, filename)
	}
	# Forget head()! Just make sure it is a "tibble".
	# Read via `filename` so the local path is defined in exactly one place.
	msleep <- as_tibble(read.csv(filename))
	(msleep)

	# Select ----
	# Pick columns by name, or with a selection helper.
	select(msleep, name, sleep_total)
	select(msleep, starts_with("sl"))
	# Other selection helpers:
	# ends_with() = select columns that end with a character string
	# contains() = select columns that contain a character string
	# matches() = select columns that match a regular expression
	# one_of() = select columns whose names come from a group of names

	# Filter ----
	# Keep only the rows that satisfy every condition given.
	filter(msleep, sleep_total >= 16)
	filter(msleep, sleep_total >= 16, bodywt >= 1)
	filter(msleep, order %in% c("Perissodactyla", "Primates"))

	# Arrange ----
	# Sort the rows by the values of the `order` column.
	arrange(msleep, order)

	# Mutate ----
	# Add new columns computed from existing ones.
	mutate(msleep, rem_proportion = sleep_rem / sleep_total)
	mutate(msleep,
	       rem_proportion = sleep_rem / sleep_total,
	       bodywt_grams = bodywt * 1000)

	# Summarise ----
	# Collapse the whole table into a single row of summary statistics.
	summarise(msleep,
	          avg_sleep = mean(sleep_total),
	          min_sleep = min(sleep_total),
	          max_sleep = max(sleep_total),
	          total = n())

	# Group_by and intro to pipe ----
	# Same summary as above, but computed once per taxonomic order; `%>%` passes
	# the grouped table on as the first argument of summarise().
	group_by(msleep, order) %>%
	  summarise(avg_sleep = mean(sleep_total),
	            min_sleep = min(sleep_total),
	            max_sleep = max(sleep_total),
	            total = n())
	# copied from https://jennybc.github.io/purrr-tutorial/ls01_map-name-position-shortcuts.html
	# Libs
	library(purrr)
	library(repurrrsive)

	# Recap
	# - explicit for loops are slow!
	# - base R ships its own vectorisation helpers: apply, tapply, sapply, ...
	# - ... but these can be a bit awkward to use
	# - purrr is a newer (yes, yet another) approach to vectorisation

	# The data
	# got_chars is not defined in this script (presumably supplied by
	# repurrrsive); look at its class, its length and its first element.
	is(got_chars)
	got_chars %>% length()
	got_chars[[1L]]

	# The map function
	# Pattern: map(YOUR_LIST, YOUR_FUNCTION)
	got_chars[seq_len(4)] %>% map(length)
	got_chars[seq_len(4)] %>% map(function(char) char[["name"]])
	# shortcut: a string stands for function(x) x[["TEXT"]]
	got_chars[seq_len(4)] %>% map("name")
	# shortcut: a number stands for function(x) x[[i]]
	got_chars[seq_len(4)] %>% map(3)

	# For loop equivalent
	# Pre-allocate the result, then fill it in one element at a time.
	res <- rep(NA, times = 4)
	for (i in seq_len(4)) {
	  res[i] <- got_chars[[i]][["name"]]
	}

	# Combining with pipe
	# Either count the characters inside the mapped function ...
	map_int(got_chars[seq_len(4)], ~ nchar(.x[["name"]]))
	# ... or extract the names first and pipe the result into nchar().
	got_chars[seq_len(4)] %>%
	  map("name") %>%
	  nchar()

	# Example with trees and intro to map2!
	library(ape)
	# 100 random trees with 100 tips each, with branch lengths computed by ape
	trees <- map(seq_len(100), ~ compute.brlen(rtree(100)))
	# for every tree, draw 10 of its tip labels at random
	spp <- map(trees, function(tree) sample(x = tree$tip.label, size = 10))
	# map2() walks over trees and spp in parallel: drop every tip that was not
	# sampled, then sum the branch lengths that remain in each pruned tree
	pds <- map2(
	  .x = trees,
	  .y = spp,
	  .f = function(tree, sampled) {
	    to_drop <- tree$tip.label[!(tree$tip.label %in% sampled)]
	    drop.tip(tree, tip = to_drop)
	  }
	) %>%
	  map_dbl(function(pruned) sum(pruned$edge.length))
	# based on: https://blog.rstudio.com/2014/07/22/introducing-tidyr/
	library(tidyr)

	# 1. separate what is observed from what varies per observation
	# generate messy data
	messy <- data.frame(
	  name = c("Wilbur", "Petunia", "Gregory"),
	  a = c(67, 80, 64),
	  b = c(56, 90, 50)
	)
	# messy because the observations are split across both columns and rows
	(messy)
	# we can use the function gather() to tidy the data
	# NOTE: newer tidyr versions supersede gather() with pivot_longer(); gather()
	# is kept here to follow the tutorial this script is based on
	(gather(data = messy, key = drug, value = heartrate, a:b))
	# now the observations (person and drug) are separated from the variable
	# (heartrate)

	# 2. more complex scenario
	# fix the seed so that sample() and runif() give the same table on every run
	set.seed(10)
	messy <- data.frame(
	  id = 1:4,
	  trt = sample(rep(c("control", "treatment"), each = 2)),
	  work.T1 = runif(4),
	  home.T1 = runif(4),
	  work.T2 = runif(4),
	  home.T2 = runif(4)
	)
	# this is messy data because the timings (the actual values of interest)
	# are spread across multiple columns
	(messy)
	# we can reformat this data frame by gathering the timings into one `time`
	# column, with the old column names stored in `key`
	(tidier <- gather(data = messy, key = key, value = time, -id, -trt))
	# functionally equivalent to ....
	(tidier <- gather(data = messy, key = key, value = time, work.T1, home.T1,
	                  work.T2, home.T2))
	# split keys such as "work.T1" at the dot into a location and a time point;
	# the second piece is called `timepoint` because `time` already holds the
	# measured values, and reusing that name would clash with that column
	(separate(data = tidier, col = key, into = c("location", "timepoint"),
	          sep = "\\."))