Skip to content

Instantly share code, notes, and snippets.

@DomBennett
Last active October 22, 2018 09:57
Show Gist options
  • Save DomBennett/c5cc77c448f0b9bc5bbea85d2999dcae to your computer and use it in GitHub Desktop.
Save DomBennett/c5cc77c448f0b9bc5bbea85d2999dcae to your computer and use it in GitHub Desktop.
Software club: Intro to the tidyverse
# http://genomicsclass.github.io/book/pages/dplyr_tutorial.html
# Libs ----
library(dplyr)
# Data ----
# Remote location of the msleep CSV and the local file name it is cached under
url <- "https://raw.githubusercontent.com/genomicsclass/dagdata/master/inst/extdata/msleep_ggplot2.csv"
filename <- "msleep_ggplot2.csv"
# Download only once; later runs reuse the local copy
if (!file.exists(filename)) {
  downloader::download(url, filename)
}
# forget head! Just make sure it is a "tibble"
# (read from `filename` so the path is defined in exactly one place)
msleep <- as_tibble(read.csv(filename))
(msleep)
# Select ----
# Pick columns by name ...
select(.data = msleep, name, sleep_total)
# ... or with a helper: every column whose name starts with "sl"
select(.data = msleep, starts_with("sl"))
# Other selection helpers:
#   ends_with() = Select columns that end with a character string
#   contains()  = Select columns that contain a character string
#   matches()   = Select columns that match a regular expression
#   one_of()    = Select columns names that are from a group of names
# Filter ----
# Keep the rows for which the condition holds
filter(.data = msleep, sleep_total >= 16)
# Several conditions separated by commas must all hold
filter(
  .data = msleep,
  sleep_total >= 16,
  bodywt >= 1
)
# Membership test against a set of values
filter(.data = msleep, order %in% c("Perissodactyla", "Primates"))
# Arrange ----
# Sort the rows by the `order` column
arrange(.data = msleep, order)
# Mutate ----
# Add a column computed from existing ones
mutate(.data = msleep, rem_proportion = sleep_rem / sleep_total)
# Several new columns can be created in a single call
mutate(
  .data = msleep,
  rem_proportion = sleep_rem / sleep_total,
  bodywt_grams = bodywt * 1000
)
# Summarise ----
# Collapse the whole table into one row of summary statistics
summarise(
  .data = msleep,
  avg_sleep = mean(sleep_total),
  min_sleep = min(sleep_total),
  max_sleep = max(sleep_total),
  total = n()
)
# Group_by and intro to pipe ----
# x %>% f(y) is f(x, y): the same summary, now computed once per `order`
msleep %>%
  group_by(order) %>%
  summarise(
    avg_sleep = mean(sleep_total),
    min_sleep = min(sleep_total),
    max_sleep = max(sleep_total),
    total = n()
  )
# copied from https://jennybc.github.io/purrr-tutorial/ls01_map-name-position-shortcuts.html
# Libs
library(purrr)
library(repurrrsive)
# Recap
# for loops are slow!
# base R has in-built vectorisation methods, apply, tapply, sapply...
# but these can be a bit piggy to use
# purrr is a new approach (yes, another one) to vectorisation
# The data
# Inspect the example object: its class(es), its length and its first element
is(got_chars)
length(got_chars)
got_chars[[1]]
# The map function
# map(YOUR_LIST, YOUR_FUNCTION) calls the function on every element and
# always returns a list
map(got_chars[1:4], length)
map(got_chars[1:4], function(x) x[["name"]])
# shortcut: a string stands for function(x) x[["TEXT"]]
map(got_chars[1:4], "name")
# shortcut: an integer stands for function(x) x[[i]]
map(got_chars[1:4], 3)
# For loop equivalent
# Preallocate with the type that will be stored (character), rather than
# relying on a logical NA vector being silently coerced on first assignment
res <- character(4)
for (i in seq_along(res)) {
  res[i] <- got_chars[[i]][["name"]]
}
# Combining with pipe
# map_int() returns an integer vector instead of a list
map_int(got_chars[1:4], function(x) nchar(x[["name"]]))
# map_chr() returns a character vector, so nchar() receives the type it
# expects; a plain map() would hand it a list that is coerced element by
# element and miscounts any entry holding more than one string
map_chr(got_chars[1:4], "name") %>% nchar()
# Example with trees and intro to map2!
library(ape)
# 100 random trees of 100 tips each; the index passed by map() is not used
trees <- map(1:100, ~ compute.brlen(rtree(100)))
# For every tree, draw 10 of its tip labels at random
spp <- map(trees, function(tree) sample(x = tree$tip.label, size = 10))
# map2() walks over trees and spp in parallel: prune each tree down to its
# sampled tips, then sum the remaining branch lengths (one number per tree;
# presumably the phylogenetic diversity of each sample)
pds <- trees %>%
  map2(spp, function(tree, keep) {
    drop.tip(tree, tip = setdiff(tree$tip.label, keep))
  }) %>%
  map_dbl(function(pruned) sum(pruned$edge.length))
# based on: https://blog.rstudio.com/2014/07/22/introducing-tidyr/
library(tidyr)
# 1. separate what is observed from what varies per observation
# Build a small messy table: one row per person, one column per drug (a, b)
messy <- data.frame(
  name = c("Wilbur", "Petunia", "Gregory"),
  a    = c(67, 80, 64),
  b    = c(56, 90, 50)
)
# It is messy because each observation is spread over both a row and a column
messy
# gather() stacks columns a and b: their names go into `drug`,
# their values into `heartrate`
gather(messy, key = "drug", value = "heartrate", a, b)
# Now what was observed (person and drug) is kept apart from
# the measured variable (heartrate)
# 2. more complex scenario
# Fix the seed so the random treatment assignment and timings are reproducible
set.seed(10)
messy <- data.frame(
  id = 1:4,
  trt = sample(rep(c("control", "treatment"), each = 2)),
  work.T1 = runif(4),
  home.T1 = runif(4),
  work.T2 = runif(4),
  home.T2 = runif(4)
)
# this is messy data because the timings (the actual values of interest)
# are spread across multiple columns
(messy)
# we can reformat this data frame by creating a new time column:
# every column except id and trt is stacked, its name going into `key`
# and its value into `time`
(tidier <- gather(data = messy, key = key, value = time, -id, -trt))
# functionally equivalent to ....
(tidier <- gather(
  data = messy, key = key, value = time,
  work.T1, home.T1, work.T2, home.T2
))
# split `key` ("work.T1", ...) at the dot into where and when it was measured.
# The second piece is called `period` rather than `time`, because `time` is
# already the value column created by gather() and reusing the name would
# make the new column clash with the measurements
(separate(
  data = tidier, col = key,
  into = c("location", "period"), sep = "\\."
))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment