Last active
October 22, 2018 09:57
-
-
Save DomBennett/c5cc77c448f0b9bc5bbea85d2999dcae to your computer and use it in GitHub Desktop.
Software club: Intro to the tidyverse
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Tutorial source: http://genomicsclass.github.io/book/pages/dplyr_tutorial.html
# Libs ----
library(dplyr)
# Data ----
# Remote location of the dataset and the name of the local copy.
filename <- "msleep_ggplot2.csv"
url <- "https://raw.githubusercontent.com/genomicsclass/dagdata/master/inst/extdata/msleep_ggplot2.csv"
# Only download when the local copy is not there yet.
if (!file.exists(filename)) {
  downloader::download(url, filename)
}
# Forget head()! Just make sure it is a "tibble".
msleep <- as_tibble(read.csv(filename))
(msleep)
# Select ----
# Pick columns by their bare names ...
select(.data = msleep, name, sleep_total)
# ... or through a selection helper.
select(.data = msleep, starts_with(match = "sl"))
# Other selection helpers:
#   ends_with() = Select columns that end with a character string
#   contains()  = Select columns that contain a character string
#   matches()   = Select columns that match a regular expression
#   one_of()    = Select column names that are from a group of names
# Filter ----
# Keep the rows for which the condition holds.
filter(.data = msleep, sleep_total >= 16)
# Several conditions can be given, separated by commas.
filter(
  .data = msleep,
  sleep_total >= 16,
  bodywt >= 1
)
# Match against a set of values.
filter(
  .data = msleep,
  order %in% c("Perissodactyla", "Primates")
)
# Arrange ----
# Sort the rows by the `order` column.
arrange(.data = msleep, order)
# Mutate ----
# Add a single derived column ...
mutate(
  .data = msleep,
  rem_proportion = sleep_rem / sleep_total
)
# ... or several derived columns in one call.
mutate(
  .data = msleep,
  rem_proportion = sleep_rem / sleep_total,
  bodywt_grams = bodywt * 1000
)
# Summarise ----
# Collapse the whole table into one row of summary statistics.
summarise(
  .data = msleep,
  avg_sleep = mean(sleep_total),
  min_sleep = min(sleep_total),
  max_sleep = max(sleep_total),
  total = n()
)
# Group_by and intro to pipe ----
# Same statistics as above, computed once per value of `order`.
msleep %>%
  group_by(order) %>%
  summarise(
    avg_sleep = mean(sleep_total),
    min_sleep = min(sleep_total),
    max_sleep = max(sleep_total),
    total = n()
  )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# copied from https://jennybc.github.io/purrr-tutorial/ls01_map-name-position-shortcuts.html
# Libs
library(purrr)
library(repurrrsive)
# Recap
# for loops are slow!
# base R has built-in vectorisation methods: apply, tapply, sapply...
# but these can be a bit piggy to use
# purrr is a new (yes, yet another new) approach to vectorisation
# The data
is(got_chars)
length(got_chars)
got_chars[[1]]
# The map function
# map(YOUR_LIST, YOUR_FUNCTION)
# Apply a named function to each of the first four elements.
map(.x = got_chars[1:4], .f = length)
# Apply an anonymous function that pulls out one element by name.
map(
  .x = got_chars[1:4],
  .f = function(char) char[["name"]]
)
# shortcut: function(x) x[["TEXT"]]
map(.x = got_chars[1:4], .f = "name")
# shortcut: function(x) x[[i]]
map(.x = got_chars[1:4], .f = 3)
# For loop equivalent
# Preallocate a character vector so the result has the right type from the
# start (rep(NA, 4) would create a logical vector that is only coerced to
# character on the first assignment), and take the loop range from the
# result vector instead of hard-coding it.
res <- character(4)
for (i in seq_along(res)) {
  res[i] <- got_chars[[i]][["name"]]
}
# Combining with pipe
# Count the characters of each name inside the mapped function ...
map_int(got_chars[1:4], function(x) nchar(x[["name"]]))
# ... or extract the names first and pipe them on to nchar(). map_chr()
# returns a character vector, so nchar() receives the type it is documented
# for rather than a list it has to coerce.
map_chr(got_chars[1:4], "name") %>% nchar()
# Example with trees and intro to map2!
library(ape)
# Build 100 random trees of 100 tips each and pass each through compute.brlen().
trees <- map(
  .x = seq_len(100),
  .f = function(i) compute.brlen(rtree(n = 100))
)
# For every tree, draw 10 of its tip labels at random.
spp <- map(
  .x = trees,
  .f = function(tree) sample(x = tree$tip.label, size = 10)
)
# Walk over trees and their sampled tips in parallel: drop every tip that was
# not sampled, then sum the edge lengths of what is left.
pds <- map2(
  .x = trees,
  .y = spp,
  .f = function(tree, sampled) {
    drop.tip(tree, tip = setdiff(tree$tip.label, sampled))
  }
) %>%
  map_dbl(.f = function(tree) sum(tree$edge.length))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# based on: https://blog.rstudio.com/2014/07/22/introducing-tidyr/
library(tidyr)
# 1. separate what is observed from what varies per observation
# generate messy data
messy <- data.frame(
  name = c("Wilbur", "Petunia", "Gregory"),
  a = c(67, 80, 64),
  b = c(56, 90, 50)
)
# messy because observations are split over column and row
(messy)
# we can use the function gather to tidy the data
(gather(data = messy, key = drug, value = heartrate, a:b))
# now the observations (person and drug) are separated from the variable
# (heartrate)
# 2. more complex scenario
set.seed(10)
messy <- data.frame(
  id = 1:4,
  trt = sample(rep(c("control", "treatment"), each = 2)),
  work.T1 = runif(4),
  home.T1 = runif(4),
  work.T2 = runif(4),
  home.T2 = runif(4)
)
# this is messy data because the timings (the actual values of interest)
# are spread across multiple columns
(messy)
# we can reformat this data frame by gathering the timings into one `time`
# column, with the former column names stored in `key`
(tidier <- gather(data = messy, key = key, value = time, -id, -trt))
# functionally equivalent to ....
(tidier <- gather(
  data = messy, key = key, value = time,
  work.T1, home.T1, work.T2, home.T2
))
# split `key` (e.g. "work.T1") at the dot into its two parts. The second part
# is named `timepoint`, not `time`, because `tidier` already has a `time`
# column holding the measured values and the names would clash.
(tidy_timings <- separate(
  data = tidier,
  col = key,
  into = c("location", "timepoint"),
  sep = "\\."
))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment