Created
February 8, 2015 10:10
-
-
Save Emaasit/56e9dbb48bc7c4382478 to your computer and use it in GitHub Desktop.
data manipulation using dplyr
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(tidyr) | |
library(dplyr) | |
library(Lahman) | |
### Some extended examples re dplyr using the Lahman baseball database | |
### Problem 1: | |
### Get the season batting statistics for the 2013 Los Angeles Dodgers, | |
### including slugging percentage, on-base percentage and their sum OPS. | |
### Restrict to players with at least 50 official at bats and sort by | |
### descending OPS. | |
batting <- tbl_df(Batting) %>% | |
select(playerID, yearID, teamID, lgID, G, AB:SB, BB:SF) | |
player_names <- tbl_df(Master) %>% | |
select(playerID, nameFirst, nameLast, bats, throws) | |
LAD13 <- batting %>% | |
filter(yearID == 2013, teamID == "LAN", AB > 50) %>% | |
mutate(BA = round(H/AB, 3), | |
TB = H + X2B + 2 * X3B + 3 * HR, | |
SLG = round(TB/AB, 3), | |
OBP = round((H + BB + HBP)/(AB + BB + HBP), 3), | |
OPS = SLG + OBP) %>% | |
select(-TB) | |
LAD13btg <- inner_join(player_names, LAD13) %>% | |
select(-throws, -contains("ID")) %>% | |
arrange(desc(OPS)) | |
### Problem 2: | |
### Get the season pitching statistics for the 2012 Colorado Rockies, | |
### sorted by ERA. | |
pitching <- tbl_df(Pitching) %>% | |
select(playerID, yearID, teamID:GS, SV:SO, ERA:HBP) | |
COL12 <- pitching %>% | |
filter(yearID == 2012, teamID == "COL") | |
COL12ptch <- inner_join(player_names, COL12) %>% | |
select(-bats, -contains("ID")) %>% | |
arrange(ERA) | |
### Problem 3: | |
### Season attendance per game by team from 2006-2010, National League, | |
### sorted by team abbreviation | |
teams <- tbl_df(Teams) %>% | |
select(teamID, yearID, lgID, Ghome, attendance) | |
NLattend <- teams %>% | |
filter(yearID %in% 2006:2010, lgID == "NL") %>% | |
mutate(teamID = factor(teamID), | |
APG = round(attendance/Ghome)) %>% | |
select(-lgID, -Ghome, -attendance) | |
## Use the spread() function to show the table in 'wide' form | |
NLattend %>% spread(yearID, APG) | |
## Plot these over time | |
library(ggplot2) | |
ggplot(NLattend, aes(x = yearID, y = APG, color = teamID)) + | |
# theme_bw() + | |
geom_line(size = 1) + | |
scale_color_manual(values = rainbow(16)) + | |
theme(panel.background = element_rect(fill = "black"), | |
panel.grid.major = element_line(colour = "grey60"), | |
panel.grid.minor = element_line(colour = "grey50"), | |
plot.background = element_rect(fill = "grey90"), | |
legend.key = element_rect(fill = "black")) + | |
labs(x = "Year", y = "Average attendance per game", | |
color = "Team ID") | |
### | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Packages to load | |
# Install first if necessary | |
# Uncomment and edit as necessary | |
# install.packages(c("Lahman", "ggplot2", "plyr", "reshape2", | |
"data.table", "tidyr")) | |
# Download and install latest version of dplyr | |
# On Windows and Macs, you need to install from source at present | |
# On Windows, this means you need to have Rtools3.1 | |
# installed from CRAN before installing dplyr. Should be OK on MacOS. | |
# The installation takes a couple of minutes since C++ files | |
# must be compiled. | |
install.packages("dplyr", type = "source") | |
# Load packages | |
library(ggplot2) | |
library(Lahman) | |
library(data.table) | |
library(tidyr) | |
library(dplyr) | |
# Toy data frame for basic illustrations | |
DF <- data.frame(gp = gl(3, 4, labels = LETTERS[1:3]), | |
x = sample(seq(12)), | |
y = rnorm(12)) | |
## Examples of how to use the tbl_*() functions | |
DT <- data.table(DF, key = "gp") | |
dft <- tbl_df(DF) # data frame to dplyr tbl | |
dtt <- tbl_dt(DT) # data table to dplyr tbl | |
str(dft) | |
str(dtt) | |
rm(DT, dtt) # won't use data.table again | |
detach(package:data.table, unload = TRUE) | |
################### | |
## One-table verbs | |
################### | |
# arrange(): sorting | |
arrange(dft, gp, y) | |
dft %>% arrange(gp, y) # data pipeline version | |
dft %>% arrange(gp, desc(x)) # reverse sort by x w/i gp | |
# filter(): row subsetting | |
dft %>% filter(gp == "A") | |
dft %>% filter(gp %in% c("A", "B") & y > 0) | |
# count(): frequency tabulation | |
dft %>% count(gp) | |
dft %>% count(gp, y > 0) | |
# tally(): sums numeric vars | |
dft %>% group_by(gp) %>% tally(y) # sum of y by gp | |
# select(): selects columns to retain | |
dft %>% select(-x) # remove column named x | |
dft %>% select(starts_with("g")) # returns gp | |
### Better idea of how the function works: | |
example(dplyr::select) | |
# summarise(): one-number summar[y/ies] of a variable | |
dft %>% group_by(gp) %>% summarise(mean_y = mean(y), sd_y = sd(y), n = n()) | |
# mutate(): transforms or creates variables within a tbl object | |
dft %>% | |
group_by(gp) %>% | |
mutate(abs_y = abs(y), norm_y = scale(y, scale = TRUE), | |
z = x + 2 * y) | |
# transmute(): like mutate() except that it drops unused existing variables | |
# New in version 0.3 | |
dft %>% mutate(abs_y = abs(y), z = 1 + y/2) | |
dft %>% transmute(abs_y = abs(y), z = 1 + y/2) | |
### transmute() can also operate groupwise | |
dft %>% | |
group_by(gp) %>% | |
transmute(norm_y = scale(y, scale = TRUE)) | |
# slice(): row indexing operator in dplyr using vector positions | |
# rather than logical expressions | |
dft %>% slice(1:5) | |
dft %>% slice(c(2, 5, 7, 10)) | |
dft %>% group_by(gp) %>% slice(n()) | |
dft %>% group_by(gp) %>% slice(1) | |
# rename(): renames variables | |
rename(dft, xx = x, yy = y) | |
dft %>% rename(xx = x) | |
################## | |
# Two-table verbs | |
################## | |
t1 <- data.frame(name = c("Tom", "Rick", "Harriet", "Ralph", | |
"Noriko", "Tyrone", "Ingrid"), | |
position = c("VP", "AVP", "VP", "AVP", | |
"CFO", "CEO", "IT")) | |
t2 <- data.frame(name = c("Tom", "Harriet", "Ralph", "Noriko", | |
"Ingrid"), | |
company = c("A", "A", "B", "B", "A")) | |
T1 <- tbl_df(t1) | |
T2 <- tbl_df(t2) | |
rm(t1, t2) | |
inner_join(T1, T2) | |
left_join(T1, T2) | |
left_join(T2, T1) # right join | |
semi_join(T1, T2) | |
anti_join(T1, T2) | |
################# | |
# do() function | |
################# | |
# Used to apply a non-verb function groupwise to a tbl object | |
# Particularly useful for model-fitting functions | |
teams <- tbl_df(Lahman::Teams) | |
teams1013 <- teams %>% | |
filter(yearID >= 2010) %>% | |
select(yearID, H, R, lgID) | |
# Reset the levels of lgID | |
teams1013 <- teams1013 %>% mutate(lgID = factor(lgID)) | |
# Model runs scored vs. hits by team within season | |
mod1 <- teams1013 %>% | |
group_by(yearID, lgID) %>% | |
do(mod = lm(R ~ H, data = .)) | |
# Some useful things to extract from this object | |
class(mod1) | |
length(mod1) | |
sapply(mod1, class) | |
mod1 | |
# Utility functions to extract R^2 and the model coefficients | |
r2 <- function(m) summary(m)$r.squared | |
coef_df <- function(m) | |
{ | |
sc <- coef(m) | |
names(sc) <- c("Intercept", "Slope") | |
data.frame(as.list(sc)) | |
} | |
## In this case, do() is working row-wise on mod1 because the | |
## mod component is a list, which is why we use [1] for the first | |
## two variables | |
mod1 %>% do(data.frame(year = .$yearID[1], league = .$lgID[1], | |
coef_df(.$mod), rsq = r2(.$mod))) | |
## Another approach, appending the results of do() to the first | |
## two columns of mod1 | |
data.frame(mod1[, 1:2], | |
mod1 %>% do(data.frame(coef_df(.$mod), | |
rsq = r2(.$mod)))) | |
# Alternative approach: all in one | |
summfun <- function(d) | |
{ | |
rsq <- do.call(c, lapply(d$mod, r2)) | |
coeff <- do.call(rbind, lapply(d$mod, coef)) | |
dd <- data.frame(d$yearID, d$lgID, coeff, rsq) | |
names(dd) <- c("year", "league", "intercept", "slope", "rsq") | |
dd | |
} | |
summfun(mod1) | |
############## | |
## tidyr | |
############## | |
# parallels dplyr as reshape2 parallels plyr | |
### gather() | |
### stacks multiple columns into two | |
m1 <- matrix(rnorm(20), ncol = 5, | |
dimnames = list(NULL, paste0("x", 1:5))) | |
m2 <- matrix(rnorm(20), ncol = 5, | |
dimnames = list(NULL, paste0("y", 1:5))) | |
DF1 <- data.frame(g = gl(2, 2, labels = LETTERS[1:2]), | |
m1, m2) | |
names(DF1) | |
dft1 <- tbl_df(DF1) | |
dft2 <- dft1 %>% | |
gather(xvar, xvalue, x1:x5) %>% | |
gather(yvar, yvalue, y1:y5) | |
str(dft2) | |
### spread() is the inverse function of gather(): | |
### unstacks two columns into several | |
dft2 %>% spread(xvar, xvalue) | |
### separate() splits a variable name into two new ones | |
DF2 <- data.frame(smin = c(2, 4, 3), smax = c(3, 6, 7), | |
tmin = c(3, 0, 4), tmax = c(5, 1, 8)) | |
DFT2 <- tbl_df(DF2) | |
( DFT3 <- DFT2 %>% gather(var, value, smin:tmax) %>% | |
separate(var, c("type", "stat"), 1) ) | |
### unite() is the inverse function of separate | |
DFT3 %>% unite(type_stat, type, stat) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment