Skip to content

Instantly share code, notes, and snippets.

@sbalci
Forked from hadley/dplyr-summarise.R
Created February 23, 2020 06:32
Show Gist options
  • Save sbalci/854008972907fcc145068691a504066b to your computer and use it in GitHub Desktop.
Save sbalci/854008972907fcc145068691a504066b to your computer and use it in GitHub Desktop.
# What's the most natural way to express this code in base R?
library(dplyr, warn.conflicts = FALSE)
# Reference version: group the rows by cyl, then collapse each group to its
# mean disp and its row count.
mtcars %>%
  group_by(cyl) %>%
  summarise(
    mean = mean(disp),
    n = n()
  )
#> # A tibble: 3 x 3
#> cyl mean n
#> <dbl> <dbl> <int>
#> 1 4 105. 11
#> 2 6 183. 7
#> 3 8 353. 14
# tapply() ----------------------------------------------------------------
# One tapply() call per summary column; the group key column is rebuilt
# separately from the sorted distinct cyl values. Evaluated inside mtcars so
# the columns can be referenced by bare name.
with(
  mtcars,
  data.frame(
    cyl = sort(unique(cyl)),
    mean = tapply(disp, cyl, mean),
    n = tapply(disp, cyl, length)
  )
)
#> cyl mean n
#> 4 4 105.1364 11
#> 6 6 183.3143 7
#> 8 8 353.1000 14
# - hard to generalise to more than one group because tapply() will
# return an array
# - is `sort(unique(mtcars$cyl))` guaranteed to be in the same order as
# the tapply() output?
# aggregate() -------------------------------------------------------------
# Run aggregate() once per summary statistic. aggregate() keeps the name of
# the summarised column ("disp"), so each result is given its final column
# names as it is built; the two tables are then joined on the group key.
df_mean <- setNames(
  aggregate(mtcars["disp"], by = mtcars["cyl"], FUN = mean),
  c("cyl", "mean")
)
df_length <- setNames(
  aggregate(mtcars["disp"], by = mtcars["cyl"], FUN = length),
  c("cyl", "n")
)
merge(df_mean, df_length, by = "cyl")
#> cyl mean n
#> 1 4 105.1364 11
#> 2 6 183.3143 7
#> 3 8 353.1000 14
# + generalises in a straightforward way to multiple grouping variables and
# multiple summary variables
# - need to manually rename summary variables
# Could also use formula interface
# https://twitter.com/tjmahr/status/1231255000766005248
# Formula form: summarised variable on the left of `~`, grouping variable on
# the right. The summary column keeps the name "disp" in both results.
df_mean <- aggregate(disp ~ cyl, data = mtcars, FUN = mean)
df_length <- aggregate(disp ~ cyl, data = mtcars, FUN = length)
# by() --------------------------------------------------------------------
# by() passes each cyl subset of mtcars to the function as a data frame; the
# function returns a one-row summary, and the rows are stacked with rbind().
mtcars_by <- by(
  data = mtcars,
  INDICES = mtcars$cyl,
  FUN = function(grp) {
    data.frame(cyl = grp$cyl[[1]], mean = mean(grp$disp), n = nrow(grp))
  }
)
do.call(rbind, mtcars_by)
#> cyl mean n
#> 4 4 105.1364 11
#> 6 6 183.3143 7
#> 8 8 353.1000 14
# + generalises easily to more/different summaries
# - need to know about anonymous functions + do.call + rbind
# by() = split() + lapply()
# Same idea spelled out by hand: split() cuts mtcars into one data frame per
# cyl value, lapply() reduces each piece to a one-row summary, and rbind()
# stacks the rows.
mtcars_by <- lapply(
  split(mtcars, mtcars$cyl),
  function(grp) {
    with(grp, data.frame(cyl = cyl[[1]], mean = mean(disp), n = nrow(grp)))
  }
)
do.call(rbind, mtcars_by)
#> cyl mean n
#> 4 4 105.1364 11
#> 6 6 183.3143 7
#> 8 8 353.1000 14
# Manual indexing approaches ------------------------------------------------
# from https://twitter.com/fartmiasma/status/1231258479865647105
# Loop over the distinct cyl values and build one numeric summary vector per
# group; the vectors become the columns of `tabl`, which is transposed into a
# data frame with one row per group.
#
# vapply() is used instead of sapply() so the result shape is fixed by
# FUN.VALUE: each call must return a length-3 double vector, and `tabl` is
# always a 3-row numeric matrix with rows "cyl", "mean", "n", whereas
# sapply() would return a list for zero-length input.
cyl_counts <- sort(unique(mtcars$cyl))
tabl <- vapply(
  cyl_counts,
  function(ct) {
    with(mtcars, c(cyl = ct, mean = mean(disp[cyl == ct]), n = sum(cyl == ct)))
  },
  FUN.VALUE = c(cyl = 0, mean = 0, n = 0)
)
as.data.frame(t(tabl))
#> cyl mean n
#> 1 4 105.1364 11
#> 2 6 183.3143 7
#> 3 8 353.1000 14
# - coerces all results (and grouping var) to common type
# Similar approach from
# https://gist.github.com/hadley/c430501804349d382ce90754936ab8ec#gistcomment-3185680
# For each distinct cyl value (cyl_counts, computed earlier in this script),
# flag the matching rows of mtcars and summarise them as a one-row data
# frame; the rows are then stacked with rbind().
s <- lapply(cyl_counts, function(ct) {
  in_group <- mtcars$cyl == ct
  data.frame(
    cyl = ct,
    mean = mean(mtcars$disp[in_group]),
    n = sum(in_group)
  )
})
do.call(rbind, s)
#> cyl mean n
#> 1 4 105.1364 11
#> 2 6 183.3143 7
#> 3 8 353.1000 14
# - harder to generalise to multiple grouping vars (need to use Map())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment