-
-
Save hadley/c430501804349d382ce90754936ab8ec to your computer and use it in GitHub Desktop.
# What's the most natural way to express this code in base R? | |
library(dplyr, warn.conflicts = FALSE) | |
mtcars %>% | |
group_by(cyl) %>% | |
summarise(mean = mean(disp), n = n()) | |
#> # A tibble: 3 x 3 | |
#> cyl mean n | |
#> <dbl> <dbl> <int> | |
#> 1 4 105. 11 | |
#> 2 6 183. 7 | |
#> 3 8 353. 14 | |
# tapply() ---------------------------------------------------------------- | |
data.frame( | |
cyl = sort(unique(mtcars$cyl)), | |
mean = tapply(mtcars$disp, mtcars$cyl, mean), | |
n = tapply(mtcars$disp, mtcars$cyl, length) | |
) | |
#> cyl mean n | |
#> 4 4 105.1364 11 | |
#> 6 6 183.3143 7 | |
#> 8 8 353.1000 14 | |
# - hard to generalise to more than one group because tapply() will | |
# return an array | |
# - is `sort(unique(mtcars$cyl))` guaranteed to be in the same order as | |
# the tapply() output? | |
# aggregate() ------------------------------------------------------------- | |
df_mean <- aggregate(mtcars["disp"], mtcars["cyl"], mean) | |
df_length <- aggregate(mtcars["disp"], mtcars["cyl"], length) | |
names(df_mean)[2] <- "mean" | |
names(df_length)[2] <- "n" | |
merge(df_mean, df_length, by = "cyl") | |
#> cyl mean n | |
#> 1 4 105.1364 11 | |
#> 2 6 183.3143 7 | |
#> 3 8 353.1000 14 | |
# + generalises in stratightforward to multiple grouping variables and | |
# multiple summary variables | |
# - need to manually rename summary variables | |
# Could also use formula interface | |
# https://twitter.com/tjmahr/status/1231255000766005248 | |
df_mean <- aggregate(disp ~ cyl, mtcars, mean) | |
df_length <- aggregate(disp ~ cyl, mtcars, length) | |
# by() -------------------------------------------------------------------- | |
mtcars_by <- by(mtcars, mtcars$cyl, function(df) { | |
data.frame(cyl = df$cyl[[1]], mean = mean(df$disp), n = nrow(df)) | |
}) | |
do.call(rbind, mtcars_by) | |
#> cyl mean n | |
#> 4 4 105.1364 11 | |
#> 6 6 183.3143 7 | |
#> 8 8 353.1000 14 | |
# + generalises easily to more/different summaries | |
# - need to know about anonymous functions + do.call + rbind | |
# by() = split() + lapply() | |
mtcars_by <- lapply(split(mtcars, mtcars$cyl), function(df) { | |
data.frame(cyl = df$cyl[[1]], mean = mean(df$disp), n = nrow(df)) | |
}) | |
do.call(rbind, mtcars_by) | |
#> cyl mean n | |
#> 4 4 105.1364 11 | |
#> 6 6 183.3143 7 | |
#> 8 8 353.1000 14 | |
# Manual indexing approahes ------------------------------------------------- | |
# from https://twitter.com/fartmiasma/status/1231258479865647105 | |
cyl_counts <- sort(unique(mtcars$cyl)) | |
tabl <- sapply(cyl_counts, function(ct) { | |
with(mtcars, c(cyl = ct, mean = mean(disp[cyl == ct]), n = sum(cyl == ct))) | |
}) | |
as.data.frame(t(tabl)) | |
#> cyl mean n | |
#> 1 4 105.1364 11 | |
#> 2 6 183.3143 7 | |
#> 3 8 353.1000 14 | |
# - coerces all results (and grouping var) to common type | |
# Similar approach from | |
# https://gist.github.com/hadley/c430501804349d382ce90754936ab8ec#gistcomment-3185680 | |
s <- lapply(cyl_counts, function(cyl) { | |
indx <- mtcars$cyl == cyl | |
data.frame(cyl = cyl, mean = mean(mtcars$disp[indx]), n = sum(indx)) | |
}) | |
do.call(rbind, s) | |
#> cyl mean n | |
#> 1 4 105.1364 11 | |
#> 2 6 183.3143 7 | |
#> 3 8 353.1000 14 | |
# - harder to generalise to multiple grouping vars (need to use Map()) |
@llrs added your approach — thanks! How would you use a for loop here?
Like this if you want to be memory efficient:
keys <- unique(mtcars$cyl)
n <- vector("numeric", length(keys))
m <- vector("numeric", length(keys))
for (x in seq_along(keys)) {
k <- mtcars$cyl == keys[x]
n[x] <- sum(k)
m[x] <- mean(mtcars$disp[k]) # if several columns it could be used inside an apply call.
}
data.frame(cyl = keys, mean = m, n = n)
The aggregate
approach could be optimized the following way:
aggregate(disp ~ cyl,
mtcars,
function(x) c(mean = mean(x), n = length(x)))
#> cyl disp.mean disp.n
#> 1 4 105.1364 11.0000
#> 2 6 183.3143 7.0000
#> 3 8 353.1000 14.0000
++ It's much less verbose than the original aggregate approach from above and easier to generalize than the twitter approach with separate calls with df <-
++ no need to adjust the naming of the variables
-- it will return all variables in the same format, that means <dbl>
as soon as there is one variable included that can't be coerced to integer.
-- The result is a data.frame
with two columns (cyl
and disp
) the latter is a matrix.
To remedy the last point, we could wrap the aggregate
in a with
call, but that would be again more verbose:
with(aggregate(disp ~ cyl,
mtcars,
function(x) c(mean = mean(x), n = length(x))),
as.data.frame(cbind(cyl, disp)))
#> cyl mean n
#>1 4 105.1364 11
#>2 6 183.3143 7
#>3 8 353.1000 14
Just after I left the university, I would probably have written something like that :
cyl_u = unique(mtcars$cyl)
res=c()
for(cyl in cyl_u){
keepit=mtcars$cyl==cyl
mean=mean(mtcars[keepit,"disp"])
n=sum(keepit)
res=rbind(res,
data.frame(cyl,mean,n))
}
res
Yes it might be shameful but you said : "How would you use a for loop here?" so here I am...
I learned Base R mostly after the fact, but here's how I was taught to do it, FWIW: write a function as if there were only one group, and make sure it returns the answer in the format you want. Then apply it to all the groups. Can replace some of the below with by
, or combine different parts, but just wanted to convey the thought process most importantly (do for one group first, make sure it works, then apply to all groups):
onecar = function(x) {
data.frame(mean_per_cyl = mean(x$disp),
n = nrow(x))
}
mtsplit = split(mtcars, mtcars$cyl) # could obviously move this step into the function
summ_cyls = do.call(rbind, lapply(mtsplit, onecar))
merge()
has an argument suffixes
which eliminates the need to manually rename the aggregated variables:
df_mean <- aggregate(mtcars["disp"], mtcars["cyl"], mean)
df_length <- aggregate(mtcars["disp"], mtcars["cyl"], length)
merge(df_mean, df_length, by = "cyl", suffixes = c("_mean", "_n"))
imo this is the clearest base R
approach to the problem (probably not the fastest though).
summariseByGroup <- function(groupData, numericData) {
group <- sort(unique(groupData))
nGroups <- length(group)
n <- vector('integer', nGroups)
mean <- vector('numeric', nGroups)
for(g in seq_along(group)) {
for(i in seq_along(numericData)) {
if(groupData[i] == group[g]) {
n[g] <- n[g] + 1
mean[g] <- (mean[g]*(n[g] - 1) + numericData[i])/n[g]
}
}
}
data.frame(group, mean, n)
}
summariseByGroup(mtcars$cyl, mtcars$disp)
Adding the link of the tweet where this was also discussed, so that we can remove it from the vignette, because CRAN url checks.
https://twitter.com/hadleywickham/status/1231252596712771585
dfs <- split(mtcars, mtcars$cyl) # ~ group_by
res <- t(sapply(dfs, function(df) c(mean(df$disp), length(df$disp)))) # ~ summarise
res <- as.data.frame(cbind(as.numeric(names(dfs)), res), # attach cyl values
row.names=1:nrow(res)) # name rows
colnames(res) <- c("cyl", "mean", "n") # name cols
The result:
#>res
# cyl mean n
#1 4 105.1364 11
#2 6 183.3143 7
#3 8 353.1000 14
@hadley In your examples, instead of do.call
one could also use Reduce
instead.
@hadley To your question in your tapply()
example - whether the order of tapply
's output is guaranteed to be the same like sort(unique(mtcars$cyl))
:
Initially, I thought "yes!", because tapply
uses split
from which it overtakes the output order. split
in turn uses factor
to sort the names of the groups, which in turn uses order()
to define the order of the levels. sort.default
uses in its code also order
for sorting. Thus, the output order of the groups must be identical, since tapply()
as well as sort(unique())
both use the same function order()
to order their output.
In case the names are integers, sort.int()
should always lead to the same output order ...
> split.default
function (x, f, drop = FALSE, sep = ".", lex.order = FALSE, ...)
{
if (!missing(...))
.NotYetUsed(deparse(...), error = FALSE)
if (is.list(f))
f <- interaction(f, drop = drop, sep = sep, lex.order = lex.order)
else if (!is.factor(f))
f <- as.factor(f)
else if (drop)
f <- factor(f)
storage.mode(f) <- "integer"
if (is.null(attr(x, "class")))
return(.Internal(split(x, f)))
ind <- .Internal(split(seq_along(x), f))
lapply(ind, function(i) x[i])
}
> factor
function (x = character(), levels, labels = levels, exclude = NA,
ordered = is.ordered(x), nmax = NA)
{
if (is.null(x))
x <- character()
nx <- names(x)
if (missing(levels)) {
y <- unique(x, nmax = nmax)
ind <- order(y)
levels <- unique(as.character(y)[ind])
}
force(ordered)
if (!is.character(x))
x <- as.character(x)
levels <- levels[is.na(match(levels, exclude))]
f <- match(x, levels)
if (!is.null(nx))
names(f) <- nx
if (missing(labels)) {
levels(f) <- as.character(levels)
}
else {
nlab <- length(labels)
if (nlab == length(levels)) {
nlevs <- unique(xlevs <- as.character(labels))
at <- attributes(f)
at$levels <- nlevs
f <- match(xlevs, nlevs)[f]
attributes(f) <- at
}
else if (nlab == 1L)
levels(f) <- paste0(labels, seq_along(levels))
else stop(gettextf("invalid 'labels'; length %d should be 1 or %d",
nlab, length(levels)), domain = NA)
}
class(f) <- c(if (ordered) "ordered", "factor")
f
}
> sort.default
function (x, decreasing = FALSE, na.last = NA, ...)
{
if (is.object(x))
x[order(x, na.last = na.last, decreasing = decreasing)]
else sort.int(x, na.last = na.last, decreasing = decreasing,
...)
}
But then I realized after all this - that if the cyl
column in mtcars
would not be numeric
but a factor
with customized levels
, the ordering methods might differ, and indeed it does:
# if the `cyl` column would be a factor - with a custom ordering of its levels
mtcars_ <- mtcars
mtcars_$cyl <- as.factor(mtcars_$cyl)
levels(mtcars_$cyl) <- c("8", "6", "4")
# then the ordering by `tapply()` differs from the ordering by `sort()`
data.frame(
cyl = sort(unique(mtcars_$cyl)),
mean = tapply(mtcars_$disp, mtcars_$cyl, mean),
n = tapply(mtcars_$disp, mtcars_$cyl, length)
)
# cyl mean n
# 8 8 105.1364 11
# 6 6 183.3143 7
# 4 4 353.1000 14
@gwangjinkim Reduce works pair wise, which is potential slower than providing all data frames to rbind at once (although I don't know if rbind actually takes advantage of that). And nice spotting with the order problem, thanks!
@hadley Thank you, too! I often used Reduce
(because I was coming from Lisp languages, where you have reduce
). do.call
is actually Lisp's apply
- while the apply
functions in R are more like a mapcar
in LIsp ... - so to use do.call
in such situations is then better! Good to know, thank you!
Thank you for the interesting question.
To me the most natural way to express the counts in a group variable in base R is to use table()
, but it coerces the grouped variable into a factor. Since I didn't see its use in the previous answers, here is my trial:
dat <- setNames(as.data.frame(table(mtcars$cyl)), c("cyl", "n"))
dat$cyl <- as.numeric(as.character(dat$cyl))
dat$mean <- sapply(dat$cyl, function(x) mean(with(mtcars, disp[cyl == x])))
dat <- dat[, c(1,3,2)]
dat
# cyl mean n
# 1 4 105.1364 11
# 2 6 183.3143 7
# 3 8 353.1000 14
Note a downside of the formula based aggregate syntax — it drops missing values like a modelling function:
aggregate(cbind(Ozone, Temp) ~ Month, data = airquality, length)
#> Month Ozone Temp
#> 1 5 26 26
#> 2 6 9 9
#> 3 7 26 26
#> 4 8 26 26
#> 5 9 29 29
aggregate(airquality[c("Ozone", "Temp")], airquality["Month"], length)
#> Month Ozone Temp
#> 1 5 31 31
#> 2 6 30 30
#> 3 7 31 31
#> 4 8 31 31
#> 5 9 30 30
Created on 2022-10-24 with reprex v2.0.2
The second example doesn't return the same result as the other solutions, you used mpg instead of disp column for the mean.
I would use this or make a
for
loop to avoid the final call torbind
and to create a new data.frame for each case.