| # What's the most natural way to express this code in base R? | |
| library(dplyr, warn.conflicts = FALSE) | |
| # Reference result: mean displacement and row count per cylinder group. | |
| summarise(group_by(mtcars, cyl), mean = mean(disp), n = n()) | |
| #> # A tibble: 3 x 3 | |
| #> cyl mean n | |
| #> <dbl> <dbl> <int> | |
| #> 1 4 105. 11 | |
| #> 2 6 183. 7 | |
| #> 3 8 353. 14 | |
| # tapply() ---------------------------------------------------------------- | |
| # Evaluate inside mtcars so the columns can be referenced by bare name. | |
| with(mtcars, data.frame( | |
|   cyl = sort(unique(cyl)), | |
|   mean = tapply(disp, cyl, mean), | |
|   n = tapply(disp, cyl, length) | |
| )) | |
| #> cyl mean n | |
| #> 4 4 105.1364 11 | |
| #> 6 6 183.3143 7 | |
| #> 8 8 353.1000 14 | |
| # - hard to generalise to more than one group because tapply() will | |
| # return an array | |
| # - is `sort(unique(mtcars$cyl))` guaranteed to be in the same order as | |
| # the tapply() output? | |
| # aggregate() ------------------------------------------------------------- | |
| # One aggregate() call per summary; setNames() relabels the "disp" column | |
| # so the two results can be merged on "cyl" without a name clash. | |
| df_mean <- setNames( | |
|   aggregate(mtcars["disp"], mtcars["cyl"], mean), | |
|   c("cyl", "mean") | |
| ) | |
| df_length <- setNames( | |
|   aggregate(mtcars["disp"], mtcars["cyl"], length), | |
|   c("cyl", "n") | |
| ) | |
| merge(df_mean, df_length, by = "cyl") | |
| #> cyl mean n | |
| #> 1 4 105.1364 11 | |
| #> 2 6 183.3143 7 | |
| #> 3 8 353.1000 14 | |
| # + generalises in a straightforward way to multiple grouping variables and | |
| # multiple summary variables | |
| # - need to manually rename summary variables | |
| # Could also use formula interface | |
| # https://twitter.com/tjmahr/status/1231255000766005248 | |
| # Formula interface: summarise disp within each level of cyl. | |
| df_mean <- aggregate(disp ~ cyl, data = mtcars, FUN = mean) | |
| df_length <- aggregate(disp ~ cyl, data = mtcars, FUN = length) | |
| # by() -------------------------------------------------------------------- | |
| # by() hands each cyl subset of mtcars to the summary function, which | |
| # returns a one-row data frame; rbind() then stacks those rows. | |
| mtcars_by <- by( | |
|   data = mtcars, | |
|   INDICES = mtcars$cyl, | |
|   FUN = function(grp) { | |
|     data.frame(cyl = grp$cyl[[1]], mean = mean(grp$disp), n = nrow(grp)) | |
|   } | |
| ) | |
| do.call(rbind, mtcars_by) | |
| #> cyl mean n | |
| #> 4 4 105.1364 11 | |
| #> 6 6 183.3143 7 | |
| #> 8 8 353.1000 14 | |
| # + generalises easily to more/different summaries | |
| # - need to know about anonymous functions + do.call + rbind | |
| # by() = split() + lapply() | |
| # split() makes one data frame per cyl value; lapply() summarises each | |
| # piece into a one-row data frame, and rbind() stacks the rows. | |
| mtcars_by <- lapply( | |
|   X = split(mtcars, mtcars$cyl), | |
|   FUN = function(grp) { | |
|     data.frame(cyl = grp$cyl[[1]], mean = mean(grp$disp), n = nrow(grp)) | |
|   } | |
| ) | |
| do.call(rbind, mtcars_by) | |
| #> cyl mean n | |
| #> 4 4 105.1364 11 | |
| #> 6 6 183.3143 7 | |
| #> 8 8 353.1000 14 | |
| # Manual indexing approaches ------------------------------------------------ | |
| # from https://twitter.com/fartmiasma/status/1231258479865647105 | |
| # Distinct cylinder values, sorted so the rows come out in ascending order. | |
| cyl_counts <- sort(unique(mtcars$cyl)) | |
| # vapply() pins each result to a length-3 numeric vector, so `tabl` is | |
| # always a 3-row numeric matrix (sapply() changes shape with its input). | |
| tabl <- vapply(cyl_counts, function(ct) { | |
|   with(mtcars, c(cyl = ct, mean = mean(disp[cyl == ct]), n = sum(cyl == ct))) | |
| }, FUN.VALUE = numeric(3)) | |
| # One column per group, so transpose to make each group a row. | |
| as.data.frame(t(tabl)) | |
| #> cyl mean n | |
| #> 1 4 105.1364 11 | |
| #> 2 6 183.3143 7 | |
| #> 3 8 353.1000 14 | |
| # - coerces all results (and grouping var) to common type | |
| # Similar approach from | |
| # https://gist.github.com/hadley/c430501804349d382ce90754936ab8ec#gistcomment-3185680 | |
| # For each cylinder value, build a one-row data frame, then stack the rows. | |
| s <- lapply(cyl_counts, function(value) { | |
|   in_group <- mtcars$cyl == value | |
|   data.frame( | |
|     cyl = value, | |
|     mean = mean(mtcars$disp[in_group]), | |
|     n = sum(in_group) | |
|   ) | |
| }) | |
| do.call(rbind, s) | |
| #> cyl mean n | |
| #> 1 4 105.1364 11 | |
| #> 2 6 183.3143 7 | |
| #> 3 8 353.1000 14 | |
| # - harder to generalise to multiple grouping vars (need to use Map()) |
This comment has been minimized.
This comment has been minimized.
|
@llrs added your approach — thanks! How would you use a for loop here? |
This comment has been minimized.
This comment has been minimized.
|
Like this if you want to be memory efficient:
|
This comment has been minimized.
This comment has been minimized.
|
The aggregate(disp ~ cyl,
mtcars,
function(x) c(mean = mean(x), n = length(x)))
#> cyl disp.mean disp.n
#> 1 4 105.1364 11.0000
#> 2 6 183.3143 7.0000
#> 3 8 353.1000 14.0000
+ It's much less verbose than the original aggregate approach from above and easier to generalize than the twitter approach with separate calls. To remedy the last point, we could wrap the call in `with()`:
with(aggregate(disp ~ cyl,
mtcars,
function(x) c(mean = mean(x), n = length(x))),
as.data.frame(cbind(cyl, disp)))
#> cyl mean n
#>1 4 105.1364 11
#>2 6 183.3143 7
#>3 8 353.1000 14 |
This comment has been minimized.
This comment has been minimized.
|
Just after I left the university, I would probably have written something like that :
Yes it might be shameful but you said : "How would you use a for loop here?" so here I am... |
This comment has been minimized.
This comment has been minimized.
|
I learned Base R mostly after the fact, but here's how I was taught to do it, FWIW: write a function as if there were only one group, and make sure it returns the answer in the format you want. Then apply it to all the groups. Can replace some of the below with
|
This comment has been minimized.
This comment has been minimized.
|
|
This comment has been minimized.
This comment has been minimized.
|
imo this is the clearest base
|
This comment has been minimized.
The second example doesn't return the same result as the other solutions, you used mpg instead of disp column for the mean.
I would use this or make a
`for` loop to avoid the final call to `rbind` and to create a new data.frame for each case.