# hadley/dplyr-summarise.R

## dplyr-summarise.R
# What's the most natural way to express this code in base R?
library(dplyr, warn.conflicts = FALSE)
# Reference dplyr pipeline that the base R sections below try to reproduce:
# group mtcars by `cyl`, then compute the mean of `disp` and the number of
# rows in each group.
mtcars %>%
  group_by(cyl) %>%
  summarise(mean = mean(disp), n = n())
#> # A tibble: 3 x 3
#>     cyl  mean     n
#>   <dbl> <dbl> <int>
#> 1     4  105.    11
#> 2     6  183.     7
#> 3     8  353.    14

# tapply() ----------------------------------------------------------------
# Compute each summary once, then take the group labels from the names of
# the tapply() result itself. Labels and values are then aligned by
# construction instead of relying on a separate sort(unique(mtcars$cyl))
# happening to come out in the same order as the tapply() output.
disp_mean <- tapply(mtcars$disp, mtcars$cyl, mean)
disp_n <- tapply(mtcars$disp, mtcars$cyl, length)
data.frame(
  cyl = as.numeric(names(disp_mean)),
  mean = disp_mean,
  n = disp_n
)
#>   cyl     mean  n
#> 4   4 105.1364 11
#> 6   6 183.3143  7
#> 8   8 353.1000 14
# - hard to generalise to more than one group because tapply() will
#   return an array
# - the group labels only survive as the (character) names of the result,
#   so they have to be converted back to the grouping variable's type by
#   hand

# aggregate() -------------------------------------------------------------
# One aggregate() call per summary; each result is renamed on the spot and
# the two data frames are then joined on the grouping column.
df_mean <- setNames(
  aggregate(mtcars["disp"], by = mtcars["cyl"], FUN = mean),
  c("cyl", "mean")
)
df_length <- setNames(
  aggregate(mtcars["disp"], by = mtcars["cyl"], FUN = length),
  c("cyl", "n")
)
merge(df_mean, df_length, by = "cyl")
#>   cyl     mean  n
#> 1   4 105.1364 11
#> 2   6 183.3143  7
#> 3   8 353.1000 14

# + generalises in a straightforward way to multiple grouping variables and
#   multiple summary variables
# - need to manually rename summary variables

# Could also use formula interface
# https://twitter.com/tjmahr/status/1231255000766005248
df_mean <- aggregate(disp ~ cyl, data = mtcars, FUN = mean)
df_length <- aggregate(disp ~ cyl, data = mtcars, FUN = length)

# by() --------------------------------------------------------------------
# by() passes each cyl subset of mtcars to the function below, which builds
# a one-row summary; the per-group rows are then stacked with rbind().
mtcars_by <- by(
  data = mtcars,
  INDICES = mtcars$cyl,
  FUN = function(grp) {
    data.frame(
      cyl = grp[["cyl"]][1],
      mean = mean(grp[["disp"]]),
      n = nrow(grp)
    )
  }
)
do.call("rbind", mtcars_by)
#>   cyl     mean  n
#> 4   4 105.1364 11
#> 6   6 183.3143  7
#> 8   8 353.1000 14

# + generalises easily to more/different summaries
# - need to know about anonymous functions + do.call + rbind

# by() = split() + lapply()
# Same computation with the two steps spelled out: split mtcars into one
# data frame per cyl value, then summarise each piece.
cyl_pieces <- split(mtcars, mtcars$cyl)
mtcars_by <- lapply(cyl_pieces, function(grp) {
  data.frame(
    cyl = grp[["cyl"]][1],
    mean = mean(grp[["disp"]]),
    n = nrow(grp)
  )
})
do.call("rbind", mtcars_by)
#>   cyl     mean  n
#> 4   4 105.1364 11
#> 6   6 183.3143  7
#> 8   8 353.1000 14

# Manual indexing approaches ------------------------------------------------
# adapted from https://twitter.com/fartmiasma/status/1231258479865647105
# vapply() (rather than sapply()) pins the result to a numeric matrix with
# one three-element column per cyl value, whatever the input looks like; the
# row names are taken from the names of the first result.
cyl_counts <- sort(unique(mtcars$cyl))
tabl <- vapply(cyl_counts, function(ct) {
  in_group <- mtcars$cyl == ct
  c(cyl = ct, mean = mean(mtcars$disp[in_group]), n = sum(in_group))
}, numeric(3))
# Transpose so that each group becomes a row before converting.
as.data.frame(t(tabl))
#>   cyl     mean  n
#> 1   4 105.1364 11
#> 2   6 183.3143  7
#> 3   8 353.1000 14

# - coerces all results (and grouping var) to common type

# Similar approach from
# https://gist.github.com/hadley/c430501804349d382ce90754936ab8ec#gistcomment-3185680
# Builds one single-row data frame per cyl value, so each column keeps its
# own type. The function argument is named `ct` so that it does not shadow
# the `cyl` column it is compared against.
s <- lapply(cyl_counts, function(ct) {
  indx <- mtcars$cyl == ct
  data.frame(cyl = ct, mean = mean(mtcars$disp[indx]), n = sum(indx))
})
do.call(rbind, s)
#>   cyl     mean  n
#> 1   4 105.1364 11
#> 2   6 183.3143  7
#> 3   8 353.1000 14

# - harder to generalise to multiple grouping vars (need to use Map())
	# What's the most natural way to express this code in base R?
	library(dplyr, warn.conflicts = FALSE)
	# NOTE(review): this tab-indented part of the file repeats the sections
	# above (same code, re-indented) — confirm whether the duplicate is
	# intentional.
	# Reference dplyr pipeline that the base R sections below try to reproduce:
	# group mtcars by `cyl`, then compute the mean of `disp` and the number of
	# rows in each group.
	mtcars %>%
	group_by(cyl) %>%
	summarise(mean = mean(disp), n = n())
	#> # A tibble: 3 x 3
	#>     cyl  mean     n
	#>   <dbl> <dbl> <int>
	#> 1     4  105.    11
	#> 2     6  183.     7
	#> 3     8  353.    14

	# tapply() ----------------------------------------------------------------
	# Compute each summary once, then take the group labels from the names of
	# the tapply() result itself. Labels and values are then aligned by
	# construction instead of relying on a separate sort(unique(mtcars$cyl))
	# happening to come out in the same order as the tapply() output.
	disp_mean <- tapply(mtcars$disp, mtcars$cyl, mean)
	disp_n <- tapply(mtcars$disp, mtcars$cyl, length)
	data.frame(
	  cyl = as.numeric(names(disp_mean)),
	  mean = disp_mean,
	  n = disp_n
	)
	#>   cyl     mean  n
	#> 4   4 105.1364 11
	#> 6   6 183.3143  7
	#> 8   8 353.1000 14
	# - hard to generalise to more than one group because tapply() will
	#   return an array
	# - the group labels only survive as the (character) names of the result,
	#   so they have to be converted back to the grouping variable's type by
	#   hand

	# aggregate() -------------------------------------------------------------
	# One aggregate() call per summary; each result is renamed on the spot and
	# the two data frames are then joined on the grouping column.
	df_mean <- setNames(
	  aggregate(mtcars["disp"], by = mtcars["cyl"], FUN = mean),
	  c("cyl", "mean")
	)
	df_length <- setNames(
	  aggregate(mtcars["disp"], by = mtcars["cyl"], FUN = length),
	  c("cyl", "n")
	)
	merge(df_mean, df_length, by = "cyl")
	#>   cyl     mean  n
	#> 1   4 105.1364 11
	#> 2   6 183.3143  7
	#> 3   8 353.1000 14

	# + generalises in a straightforward way to multiple grouping variables and
	#   multiple summary variables
	# - need to manually rename summary variables

	# Could also use formula interface
	# https://twitter.com/tjmahr/status/1231255000766005248
	df_mean <- aggregate(disp ~ cyl, data = mtcars, FUN = mean)
	df_length <- aggregate(disp ~ cyl, data = mtcars, FUN = length)

	# by() --------------------------------------------------------------------
	# by() passes each cyl subset of mtcars to the function below, which builds
	# a one-row summary; the per-group rows are then stacked with rbind().
	mtcars_by <- by(
	  data = mtcars,
	  INDICES = mtcars$cyl,
	  FUN = function(grp) {
	    data.frame(
	      cyl = grp[["cyl"]][1],
	      mean = mean(grp[["disp"]]),
	      n = nrow(grp)
	    )
	  }
	)
	do.call("rbind", mtcars_by)
	#>   cyl     mean  n
	#> 4   4 105.1364 11
	#> 6   6 183.3143  7
	#> 8   8 353.1000 14

	# + generalises easily to more/different summaries
	# - need to know about anonymous functions + do.call + rbind

	# by() = split() + lapply()
	# Same computation with the two steps spelled out: split mtcars into one
	# data frame per cyl value, then summarise each piece.
	cyl_pieces <- split(mtcars, mtcars$cyl)
	mtcars_by <- lapply(cyl_pieces, function(grp) {
	  data.frame(
	    cyl = grp[["cyl"]][1],
	    mean = mean(grp[["disp"]]),
	    n = nrow(grp)
	  )
	})
	do.call("rbind", mtcars_by)
	#>   cyl     mean  n
	#> 4   4 105.1364 11
	#> 6   6 183.3143  7
	#> 8   8 353.1000 14

	# Manual indexing approaches ------------------------------------------------
	# adapted from https://twitter.com/fartmiasma/status/1231258479865647105
	# vapply() (rather than sapply()) pins the result to a numeric matrix with
	# one three-element column per cyl value, whatever the input looks like; the
	# row names are taken from the names of the first result.
	cyl_counts <- sort(unique(mtcars$cyl))
	tabl <- vapply(cyl_counts, function(ct) {
	  in_group <- mtcars$cyl == ct
	  c(cyl = ct, mean = mean(mtcars$disp[in_group]), n = sum(in_group))
	}, numeric(3))
	# Transpose so that each group becomes a row before converting.
	as.data.frame(t(tabl))
	#>   cyl     mean  n
	#> 1   4 105.1364 11
	#> 2   6 183.3143  7
	#> 3   8 353.1000 14

	# - coerces all results (and grouping var) to common type

	# Similar approach from
	# https://gist.github.com/hadley/c430501804349d382ce90754936ab8ec#gistcomment-3185680
	# Builds one single-row data frame per cyl value, so each column keeps its
	# own type. The function argument is named `ct` so that it does not shadow
	# the `cyl` column it is compared against.
	s <- lapply(cyl_counts, function(ct) {
	  indx <- mtcars$cyl == ct
	  data.frame(cyl = ct, mean = mean(mtcars$disp[indx]), n = sum(indx))
	})
	do.call(rbind, s)
	#>   cyl     mean  n
	#> 1   4 105.1364 11
	#> 2   6 183.3143  7
	#> 3   8 353.1000 14

	# - harder to generalise to multiple grouping vars (need to use Map())