@briatte
Last active August 29, 2015 13:57
aggregation functions, test #2: base, dplyr, data.table

Here's a simple timing test of aggregation functions in R, using 1.3 million rows and 80,000 groups of real data on a 1.8GHz Intel Core i5. Thanks to Arun Srinivasan for helpful comments.

The fastest package in the data.frame benchmark is data.table, which runs about twice as fast as dplyr, which in turn runs about ten times faster than base R.

For a benchmark that includes plyr, see this earlier Gist, which runs a computationally more intensive test on half a million rows; there, dplyr still runs about 1.5 times faster than aggregate in base R.

Both tests confirm what W. Andrew Barr blogged about dplyr:

the 2 most important improvements in dplyr are

  1. a MASSIVE increase in speed, making dplyr useful on big data sets
  2. the ability to chain operations together in a natural order

Tony Fischetti has clear examples of the latter, and Erick Gregory shows that easy access to SQL databases should also be added to the list.
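The chained style mentioned in the second point can be sketched on toy data. Note: `Functie` and `URL` are the column names from the benchmark below, but the data here is made up for illustration; dplyr 0.1.x provided the `%.%` operator for chaining, which later versions replaced with `%>%` (used here).

```r
library(dplyr)

# toy stand-in for the 1.3M-row data set used in the benchmark
df <- data.frame(Functie = c("mayor", "mayor", "deputy"),
                 URL = c("u1", "u2", "u3"),
                 stringsAsFactors = FALSE)

# nested form, as written in the benchmark:
summarise(group_by(df, Functie), n = length(URL))

# chained form, reading left to right in the order of operations:
df %>%
  group_by(Functie) %>%
  summarise(n = length(URL))
```

Both forms return the same per-group counts; the chained form simply avoids reading the call inside-out.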

> # data
> system.time(load("integritate.rda"))
user system elapsed
14.716 0.273 15.173
> # base
> system.time(aggregate(URL ~ Functie, length, data = data))
user system elapsed
26.118 0.284 26.510
> # dplyr
> system.time(as.data.frame(summarise(group_by(data, Functie), n = length(URL))))
user system elapsed
0.242 0.011 0.254
> system.time(summarise(group_by(data, Functie), n = length(URL)))
user system elapsed
0.249 0.006 0.257
> system.time(tbl <- group_by(data, Functie))
user system elapsed
0.183 0.005 0.187
> system.time(summarise(tbl, n = length(URL)))
user system elapsed
0.050 0.001 0.050
> # data.table
> library(data.table)
> system.time(as.data.frame(as.data.table(data)[, .N, by = Functie]))
user system elapsed
1.173 0.038 1.233
> system.time(as.data.table(data)[, .N, by = Functie])
user system elapsed
0.080 0.048 0.128
> system.time(data.table(data)[, .N, by = Functie])
user system elapsed
3.300 0.171 3.508
> system.time(data <- as.data.table(data))
user system elapsed
0.037 0.032 0.069
> system.time(data <- data.table(data))
user system elapsed
0.258 0.094 0.353
> system.time(data[, .N, by = Functie])
user system elapsed
0.031 0.002 0.034
> # versions
> sessionInfo()
R version 3.0.3 (2014-03-06)
Platform: x86_64-apple-darwin10.8.0 (64-bit)
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] data.table_1.9.2 dplyr_0.1.2
loaded via a namespace (and not attached):
[1] assertthat_0.1 plyr_1.8.1 Rcpp_0.11.0 reshape2_1.2.2 stringr_0.6.2
[6] tools_3.0.3
setwd("/Users/fr/Documents/Code/R/integritate")
# data
system.time(load("integritate.rda"))
str(data[, c("Functie", "URL")])
# base
length(unique(data$Functie))
system.time(aggregate(URL ~ Functie, length, data = data))
# plyr (far too long)
# library(plyr)
# system.time(ddply(data, .(Functie), summarise, n = length(URL)))
# dplyr
library(dplyr)
system.time(as.data.frame(summarise(group_by(data, Functie), n = length(URL))))
system.time(summarise(group_by(data, Functie), n = length(URL)))
system.time(tbl <- group_by(data, Functie))
system.time(summarise(tbl, n = length(URL)))
# data.table
library(data.table)
system.time(as.data.frame(as.data.table(data)[, .N, by = Functie]))
system.time(as.data.table(data)[, .N, by = Functie])
system.time(data.table(data)[, .N, by = Functie])
system.time(data <- as.data.table(data))
system.time(data <- data.table(data))
system.time(data[, .N, by = Functie])
# versions
sessionInfo()
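The script above depends on a local integritate.rda file, so it cannot be run as-is. A self-contained sketch of the same three aggregations on toy data (illustrative column values, not the real data set) would be:

```r
library(dplyr)
library(data.table)

# toy stand-in for the real 1.3M-row, 80,000-group data set
df <- data.frame(Functie = rep(c("mayor", "deputy", "senator"), times = c(3, 2, 1)),
                 URL = paste0("url", 1:6),
                 stringsAsFactors = FALSE)

# base R
aggregate(URL ~ Functie, length, data = df)

# dplyr
summarise(group_by(df, Functie), n = length(URL))

# data.table: convert once, by reference, then aggregate
setDT(df)
df[, .N, by = Functie]
```

setDT() converts in place, so df stays a data.table afterwards, which matches the "convert once, then stick to data.table" workflow discussed in the comments.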
@arunsrinivasan

Your function is as.data.table(data)[, .N, by=Functie]. This includes the creation of the data.table and the aggregation. And your benchmark results indicate 0.112s, which is 2.3x faster than dplyr. I don't understand how you say dplyr is fastest, and that data.table is fastest only if you first convert to data.table. It seems quite straightforward to me that dplyr is slower here. What am I missing?

@briatte

briatte commented Mar 8, 2014

That's because the benchmark is run against the data.frame class only, as was the case in the previous benchmark. I tested data.table on request, to see whether DT was faster than DF, which it is; but the test goes from data.frame to data.frame, so in order to drop the dual S3 class from the DT object, the genuine test for data.table would be:

system.time(as.data.frame(as.data.table(data)[, .N, by = Functie]))
   user  system elapsed 
  1.173   0.038   1.233 

The code, however, makes little sense from a user perspective. The same would happen if I were to write the first benchmark with data.table, which I did not do when I was actually working on the project this test grew out of, because I had no idea how to do it.

Addition: since you mentioned on Twitter that dplyr also adds dual classes, I have wrapped the dplyr benchmark in the same as.data.frame coercer, and data.table now compares as faster. Adding data.table to the benchmark, though, forces it to reflect code that makes little sense in a user workflow.

@arunsrinivasan

I see what you're trying to say now. A couple of points here:

  1. The point of requesting the data.table comparison is that, if it's faster, people can switch to data.table; especially since fread reads files directly into a data.table very fast, and the setDT function converts a data.frame in almost zero time (by reference). Therefore, it doesn't make sense to convert back to data.frame: the idea is to stick with the data.table object.

  2. A data.table is a data.frame as well: it just inherits from data.frame, and is.data.frame(DT) would return TRUE. Having said that, there are some differences; in particular, to subset the data.frame way, we have to use with=FALSE. That is, DT[, c("x", "y"), with=FALSE] is the equivalent of DF[, c("x", "y")]. But that's a small difference.

  3. dplyr inherits from a data.frame as well and adds tbl_df class to data.frame objects; tbl_dt to data.table objects etc. It's pretty normal and there's no need to convert them back to a data.frame.
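Points 2 and 3 above can be checked directly in a session; a minimal sketch (toy data, not from the benchmark):

```r
library(data.table)

DT <- data.table(x = 1:3, y = letters[1:3])

is.data.frame(DT)   # TRUE: data.table inherits from data.frame
class(DT)           # "data.table" "data.frame"

# subsetting the data.frame way requires with = FALSE
DT[, c("x", "y"), with = FALSE]
```

So no coercion back to data.frame is needed: any code that only relies on data.frame behaviour will accept the DT object as-is.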

@arunsrinivasan

"Adding data.table to the benchmark, though is forcing it to reflect code that makes little sense in a user workflow."

Yes, it wouldn't make sense if you don't want to stick with the data.table object, of course. That was the misunderstanding (on my side): then you'd have to convert back and forth. My reason for requesting the benchmark is, since people are interested in speed, to show that there are much faster options than dplyr.

I've also benchmarked against dplyr (for quite some time now) and will be putting it up on the webpage tonight or tomorrow. It's on half a billion rows. When you see the speed-up there against dplyr, it's quite enticing ;).

Anyhow, thanks a lot for taking the time (esp. out of your weekend) and accepting my request in doing this benchmark.

@briatte

briatte commented Mar 8, 2014

  1. fread will certainly justify a new benchmark when it is ready for use.
  2. Yes, I read the AllS4.r file of the code and saw that, and noticed the same for dplyr after reading your tweet.

Also, I have not explored doBy or ff. Could using ff speed up loading the data?

@briatte

briatte commented Mar 8, 2014

I expected you would have already run lower-level benchmarks, but I wanted to benchmark a user workflow rather than subscripting alone. It was my mistake not to make that clearer in the text, which was pasted from the tweets and poorly kept in sync with updates to the timings.

@arunsrinivasan

@briatte, yes, that would be equally interesting as well. However, in that case there should be one conversion to data.table at first, and then you stick to data.table, meaning no conversion via as.data.frame(.) after that at all.
