opts_chunk$set(tidy = FALSE)
library(vadr)
library(plyr)
library(microbenchmark)
source("lazy.frame.R")
The insights exploited here are that the fastest way to get an environment with vars `a`, `b`, `c` is:
# Build an environment holding `a`, `b` and `c` by returning the function's
# own call frame. The arguments are never touched here, so each one stays an
# unevaluated promise until somebody reads it from the returned environment.
lazy <- function(a, b, c) {
  environment()
}
# Build an environment from named arguments the eager way: collect `...` into
# a list (which evaluates every argument) and convert that list with list2env.
explicit <- function(...) {
  bindings <- list(...)
  list2env(bindings)
}
# Time the two environment constructors on three scalar values: `lazy` takes
# them positionally, while `explicit` needs them named so list2env can bind
# them.
microbenchmark(
lazy = lazy(1,2,3),
explicit = explicit(a=1, b=2, c=3))
## Unit: microseconds
## expr min lq median uq max neval
## lazy 1.167 1.438 1.670 1.859 2557 100
## explicit 8.963 9.278 9.474 9.730 2939 100
And given such an environment and a set of row indices, the fastest way to make a subset is:
# Row-subset the environment `e` (expected to hold `a`, `b`, `c`) without doing
# any work up front. Each indexing expression is handed to `lazy` as an
# argument, so it is only evaluated if that variable is later read from the
# result. The expressions must stay inline: binding them to locals first would
# force them immediately.
lazy_subset <- function(e, ix) {
  lazy(
    e$a[ix],
    e$b[ix],
    e$c[ix]
  )
}
# Row-subset the environment `e` eagerly: dump every variable into a list,
# index each element with `ix`, and turn the resulting list back into an
# environment.
explicit_subset <- function(e, ix) {
  columns <- as.list(e)
  picked <- lapply(columns, function(column) column[ix])
  as.environment(picked)
}
# Fixture: an environment with three length-10 "columns" -- an integer vector
# (a), a character vector (b) and a list (c).
e <- lazy(1:10, letters[1:10], as.list(1:10))
# Time taking the first five rows with each subsetting strategy.
microbenchmark(
lazy = lazy_subset(e, 1:5),
explicit = explicit_subset(e, 1:5))
## Unit: microseconds
## expr min lq median uq max neval
## lazy 1.672 2.007 2.491 2.752 4616 100
## explicit 26.773 27.595 27.838 28.208 4742 100
(The lazy version is fast because no subsets are actually computed: it only creates promises to compute a subset in the future, and those are only ever forced for the columns that are demanded.)
# Wrap the same `baseball` data in the two alternative containers that are
# compared against a plain data.frame in the benchmarks below.
# NOTE(review): `baseball` and `idata.frame` presumably come from plyr, and
# `lazy.frame` from the sourced lazy.frame.R -- confirm.
bbI <- idata.frame(baseball)
bbL <- lazy.frame(baseball)
# Dimensions used to draw random indices.
nr <- nrow(baseball)
nc <- ncol(baseball)
# NOTE(review): this variable shadows base::colnames(). Calls to colnames()
# still resolve to the function because R skips non-function bindings when
# looking up a call, but a distinct name would be clearer.
colnames <- colnames(baseball)
# Take one random row from each container and evaluate rbi/ab inside it.
microbenchmark(
df = with(baseball[sample(nr, 1), ], rbi/ab),
idf = with(bbI[sample(nr, 1), ], rbi/ab),
lazy = with(bbL[sample(nr, 1), ], rbi/ab))
## Unit: microseconds
## expr min lq median uq max neval
## df 481.70 536.68 588.06 640.1 13572 100
## idf 8096.44 8483.61 8775.51 9687.7 17320 100
## lazy 64.31 71.83 96.59 116.1 144077 100
# Select the single column "rbi" by name (keeping the container type), then
# extract it with `$` and read its first element.
microbenchmark(
df = baseball["rbi"]$rbi[1],
idf = bbI["rbi"]$rbi[1],
lazy = bbL["rbi"]$rbi[1])
## Unit: microseconds
## expr min lq median uq max neval
## df 191.50 228 270.2 477.8 3045 100
## idf 5160.47 5372 5544.1 5847.5 11341 100
## lazy 97.42 106 158.1 177.7 50247 100
# Select one randomly chosen column by name, then read the [1,1] cell of the
# resulting one-column container.
microbenchmark(
df = baseball[sample(colnames, 1)][1,1],
idf = bbI[sample(colnames, 1)][1,1],
lazy = bbL[sample(colnames, 1)][1,1])
## Unit: microseconds
## expr min lq median uq max neval
## df 287 328.3 357.9 473.2 8151 100
## idf 1216 1250.4 1273.0 1437.1 6183 100
## lazy 122 140.7 186.4 248.3 4950 100
(this forces a lot of macro expansion, but is an unusual use pattern)
# Select columns with a random logical mask, then read the [1,1] cell.
# runif() is given the vector of column names: when its first argument has
# length > 1, runif uses that length as the number of draws, so the mask has
# one entry per column and keeps each column with probability about 0.5.
microbenchmark(
df = baseball[runif(colnames) > 0.5][1,1],
idf = bbI[runif(colnames) > 0.5][1,1],
lazy = bbL[runif(colnames) > 0.5][1,1])
## Unit: microseconds
## expr min lq median uq max neval
## df 827 1802 2194 4211 13364 100
## idf 1244 1418 1506 1574 52949 100
## lazy 6793 9007 10014 11326 21305 100
# Index a single cell directly: one random row and one random column name in
# the same `[` call.
microbenchmark(
df = baseball[sample(nr, 1), sample(colnames, 1)],
idf = bbI[sample(nr, 1), sample(colnames, 1)],
lazy = bbL[sample(nr, 1), sample(colnames, 1)])
## Unit: microseconds
## expr min lq median uq max neval
## df 82.43 95.48 100.34 120.75 478.7 100
## idf 288.90 307.75 320.70 394.73 51308.6 100
## lazy 35.98 39.18 41.81 47.04 134.2 100
# Split-apply benchmark: dlply splits each container by "id" and evaluates
# mean(rbi)/mean(ab) within every piece via `with`. times=1 runs each
# expression only once, so min/median/max in the output are all the same
# single measurement.
microbenchmark(times=1,
df = dlply(baseball, "id", with, mean(rbi)/mean(ab)),
idf = dlply(bbI, "id", with, mean(rbi)/mean(ab)),
lazy = dlply(bbL, "id", with, mean(rbi)/mean(ab)))
## Unit: milliseconds
## expr min lq median uq max neval
## df 380.7 380.7 380.7 380.7 380.7 1
## idf 11842.7 11842.7 11842.7 11842.7 11842.7 1
## lazy 217.3 217.3 217.3 217.3 217.3 1