Skip to content

Instantly share code, notes, and snippets.

@vishalbelsare
Forked from hadley/lazy.frame.R
Created August 14, 2021 20:06
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save vishalbelsare/546d78131e38d74fc5137bc8e6d2be7d to your computer and use it in GitHub Desktop.
Save vishalbelsare/546d78131e38d74fc5137bc8e6d2be7d to your computer and use it in GitHub Desktop.

Lazy data frame

opts_chunk$set(tidy = FALSE)
library(vadr)
library(plyr)
library(microbenchmark)
source("lazy.frame.R")

The insights exploited here are that the fastest way to get an environment with vars a, b, c, is:

lazy <- function(a,b,c) (environment())
explicit <- function(...)list2env(list(...))
microbenchmark(
    lazy = lazy(1,2,3),
    explicit = explicit(a=1, b=2, c=3))
## Unit: microseconds
##      expr   min    lq median    uq  max neval
##      lazy 1.167 1.438  1.670 1.859 2557   100
##  explicit 8.963 9.278  9.474 9.730 2939   100

And given such environment and row indices, the fastest way to make a subset is:

lazy_subset <- function(e, ix) lazy(e$a[ix], e$b[ix], e$c[ix])
explicit_subset <- function(e, ix) as.environment(lapply(as.list(e), `[`, ix))
e <- lazy(1:10, letters[1:10], as.list(1:10))
microbenchmark(
    lazy = lazy_subset(e, 1:5),
    explicit = explicit_subset(e, 1:5))
## Unit: microseconds
##      expr    min     lq median     uq  max neval
##      lazy  1.672  2.007  2.491  2.752 4616   100
##  explicit 26.773 27.595 27.838 28.208 4742   100

(Fast because no subsets are actually computed, only promises to compute a subset in the future, only for those columns that are demanded)

Setup:

bbI <- idata.frame(baseball)
bbL <- lazy.frame(baseball)
nr <- nrow(baseball)
nc <- ncol(baseball)
colnames <- colnames(baseball)

Row subset + eval:

microbenchmark(
    df = with(baseball[sample(nr, 1), ], rbi/ab),
    idf = with(bbI[sample(nr, 1), ], rbi/ab),
    lazy = with(bbL[sample(nr, 1), ], rbi/ab))
## Unit: microseconds
##  expr     min      lq  median     uq    max neval
##    df  481.70  536.68  588.06  640.1  13572   100
##   idf 8096.44 8483.61 8775.51 9687.7  17320   100
##  lazy   64.31   71.83   96.59  116.1 144077   100

Column subset (same column):

microbenchmark(
    df = baseball["rbi"]$rbi[1],
    idf = bbI["rbi"]$rbi[1],
    lazy = bbL["rbi"]$rbi[1])
## Unit: microseconds
##  expr     min   lq median     uq   max neval
##    df  191.50  228  270.2  477.8  3045   100
##   idf 5160.47 5372 5544.1 5847.5 11341   100
##  lazy   97.42  106  158.1  177.7 50247   100

Column subset (random column):

microbenchmark(
    df = baseball[sample(colnames, 1)][1,1],
    idf = bbI[sample(colnames, 1)][1,1],
    lazy = bbL[sample(colnames, 1)][1,1])
## Unit: microseconds
##  expr  min     lq median     uq  max neval
##    df  287  328.3  357.9  473.2 8151   100
##   idf 1216 1250.4 1273.0 1437.1 6183   100
##  lazy  122  140.7  186.4  248.3 4950   100

Column subset (random set of columns):

(this forces a lot of macro expansion, but is an unusual use pattern)

microbenchmark(
    df = baseball[runif(colnames) > 0.5][1,1],
    idf = bbI[runif(colnames) > 0.5][1,1],
    lazy = bbL[runif(colnames) > 0.5][1,1])
## Unit: microseconds
##  expr  min   lq median    uq   max neval
##    df  827 1802   2194  4211 13364   100
##   idf 1244 1418   1506  1574 52949   100
##  lazy 6793 9007  10014 11326 21305   100

Random access single element:

microbenchmark(
    df = baseball[sample(nr, 1), sample(colnames, 1)],
    idf = bbI[sample(nr, 1), sample(colnames, 1)],
    lazy = bbL[sample(nr, 1), sample(colnames, 1)])
## Unit: microseconds
##  expr    min     lq median     uq     max neval
##    df  82.43  95.48 100.34 120.75   478.7   100
##   idf 288.90 307.75 320.70 394.73 51308.6   100
##  lazy  35.98  39.18  41.81  47.04   134.2   100

Group-by and eval:

microbenchmark(times=1,
               df = dlply(baseball, "id", with, mean(rbi)/mean(ab)),
               idf = dlply(bbI, "id", with, mean(rbi)/mean(ab)),
               lazy = dlply(bbL, "id", with, mean(rbi)/mean(ab)))
## Unit: milliseconds
##  expr     min      lq  median      uq     max neval
##    df   380.7   380.7   380.7   380.7   380.7     1
##   idf 11842.7 11842.7 11842.7 11842.7 11842.7     1
##  lazy   217.3   217.3   217.3   217.3   217.3     1
### Lazy data frame
```{r}
opts_chunk$set(tidy=FALSE)
library(vadr)
library(plyr)
library(microbenchmark)
source("lazy.frame.R")
```
The insights exploited here are that the fastest way to get an environment with vars `a`, `b`, `c`, is:
```{r}
lazy <- function(a,b,c) (environment())
explicit <- function(...)list2env(list(...))
microbenchmark(
lazy = lazy(1,2,3),
explicit = explicit(a=1, b=2, c=3))
```
And given such environment and row indices, the fastest way to make a subset is:
```{r}
lazy_subset <- function(e, ix) lazy(e$a[ix], e$b[ix], e$c[ix])
explicit_subset <- function(e, ix) as.environment(lapply(as.list(e), `[`, ix))
e <- lazy(1:10, letters[1:10], as.list(1:10))
microbenchmark(
lazy = lazy_subset(e, 1:5),
explicit = explicit_subset(e, 1:5))
```
(Fast because no subsets are actually computed, only promises to compute a subset in the future, only for those columns that are demanded)
### Setup:
```{r}
bbI <- idata.frame(baseball)
bbL <- lazy.frame(baseball)
nr <- nrow(baseball)
nc <- ncol(baseball)
colnames <- colnames(baseball)
```
### Row subset + eval:
```{r}
microbenchmark(
df = with(baseball[sample(nr, 1), ], rbi/ab),
idf = with(bbI[sample(nr, 1), ], rbi/ab),
lazy = with(bbL[sample(nr, 1), ], rbi/ab))
```
### Column subset (same column):
```{r}
microbenchmark(
df = baseball["rbi"]$rbi[1],
idf = bbI["rbi"]$rbi[1],
lazy = bbL["rbi"]$rbi[1])
```
### Column subset (random column):
```{r}
microbenchmark(
df = baseball[sample(colnames, 1)][1,1],
idf = bbI[sample(colnames, 1)][1,1],
lazy = bbL[sample(colnames, 1)][1,1])
```
### Column subset (random set of columns):
(this forces a lot of macro expansion, but is an unusual use pattern)
```{r}
microbenchmark(
df = baseball[runif(colnames) > 0.5][1,1],
idf = bbI[runif(colnames) > 0.5][1,1],
lazy = bbL[runif(colnames) > 0.5][1,1])
```
### Random access single element:
```{r}
microbenchmark(
df = baseball[sample(nr, 1), sample(colnames, 1)],
idf = bbI[sample(nr, 1), sample(colnames, 1)],
lazy = bbL[sample(nr, 1), sample(colnames, 1)])
```
### Group-by and eval:
```{r}
microbenchmark(times=1,
df = dlply(baseball, "id", with, mean(rbi)/mean(ab)),
idf = dlply(bbI, "id", with, mean(rbi)/mean(ab)),
lazy = dlply(bbL, "id", with, mean(rbi)/mean(ab)))
```
# lazy data frame
#
# like idata.frame, but exploits lazy evaluation + macro code generation
# instead of active bindings
library(vadr)
lazy.frame <- function(df, enclos=parent.frame(), ...) UseMethod("lazy.frame")
lazy.frame.lazy.frame <- function(df, ...) df
lazy.frame.environment <- function(df, ..., col.order=ls(df)) {
#not kosher, environment attrs get set by reference.
attr(df, "accessory") <- accessory_env(df, col.order)
class(df) <- union("lazy.frame", class(df))
df
}
lazy.frame.data.frame <- function(df, enclos=parent.frame()) {
for (n in names(df))
if (is.array(df[[n]])) stop("Array column '", n, "' not supported")
col.order <- structure(names(df), names=names(df))
e <- list2env(df, parent=enclos)
attr(e, "accessory") <- accessory_env(e, col.order)
class(e) <- c("lazy.frame", "environment")
e
}
lazy.frame.default <- function(df, enclos=parent.frame())
lazy.frame(as.data.frame(df), enclos)
accessory_env <- function(
#Holds reference to e, and subsetting methods.
#these are lazy, created on demand.
#also note accessory env always inherits from package, so that
#data env is allowed to inherit from wherever
e,
col.order,
#use do.call because this lets "macro" memoize on the col names
row_subset = do.call(row_subsetter, as.list(col.order)),
col_subset = do.call(col_subsetter, as.list(col.order)),
dim = c(length(e[[col.order[[1]]]]), length(col.order))) {
environment()
}
#Row subset function constructor.
#Arguments are colnames, result is accessor function
row_subsetter <- macro(function(...) {
col.order <- structure(c(...), names=c(...))
#macro looks up entire closure
qe(function(e, rows) {
#create new env with promises to subset each column
new.env <- (function(`.(col.order)`=..(missing_value(length(col.order))))
environment())(
..(qqply(e$`.(col)`[rows])(col=col.order)))
parent.env(new.env) <- parent.env(e)
attr(new.env, "accessory") <- accessory_env(
new.env,
col.order,
#column names do not change, so can reuse existing accessors
attr(e, "accessory")$row_subset,
attr(e , "accessory")$col_subset)
class(new.env) <- c("lazy.frame", "environment")
new.env
})
})
#Column subset function constructor.
#arguments are colnames, result is accessor function
col_subsetter <- macro(function(...) {
col.order <- structure(c(...), names=c(...))
args <- qqply(`.(col)`=e$`.(col)`)(col=col.order)
col_subset_inner(col.order, args)
})
col_subset_inner <- function(col.order, args) {
function(e, cols) {
new.cols <- col.order[cols]
new.env <- do.call(env.list, args[new.cols])
parent.env(new.env) <- parent.env(e)
attr(new.env, "accessory") <- accessory_env(
new.env, new.cols,
col_subset = col_subset_inner(new.cols, args[new.cols]))
class(new.env) <- c("lazy.frame", "environment")
new.env
}
}
env.list <- macro(function(...) {
args <- list(...)
#print(names(args)) #verify caching works
qq((function(`.(names(args))`=..(missing_value(length(args)))) environment())(
..(args)))
})
`[.lazy.frame` <- function(x, i, j, drop=TRUE) {
if (nargs() == 2) {
j <- i
i <- missing_value()
drop <- FALSE
}
if (!missing(j)) {
if (length(j) == 1 && drop && !is.logical(j)) {
if (missing(i)) i <- TRUE
return(x[[j]][i])
} else {
x <- attr(x, "accessory")$col_subset(x, j)
}
}
if (!missing(i)) {
x <- attr(x, "accessory")$row_subset(x, i)
}
x
}
`[[.lazy.frame` <- function(x, i) {
get(attr(x, "accessory")$col.order[[i]], x)
}
dim.lazy.frame <- function(df) {
attr(df, "accessory")$dim
}
names.lazy.frame <- function(df) {
attr(df, "accessory")$col.order
}
as.list.lazy.frame <- function(df) mget(attr(df, "accessory")$col.order, df)
as.data.frame.lazy.frame <- function(df)
plyr::quickdf(as.list(df)[attr(df, "accessory")$col.order])
############################################################
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment