hadley/lazy.frame.R

## readme.md

      
    Raw
  

              readme.md
            
          
    Lazy data frame

opts_chunk$set(tidy = FALSE)
library(vadr)
library(plyr)
library(microbenchmark)
source("lazy.frame.R")
The insights exploited here are that the fastest way to get an environment with vars a, b, c, is:
lazy <- function(a,b,c) (environment())
explicit <- function(...)list2env(list(...))
microbenchmark(
    lazy = lazy(1,2,3),
    explicit = explicit(a=1, b=2, c=3))
## Unit: microseconds
##      expr   min    lq median    uq  max neval
##      lazy 1.167 1.438  1.670 1.859 2557   100
##  explicit 8.963 9.278  9.474 9.730 2939   100

And given such environment and row indices, the fastest way to make a subset is:
lazy_subset <- function(e, ix) lazy(e$a[ix], e$b[ix], e$c[ix])
explicit_subset <- function(e, ix) as.environment(lapply(as.list(e), `[`, ix))
e <- lazy(1:10, letters[1:10], as.list(1:10))
microbenchmark(
    lazy = lazy_subset(e, 1:5),
    explicit = explicit_subset(e, 1:5))
## Unit: microseconds
##      expr    min     lq median     uq  max neval
##      lazy  1.672  2.007  2.491  2.752 4616   100
##  explicit 26.773 27.595 27.838 28.208 4742   100

(Fast because no subsets are actually computed, only promises to compute a subset in the future, only for those columns that are demanded)
Setup:

bbI <- idata.frame(baseball)
bbL <- lazy.frame(baseball)
nr <- nrow(baseball)
nc <- ncol(baseball)
colnames <- colnames(baseball)
Row subset + eval:

microbenchmark(
    df = with(baseball[sample(nr, 1), ], rbi/ab),
    idf = with(bbI[sample(nr, 1), ], rbi/ab),
    lazy = with(bbL[sample(nr, 1), ], rbi/ab))
## Unit: microseconds
##  expr     min      lq  median     uq    max neval
##    df  481.70  536.68  588.06  640.1  13572   100
##   idf 8096.44 8483.61 8775.51 9687.7  17320   100
##  lazy   64.31   71.83   96.59  116.1 144077   100

Column subset (same column):

microbenchmark(
    df = baseball["rbi"]$rbi[1],
    idf = bbI["rbi"]$rbi[1],
    lazy = bbL["rbi"]$rbi[1])
## Unit: microseconds
##  expr     min   lq median     uq   max neval
##    df  191.50  228  270.2  477.8  3045   100
##   idf 5160.47 5372 5544.1 5847.5 11341   100
##  lazy   97.42  106  158.1  177.7 50247   100

Column subset (random column):

microbenchmark(
    df = baseball[sample(colnames, 1)][1,1],
    idf = bbI[sample(colnames, 1)][1,1],
    lazy = bbL[sample(colnames, 1)][1,1])
## Unit: microseconds
##  expr  min     lq median     uq  max neval
##    df  287  328.3  357.9  473.2 8151   100
##   idf 1216 1250.4 1273.0 1437.1 6183   100
##  lazy  122  140.7  186.4  248.3 4950   100

Column subset (random set of columns):

(this forces a lot of macro expansion, but is an unusual use pattern)
microbenchmark(
    df = baseball[runif(colnames) > 0.5][1,1],
    idf = bbI[runif(colnames) > 0.5][1,1],
    lazy = bbL[runif(colnames) > 0.5][1,1])
## Unit: microseconds
##  expr  min   lq median    uq   max neval
##    df  827 1802   2194  4211 13364   100
##   idf 1244 1418   1506  1574 52949   100
##  lazy 6793 9007  10014 11326 21305   100

Random access single element:

microbenchmark(
    df = baseball[sample(nr, 1), sample(colnames, 1)],
    idf = bbI[sample(nr, 1), sample(colnames, 1)],
    lazy = bbL[sample(nr, 1), sample(colnames, 1)])
## Unit: microseconds
##  expr    min     lq median     uq     max neval
##    df  82.43  95.48 100.34 120.75   478.7   100
##   idf 288.90 307.75 320.70 394.73 51308.6   100
##  lazy  35.98  39.18  41.81  47.04   134.2   100

Group-by and eval:

microbenchmark(times=1,
               df = dlply(baseball, "id", with, mean(rbi)/mean(ab)),
               idf = dlply(bbI, "id", with, mean(rbi)/mean(ab)),
               lazy = dlply(bbL, "id", with, mean(rbi)/mean(ab)))
## Unit: milliseconds
##  expr     min      lq  median      uq     max neval
##    df   380.7   380.7   380.7   380.7   380.7     1
##   idf 11842.7 11842.7 11842.7 11842.7 11842.7     1
##  lazy   217.3   217.3   217.3   217.3   217.3     1


## readme.Rmd
### Lazy data frame

```{r}
opts_chunk$set(tidy=FALSE)
library(vadr)
library(plyr)
library(microbenchmark)
source("lazy.frame.R")
```

The insights exploited here are that the fastest way to get an environment with vars `a`, `b`, `c`, is:

```{r}
lazy <- function(a,b,c) (environment())
explicit <- function(...)list2env(list(...))
microbenchmark(
    lazy = lazy(1,2,3),
    explicit = explicit(a=1, b=2, c=3))
```

And given such environment and row indices, the fastest way to make a subset is:

```{r}
lazy_subset <- function(e, ix) lazy(e$a[ix], e$b[ix], e$c[ix])
explicit_subset <- function(e, ix) as.environment(lapply(as.list(e), `[`, ix))
e <- lazy(1:10, letters[1:10], as.list(1:10))
microbenchmark(
    lazy = lazy_subset(e, 1:5),
    explicit = explicit_subset(e, 1:5))
```

(Fast because no subsets are actually computed, only promises to compute a subset in the future, only for those columns that are demanded)

### Setup:
```{r}
bbI <- idata.frame(baseball)
bbL <- lazy.frame(baseball)
nr <- nrow(baseball)
nc <- ncol(baseball)
colnames <- colnames(baseball)
```

### Row subset + eval:
```{r}
microbenchmark(
    df = with(baseball[sample(nr, 1), ], rbi/ab),
    idf = with(bbI[sample(nr, 1), ], rbi/ab),
    lazy = with(bbL[sample(nr, 1), ], rbi/ab))
```

### Column subset (same column):
```{r}
microbenchmark(
    df = baseball["rbi"]$rbi[1],
    idf = bbI["rbi"]$rbi[1],
    lazy = bbL["rbi"]$rbi[1])
```

### Column subset (random column):
```{r}
microbenchmark(
    df = baseball[sample(colnames, 1)][1,1],
    idf = bbI[sample(colnames, 1)][1,1],
    lazy = bbL[sample(colnames, 1)][1,1])
```

### Column subset (random set of columns):
(this forces a lot of macro expansion, but is an unusual use pattern)
```{r}
microbenchmark(
    df = baseball[runif(colnames) > 0.5][1,1],
    idf = bbI[runif(colnames) > 0.5][1,1],
    lazy = bbL[runif(colnames) > 0.5][1,1])
```

### Random access single element:
```{r}
microbenchmark(
    df = baseball[sample(nr, 1), sample(colnames, 1)],
    idf = bbI[sample(nr, 1), sample(colnames, 1)],
    lazy = bbL[sample(nr, 1), sample(colnames, 1)])
```

### Group-by and eval:
```{r}
microbenchmark(times=1,
               df = dlply(baseball, "id", with, mean(rbi)/mean(ab)),
               idf = dlply(bbI, "id", with, mean(rbi)/mean(ab)),
               lazy = dlply(bbL, "id", with, mean(rbi)/mean(ab)))
```

## lazy.frame.R
# lazy data frame
#
# like idata.frame, but exploits lazy evaluation + macro code generation
# instead of active bindings
library(vadr)

lazy.frame <- function(df, enclos=parent.frame(), ...) UseMethod("lazy.frame")

lazy.frame.lazy.frame <- function(df, ...) df

lazy.frame.environment <- function(df, ..., col.order=ls(df)) {
  #not kosher, environment attrs get set by reference.
  attr(df, "accessory") <- accessory_env(df, col.order)
  class(df) <- union("lazy.frame", class(df))
  df
}

lazy.frame.data.frame <- function(df, enclos=parent.frame()) {
  for (n in names(df))
      if (is.array(df[[n]])) stop("Array column '", n, "' not supported")

  col.order <- structure(names(df), names=names(df))
  e <- list2env(df, parent=enclos)
  attr(e, "accessory") <- accessory_env(e, col.order)
  class(e) <- c("lazy.frame", "environment")
  e
}

lazy.frame.default <- function(df, enclos=parent.frame())
    lazy.frame(as.data.frame(df), enclos)

accessory_env <- function(
    #Holds reference to e, and subsetting methods.
    #these are lazy, created on demand.
    #also note accessory env always inherits from package, so that
    #data env is allowed to inherit from wherever
    e,
    col.order,
    #use do.call because this lets "macro" memoize on the col names
    row_subset = do.call(row_subsetter, as.list(col.order)),
    col_subset = do.call(col_subsetter, as.list(col.order)),
    dim = c(length(e[[col.order[[1]]]]), length(col.order))) {
  environment()
}

#Row subset function constructor.
#Arguments are colnames, result is accessor function
row_subsetter <- macro(function(...) {
  col.order <- structure(c(...), names=c(...))
  #macro looks up entire closure
  qe(function(e, rows) {
    #create new env with promises to subset each column
    new.env <- (function(`.(col.order)`=..(missing_value(length(col.order))))
                environment())(
                    ..(qqply(e$`.(col)`[rows])(col=col.order)))
    parent.env(new.env) <- parent.env(e)
    attr(new.env, "accessory") <- accessory_env(
        new.env,
        col.order,
        #column names do not change, so can reuse existing accessors
        attr(e, "accessory")$row_subset,
        attr(e , "accessory")$col_subset)
    class(new.env) <- c("lazy.frame", "environment")
    new.env
  })
})

#Column subset function constructor.
#arguments are colnames, result is accessor function
col_subsetter <- macro(function(...) {
  col.order <- structure(c(...), names=c(...))
  args <- qqply(`.(col)`=e$`.(col)`)(col=col.order)
  col_subset_inner(col.order, args)
})

col_subset_inner <- function(col.order, args) {
  function(e, cols) {
    new.cols <- col.order[cols]
    new.env <- do.call(env.list, args[new.cols])
    parent.env(new.env) <- parent.env(e)
    attr(new.env, "accessory") <- accessory_env(
        new.env, new.cols,
        col_subset = col_subset_inner(new.cols, args[new.cols]))
    class(new.env) <- c("lazy.frame", "environment")
    new.env
  }
}

env.list <- macro(function(...) {
  args <- list(...)
  #print(names(args)) #verify caching works
  qq((function(`.(names(args))`=..(missing_value(length(args)))) environment())(
      ..(args)))
})

`[.lazy.frame` <- function(x, i, j, drop=TRUE) {
  if (nargs() == 2) {
    j <- i
    i <- missing_value()
    drop <- FALSE
  }
  if (!missing(j)) {
    if (length(j) == 1 && drop && !is.logical(j)) {
      if (missing(i)) i <- TRUE
      return(x[[j]][i])
    } else {
      x <- attr(x, "accessory")$col_subset(x, j)
    }
  }
  if (!missing(i)) {
    x <- attr(x, "accessory")$row_subset(x, i)
  }
  x
}

`[[.lazy.frame` <- function(x, i) {
  get(attr(x, "accessory")$col.order[[i]], x)
}

dim.lazy.frame <- function(df) {
  attr(df, "accessory")$dim
}

names.lazy.frame <- function(df) {
  attr(df, "accessory")$col.order
}

as.list.lazy.frame <- function(df) mget(attr(df, "accessory")$col.order, df)

as.data.frame.lazy.frame <- function(df)
    plyr::quickdf(as.list(df)[attr(df, "accessory")$col.order])

############################################################
	### Lazy data frame

	```{r}
	opts_chunk$set(tidy=FALSE)
	library(vadr)
	library(plyr)
	library(microbenchmark)
	source("lazy.frame.R")
	```

	The insights exploited here are that the fastest way to get an environment with vars `a`, `b`, `c`, is:

	```{r}
	lazy <- function(a,b,c) (environment())
	explicit <- function(...)list2env(list(...))
	microbenchmark(
	lazy = lazy(1,2,3),
	explicit = explicit(a=1, b=2, c=3))
	```

	And given such environment and row indices, the fastest way to make a subset is:

	```{r}
	lazy_subset <- function(e, ix) lazy(e$a[ix], e$b[ix], e$c[ix])
	explicit_subset <- function(e, ix) as.environment(lapply(as.list(e), `[`, ix))
	e <- lazy(1:10, letters[1:10], as.list(1:10))
	microbenchmark(
	lazy = lazy_subset(e, 1:5),
	explicit = explicit_subset(e, 1:5))
	```

	(Fast because no subsets are actually computed, only promises to compute a subset in the future, only for those columns that are demanded)

	### Setup:
	```{r}
	bbI <- idata.frame(baseball)
	bbL <- lazy.frame(baseball)
	nr <- nrow(baseball)
	nc <- ncol(baseball)
	colnames <- colnames(baseball)
	```

	### Row subset + eval:
	```{r}
	microbenchmark(
	df = with(baseball[sample(nr, 1), ], rbi/ab),
	idf = with(bbI[sample(nr, 1), ], rbi/ab),
	lazy = with(bbL[sample(nr, 1), ], rbi/ab))
	```

	### Column subset (same column):
	```{r}
	microbenchmark(
	df = baseball["rbi"]$rbi[1],
	idf = bbI["rbi"]$rbi[1],
	lazy = bbL["rbi"]$rbi[1])
	```

	### Column subset (random column):
	```{r}
	microbenchmark(
	df = baseball[sample(colnames, 1)][1,1],
	idf = bbI[sample(colnames, 1)][1,1],
	lazy = bbL[sample(colnames, 1)][1,1])
	```

	### Column subset (random set of columns):
	(this forces a lot of macro expansion, but is an unusual use pattern)
	```{r}
	microbenchmark(
	df = baseball[runif(colnames) > 0.5][1,1],
	idf = bbI[runif(colnames) > 0.5][1,1],
	lazy = bbL[runif(colnames) > 0.5][1,1])
	```

	### Random access single element:
	```{r}
	microbenchmark(
	df = baseball[sample(nr, 1), sample(colnames, 1)],
	idf = bbI[sample(nr, 1), sample(colnames, 1)],
	lazy = bbL[sample(nr, 1), sample(colnames, 1)])
	```

	### Group-by and eval:
	```{r}
	microbenchmark(times=1,
	df = dlply(baseball, "id", with, mean(rbi)/mean(ab)),
	idf = dlply(bbI, "id", with, mean(rbi)/mean(ab)),
	lazy = dlply(bbL, "id", with, mean(rbi)/mean(ab)))
	```
	# lazy data frame
	#
	# like idata.frame, but exploits lazy evaluation + macro code generation
	# instead of active bindings
	library(vadr)

	lazy.frame <- function(df, enclos=parent.frame(), ...) UseMethod("lazy.frame")

	lazy.frame.lazy.frame <- function(df, ...) df

	lazy.frame.environment <- function(df, ..., col.order=ls(df)) {
	#not kosher, environment attrs get set by reference.
	attr(df, "accessory") <- accessory_env(df, col.order)
	class(df) <- union("lazy.frame", class(df))
	df
	}

	lazy.frame.data.frame <- function(df, enclos=parent.frame()) {
	for (n in names(df))
	if (is.array(df[[n]])) stop("Array column '", n, "' not supported")

	col.order <- structure(names(df), names=names(df))
	e <- list2env(df, parent=enclos)
	attr(e, "accessory") <- accessory_env(e, col.order)
	class(e) <- c("lazy.frame", "environment")
	e
	}

	lazy.frame.default <- function(df, enclos=parent.frame())
	lazy.frame(as.data.frame(df), enclos)

	accessory_env <- function(
	#Holds reference to e, and subsetting methods.
	#these are lazy, created on demand.
	#also note accessory env always inherits from package, so that
	#data env is allowed to inherit from wherever
	e,
	col.order,
	#use do.call because this lets "macro" memoize on the col names
	row_subset = do.call(row_subsetter, as.list(col.order)),
	col_subset = do.call(col_subsetter, as.list(col.order)),
	dim = c(length(e[[col.order[[1]]]]), length(col.order))) {
	environment()
	}

	#Row subset function constructor.
	#Arguments are colnames, result is accessor function
	row_subsetter <- macro(function(...) {
	col.order <- structure(c(...), names=c(...))
	#macro looks up entire closure
	qe(function(e, rows) {
	#create new env with promises to subset each column
	new.env <- (function(`.(col.order)`=..(missing_value(length(col.order))))
	environment())(
	..(qqply(e$`.(col)`[rows])(col=col.order)))
	parent.env(new.env) <- parent.env(e)
	attr(new.env, "accessory") <- accessory_env(
	new.env,
	col.order,
	#column names do not change, so can reuse existing accessors
	attr(e, "accessory")$row_subset,
	attr(e , "accessory")$col_subset)
	class(new.env) <- c("lazy.frame", "environment")
	new.env
	})
	})

	#Column subset function constructor.
	#arguments are colnames, result is accessor function
	col_subsetter <- macro(function(...) {
	col.order <- structure(c(...), names=c(...))
	args <- qqply(`.(col)`=e$`.(col)`)(col=col.order)
	col_subset_inner(col.order, args)
	})

	col_subset_inner <- function(col.order, args) {
	function(e, cols) {
	new.cols <- col.order[cols]
	new.env <- do.call(env.list, args[new.cols])
	parent.env(new.env) <- parent.env(e)
	attr(new.env, "accessory") <- accessory_env(
	new.env, new.cols,
	col_subset = col_subset_inner(new.cols, args[new.cols]))
	class(new.env) <- c("lazy.frame", "environment")
	new.env
	}
	}

	env.list <- macro(function(...) {
	args <- list(...)
	#print(names(args)) #verify caching works
	qq((function(`.(names(args))`=..(missing_value(length(args)))) environment())(
	..(args)))
	})

	`[.lazy.frame` <- function(x, i, j, drop=TRUE) {
	if (nargs() == 2) {
	j <- i
	i <- missing_value()
	drop <- FALSE
	}
	if (!missing(j)) {
	if (length(j) == 1 && drop && !is.logical(j)) {
	if (missing(i)) i <- TRUE
	return(x[[j]][i])
	} else {
	x <- attr(x, "accessory")$col_subset(x, j)
	}
	}
	if (!missing(i)) {
	x <- attr(x, "accessory")$row_subset(x, i)
	}
	x
	}

	`[[.lazy.frame` <- function(x, i) {
	get(attr(x, "accessory")$col.order[[i]], x)
	}

	dim.lazy.frame <- function(df) {
	attr(df, "accessory")$dim
	}

	names.lazy.frame <- function(df) {
	attr(df, "accessory")$col.order
	}

	as.list.lazy.frame <- function(df) mget(attr(df, "accessory")$col.order, df)

	as.data.frame.lazy.frame <- function(df)
	plyr::quickdf(as.list(df)[attr(df, "accessory")$col.order])

	############################################################