joshbode/analytic.R

## analytic.R
library(plyr)

# compute columns - similar to transform and plyr::mutate
compute = function(.data, ..., .append=FALSE) {
  cols = as.list(substitute(list(...))[-1])
  cols = cols[names(cols) != '']

  env = parent.frame()

  # evaluate the columns
  for (name in names(cols)) {
    .data[[name]] = eval(cols[[name]], .data, env)
  }

  if (!.append) {
    # copy input data to result if required in output
    .data = .data[names(cols)]
  }

  return(.data)
}

# test data
n = 10000
d = data.frame(x=0:(n-1) %/% 5 + 1, y=sample(1:(2*n), n), z=trunc(runif(n, 1, 4)))

# rank the data, partitioned by x, ordered by z, y
print(system.time({
  r = unlist(dlply(d, .(x), function(r) {
    order(with(r, order(z, y)))
  }))
}))

# same, but with additional calculations, returning all cols in d
print(system.time({
  d = ddply(d, .(x), transform,
    r=order(order(z, y)),
    s=order(order(y)),
    u=max(y)
  )
}))

print(system.time({
  d = ddply(d, .(x), mutate,
    r=order(order(z, y)),
    s=order(order(y)),
    u=max(y)
  )
}))

print(system.time({
  d = ddply(d, .(x), compute,
    r=order(order(z, y)),
    s=order(order(y)),
    u=min(y),
    l=u
  )
}))

print(system.time({
  d = ddply(d, .(x), compute,
    r=order(order(z, y)),
    s=order(order(y)),
    u=min(y),
    .append=TRUE
  )
}))
	library(plyr)

	# compute columns - similar to transform and plyr::mutate
	compute = function(.data, ..., .append=FALSE) {
	cols = as.list(substitute(list(...))[-1])
	cols = cols[names(cols) != '']

	env = parent.frame()

	# evaluate the columns
	for (name in names(cols)) {
	.data[[name]] = eval(cols[[name]], .data, env)
	}

	if (!.append) {
	# copy input data to result if required in output
	.data = .data[names(cols)]
	}

	return(.data)
	}

	# test data
	n = 10000
	d = data.frame(x=0:(n-1) %/% 5 + 1, y=sample(1:(2*n), n), z=trunc(runif(n, 1, 4)))

	# rank the data, partitioned by x, ordered by z, y
	print(system.time({
	r = unlist(dlply(d, .(x), function(r) {
	order(with(r, order(z, y)))
	}))
	}))

	# same, but with additional calculations, returning all cols in d
	print(system.time({
	d = ddply(d, .(x), transform,
	r=order(order(z, y)),
	s=order(order(y)),
	u=max(y)
	)
	}))

	print(system.time({
	d = ddply(d, .(x), mutate,
	r=order(order(z, y)),
	s=order(order(y)),
	u=max(y)
	)
	}))

	print(system.time({
	d = ddply(d, .(x), compute,
	r=order(order(z, y)),
	s=order(order(y)),
	u=min(y),
	l=u
	)
	}))

	print(system.time({
	d = ddply(d, .(x), compute,
	r=order(order(z, y)),
	s=order(order(y)),
	u=min(y),
	.append=TRUE
	)
	}))