arunsrinivasan/FR_5241.R

## FR_5241.R
require(data.table)

# let's create data huge data.table
set.seed(1)
N <- 2e7 # size of DT

# generate a character vector of length about 1e5
foo <- function() paste(sample(letters, sample(5:9, 1), TRUE), collapse="")
ch <- replicate(1e5, foo())
ch <- unique(ch)

DT <- data.table(a = as.numeric(sample(c(NA, Inf, -Inf, NaN, rnorm(1e6)*1e6), N, TRUE)),
                 b = as.numeric(sample(rnorm(1e6), N, TRUE)),
                 c = sample(c(NA_integer_, 1e5:1e6), N, TRUE),
                 d = sample(ch, N, TRUE))

# Let's create a key on column 'c'
setkey(DT, c)

# Now, let's say we want to 'sum(b)' grouped by 'c', then:
# we can do it as follows...

system.time(ans1 <- DT[, list(e = sum(b)), by=c])
#   user  system elapsed
#  1.751   0.019   1.775

# We can alternatively do it as follows:
system.time(ans2 <- DT[J(unique(c)), list(e= sum(b))])
#   user  system elapsed
#  2.411   0.246   2.684

# HERE'S THE IMPROVEMENT THAT'S ASKED FOR IN FR #5241 (2.684 to 2.03 seconds)
# here the call to 'unique' could be replaced with 'uniqlist' because we know 'c' is sorted!
system.time(ans3 <- DT[J(c[data.table:::uniqlist(list(c))]), list(e=sum(b))])
#   user  system elapsed
#  1.993   0.029   2.028

identical(ans1, ans2) # [1] TRUE
identical(ans1, ans3) # [1] TRUE

# Note that the same should be done when doing, for ex: 'DT[CJ(unique(.), unique(.)), ...]' as CJ
# here will be using, key columns of DT.

# FR_5241 ENDS HERE

##############################################


# MORE OBSERVATIONS:
# I see that it's still slower than using 'by=..', but not as much. Probably if we return 'unique values'
# by reference, we could save the step 'c[...]'. Probably it could be implemented as an internal unique
# function to be called only on the 'first' key column... (to replace 'base:::unique' on a vector).

# okay, testing it out...

system.time(uc <- DT$c[data.table:::uniqlist(list(DT$c))])
#  user  system elapsed
#  0.252   0.046   0.303

system.time(ans4 <- DT[J(uc), list(e=sum(b))])
#   user  system elapsed
#  1.798   0.016   1.819

# Still doesn't match the speed of using 'by=' here.. why? (probably another FR).
	require(data.table)

	# let's create data huge data.table
	set.seed(1)
	N <- 2e7 # size of DT

	# generate a character vector of length about 1e5
	foo <- function() paste(sample(letters, sample(5:9, 1), TRUE), collapse="")
	ch <- replicate(1e5, foo())
	ch <- unique(ch)

	DT <- data.table(a = as.numeric(sample(c(NA, Inf, -Inf, NaN, rnorm(1e6)*1e6), N, TRUE)),
	b = as.numeric(sample(rnorm(1e6), N, TRUE)),
	c = sample(c(NA_integer_, 1e5:1e6), N, TRUE),
	d = sample(ch, N, TRUE))

	# Let's create a key on column 'c'
	setkey(DT, c)

	# Now, let's say we want to 'sum(b)' grouped by 'c', then:
	# we can do it as follows...

	system.time(ans1 <- DT[, list(e = sum(b)), by=c])
	# user system elapsed
	# 1.751 0.019 1.775

	# We can alternatively do it as follows:
	system.time(ans2 <- DT[J(unique(c)), list(e= sum(b))])
	# user system elapsed
	# 2.411 0.246 2.684

	# HERE'S THE IMPROVEMENT THAT'S ASKED FOR IN FR #5241 (2.684 to 2.03 seconds)
	# here the call to 'unique' could be replaced with 'uniqlist' because we know 'c' is sorted!
	system.time(ans3 <- DT[J(c[data.table:::uniqlist(list(c))]), list(e=sum(b))])
	# user system elapsed
	# 1.993 0.029 2.028

	identical(ans1, ans2) # [1] TRUE
	identical(ans1, ans3) # [1] TRUE

	# Note that the same should be done when doing, for ex: 'DT[CJ(unique(.), unique(.)), ...]' as CJ
	# here will be using, key columns of DT.

	# FR_5241 ENDS HERE

	##############################################


	# MORE OBSERVATIONS:
	# I see that it's still slower than using 'by=..', but not as much. Probably if we return 'unique values'
	# by reference, we could save the step 'c[...]'. Probably it could be implemented as an internal unique
	# function to be called only on the 'first' key column... (to replace 'base:::unique' on a vector).

	# okay, testing it out...

	system.time(uc <- DT$c[data.table:::uniqlist(list(DT$c))])
	# user system elapsed
	# 0.252 0.046 0.303

	system.time(ans4 <- DT[J(uc), list(e=sum(b))])
	# user system elapsed
	# 1.798 0.016 1.819

	# Still doesn't match the speed of using 'by=' here.. why? (probably another FR).