Skip to content

Instantly share code, notes, and snippets.

@jangorecki
Last active June 20, 2020 22:09
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jangorecki/0cf9170bee01ee6763719ada418c81e7 to your computer and use it in GitHub Desktop.
Save jangorecki/0cf9170bee01ee6763719ada418c81e7 to your computer and use it in GitHub Desktop.
parallel bmerge
## Sample `size` integers from 1..unq_n.
##   unq_n : number of distinct values available (values are 1..unq_n)
##   size  : length of the returned vector
##   sort  : if TRUE the result is returned in ascending order
## When unq_n > size the values are drawn without replacement (no duplicates).
## Otherwise every value 1..unq_n appears at least once and the remaining
## size-unq_n slots are filled by sampling with replacement; the result is shuffled.
## Returns an integer vector of length `size`.
ssa = function(unq_n, size, sort=FALSE) {
if (unq_n > size) {
## previously this branch returned immediately and silently ignored `sort`
ans = sample.int(unq_n, size)
} else {
unq_sub = seq_len(unq_n)
## here unq_n <= size, so size-unq_n is the (non-negative) number of extra draws
ans = sample(c(unq_sub, sample(unq_sub, size=max(size-unq_n, 0), replace=TRUE)))
}
if (sort) sort(ans) else ans
}
## Benchmark setup: build two large single-key tables and time the join d1[d2, on="x==y"]
## with one thread and with many threads.
## Fixed seed so the randomly generated join columns are reproducible.
set.seed(108)
library(data.table)
options(width=200)
options(datatable.auto.index=FALSE, datatable.verbose=FALSE) ## not needed but just to be future proof if forder will setindex
## number of rows in each of the two tables
N = 1e9L
## unsorted no duplicates
## ssa(N, N) yields a shuffled permutation of 1..N; v1/v2 record the row number
d1 = data.table(x=ssa(N, N))[, "v1":=seq_len(.N)]
d2 = data.table(y=ssa(N, N))[, "v2":=seq_len(.N)]
## alternative data set: ssa(N-1L, N) yields N-1 distinct values plus one repeated value
#d1 = data.table(x=ssa(N-1L, N))[, "v1":=seq_len(.N)] ## unsorted single duplicate
#d2 = data.table(y=ssa(N-1L, N))[, "v2":=seq_len(.N)]
## no index
## single-threaded run, result kept in `b`
setDTthreads(1L)
system.time(b <- d1[d2, on="x==y"])
## same join with 40 threads, result kept in `B`
## NOTE(review): 40 is presumably the core count of the benchmark machine -- adjust when re-running elsewhere
setDTthreads(40L)
system.time(B <- d1[d2, on="x==y"])
## the single- and multi-threaded results are expected to match
all.equal(b, B)
## index
## Attach a secondary index on `cols` to data.table `x`, by reference.
##   x    : a data.table (modified in place via setattr)
##   cols : character vector of column names to index
## The ordering computed by data.table:::forderv(x, cols, retGrp=TRUE) is stored as an
## attribute of x's "index" attribute, i.e. grouping information is kept alongside the order.
## Returns `x` invisibly.
setindexv2 = function(x, cols) { ## pretend we are after #4386
stopifnot(is.data.table(x), is.character(cols))
## create the empty "index" container on first use
if (is.null(attr(x, "index", exact=TRUE))) setattr(x, "index", integer())
## index attribute name is "__col1__col2..."; collapse must be "" because each column
## already carries its own "__" prefix (collapse="__" produced "__a____b" for two columns)
idx_name = paste0("__", cols, collapse="")
setattr(attr(x, "index", exact=TRUE), idx_name, data.table:::forderv(x, cols, retGrp=TRUE))
invisible(x)
}
## Index case: give both tables an index on their join column, then repeat the
## single-thread vs 40-thread comparison.
setindexv2(d1, "x"); setindexv2(d2, "y")
setDTthreads(1L)
system.time(b <- d1[d2, on="x==y"])
setDTthreads(40L)
system.time(B <- d1[d2, on="x==y"])
all.equal(b, B)
## sorted index
## key both tables on the join column first, then build the index again
## NOTE(review): the index is rebuilt presumably because setkeyv discards existing indices -- confirm
setkeyv(d1, "x"); setkeyv(d2, "y");
setindexv2(d1, "x"); setindexv2(d2, "y")
setDTthreads(1L)
system.time(b <- d1[d2, on="x==y"])
setDTthreads(40L)
system.time(B <- d1[d2, on="x==y"])
all.equal(b, B)
## repeat the keyed join with verbose output enabled to see the internal timing breakdown
options(datatable.verbose=TRUE)
setDTthreads(1L)
system.time(b <- d1[d2, on="x==y"])
setDTthreads(40L)
system.time(B <- d1[d2, on="x==y"])
## All timings below are for the whole X[Y, on=.] call, not for the bmerge/smerge step alone.
## bmerge is the bmerge in current master
## smerge is the sort-merge join
## Bmerge is the parallel bmerge
## smerge as of commit 103b9c63cdb10c7ab60dd2a3f185caf20b2eb70d
## Bmerge as of commit 484384a3890730882ffe91b45f906189e0e7c23e
# no duplicates
## single thread no index
user system elapsed
bmerge 394.931 32.802 427.750
smerge 336.447 73.674 410.139
Bmerge 397.821 28.743 426.583
## all threads no index
user system elapsed
bmerge 819.928 143.668 368.546
smerge 1136.770 166.111 100.471
Bmerge 3038.182 146.055 142.944
## all threads index
user system elapsed
bmerge 658.381 103.473 377.290
smerge 579.559 109.165 78.886
Bmerge 2735.321 73.923 142.985
## all threads sorted index
user system elapsed
bmerge 68.579 47.075 69.485
smerge 37.985 42.468 20.842
Bmerge 2315.506 38.060 87.270
# single duplicate
## single thread no index
user system elapsed
bmerge 429.513 34.142 463.676
smerge 367.919 80.007 447.952
Bmerge 423.764 34.258 458.043
## all threads no index
user system elapsed
bmerge 819.215 149.786 368.750
smerge 1191.925 212.859 137.033
Bmerge 3045.483 178.684 172.440
## all threads index
user system elapsed
bmerge 654.823 98.173 379.881
smerge 623.015 160.435 115.979
Bmerge 2776.128 91.119 184.812
## all threads sorted index
user system elapsed
bmerge 87.594 47.124 94.729
smerge 71.851 64.715 42.599
Bmerge 2290.993 44.478 109.372
# no duplicates
> setDTthreads(1L)
> system.time(b <- d1[d2, on="x==y"])
i.y has same type (integer) as x.x. No coercion needed.
on= matches existing key, using key
Starting bmerge ...
bmerge done in 35.4s elapsed (32.6s cpu)
Constructing irows for '!byjoin || nqbyjoin' ... 0.000s elapsed (0.000s cpu)
user system elapsed
52.008 16.121 68.132
> setDTthreads(40L)
> system.time(B <- d1[d2, on="x==y"])
i.y has same type (integer) as x.x. No coercion needed.
on= matches existing key, using key
Starting bmerge ...
bmerge done in 00:01:03 elapsed (00:38:22 cpu)
Constructing irows for '!byjoin || nqbyjoin' ... 0.000s elapsed (0.000s cpu)
user system elapsed
2328.887 47.733 88.212
# single duplicate
> setDTthreads(1L)
> system.time(b <- d1[d2, on="x==y"])
i.y has same type (integer) as x.x. No coercion needed.
on= matches existing key, using key
Starting bmerge ...
bmerge done in 36.9s elapsed (34.1s cpu)
Constructing irows for '!byjoin || nqbyjoin' ... 19.8s elapsed (15.5s cpu)
user system elapsed
70.631 20.086 90.720
> setDTthreads(40L)
> system.time(B <- d1[d2, on="x==y"])
i.y has same type (integer) as x.x. No coercion needed.
on= matches existing key, using key
Starting bmerge ...
bmerge done in 00:01:02 elapsed (00:37:50 cpu)
Constructing irows for '!byjoin || nqbyjoin' ... 19.7s elapsed (15.5s cpu)
user system elapsed
2312.471 42.391 107.200
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment