Skip to content

Instantly share code, notes, and snippets.

@jangorecki
Created May 11, 2020 01:15
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
Star You must be signed in to star a gist
Save jangorecki/ef1bb100cdd46f77f84c3aeda56fbe41 to your computer and use it in GitHub Desktop.
mergelist left-right
# Benchmark setup ----
library(data.table)
# Request 40 data.table threads for everything that follows.
setDTthreads(threads = 40L)
# Warm-up: run the mergelist test script once before anything is timed.
test.data.table("mergelist.Rraw")
# Fix the RNG seed (after the warm-up) so the sampled foreign keys below
# are reproducible from run to run.
set.seed(seed = 108)
# Row count of the fact table; every dimension size below is derived from it.
N <- 100000000L
# Medium cardinality ----
# Chain fact -> state -> division -> region with N, N/10, N/100 and N/1000
# rows. Each dimension table is keyed on its own id and gets a secondary
# index on the column that refers to its parent table.
region <- data.table(
  region_id = seq_len(N / 1e3),
  key = "region_id"
)
division <- data.table(
  division_id = seq_len(N / 1e2),
  region_id = sample(x = N / 1e3, size = N / 1e2, replace = TRUE),
  key = "division_id"
)
setindexv(division, cols = "region_id")
state <- data.table(
  state_id = seq_len(N / 1e1),
  division_id = sample(x = N / 1e2, size = N / 1e1, replace = TRUE),
  key = "state_id"
)
setindexv(state, cols = "division_id")
# The fact table has no key, only an index on its foreign key.
fact <- data.table(
  state_id = sample(x = N / 1e1, size = N, replace = TRUE),
  population = 1
)
setindexv(fact, cols = "state_id")
# Order matters: largest table first, each following table is its parent.
l <- list(fact, state, division, region)
# Show the row count of every table in the chain.
vapply(l, nrow, integer(1L))
# Time mergelist over the list with how = "left" ...
system.time({
  ans1 <- mergelist(l, how = "left")
})
# ... and over the reversed list with how = "right".
system.time({
  ans2 <- mergelist(rev(l), how = "right")
})
# Both results are compared, disregarding column order.
all.equal(ans1, ans2, ignore.col.order = TRUE)
# Drop the results before the next case is built.
rm(list = c("ans1", "ans2"))
# High cardinality ----
# Chain fact -> state -> division -> region with N, N/2, N/4 and N/8 rows.
# Each dimension table is keyed on its own id and gets a secondary index on
# the column that refers to its parent table.
region <- data.table(
  region_id = seq_len(N / 8),
  key = "region_id"
)
division <- data.table(
  division_id = seq_len(N / 4),
  region_id = sample(x = N / 8, size = N / 4, replace = TRUE),
  key = "division_id"
)
setindexv(division, cols = "region_id")
state <- data.table(
  state_id = seq_len(N / 2),
  division_id = sample(x = N / 4, size = N / 2, replace = TRUE),
  key = "state_id"
)
setindexv(state, cols = "division_id")
# The fact table has no key, only an index on its foreign key.
fact <- data.table(
  state_id = sample(x = N / 2, size = N, replace = TRUE),
  population = 1
)
setindexv(fact, cols = "state_id")
# Order matters: largest table first, each following table is its parent.
l <- list(fact, state, division, region)
# Show the row count of every table in the chain.
vapply(l, nrow, integer(1L))
# Time mergelist over the list with how = "left" ...
system.time({
  ans1 <- mergelist(l, how = "left")
})
# ... and over the reversed list with how = "right".
system.time({
  ans2 <- mergelist(rev(l), how = "right")
})
# Both results are compared, disregarding column order.
all.equal(ans1, ans2, ignore.col.order = TRUE)
# Drop the results before the next case is built.
rm(list = c("ans1", "ans2"))
# Low cardinality ----
# Chain fact -> state -> division -> region with N, N/1e2, N/1e4 and N/1e6
# rows. Each dimension table is keyed on its own id and gets a secondary
# index on the column that refers to its parent table.
region <- data.table(
  region_id = seq_len(N / 1e6),
  key = "region_id"
)
division <- data.table(
  division_id = seq_len(N / 1e4),
  region_id = sample(x = N / 1e6, size = N / 1e4, replace = TRUE),
  key = "division_id"
)
setindexv(division, cols = "region_id")
state <- data.table(
  state_id = seq_len(N / 1e2),
  division_id = sample(x = N / 1e4, size = N / 1e2, replace = TRUE),
  key = "state_id"
)
setindexv(state, cols = "division_id")
# The fact table has no key, only an index on its foreign key.
fact <- data.table(
  state_id = sample(x = N / 1e2, size = N, replace = TRUE),
  population = 1
)
setindexv(fact, cols = "state_id")
# Order matters: largest table first, each following table is its parent.
l <- list(fact, state, division, region)
# Show the row count of every table in the chain.
vapply(l, nrow, integer(1L))
# Time mergelist over the list with how = "left" ...
system.time({
  ans1 <- mergelist(l, how = "left")
})
# ... and over the reversed list with how = "right".
system.time({
  ans2 <- mergelist(rev(l), how = "right")
})
# Both results are compared, disregarding column order.
all.equal(ans1, ans2, ignore.col.order = TRUE)
# Drop the results once the comparison is done.
rm(list = c("ans1", "ans2"))
> library(data.table)
data.table 1.12.9 IN DEVELOPMENT built 2020-05-11 01:05:19 UTC; jan using 20 threads (see ?getDTthreads). Latest news: r-datatable.com
> setDTthreads(40L)
> test.data.table("mergelist.Rraw") ## warmup
getDTthreads(verbose=TRUE):
omp_get_num_procs() 40
R_DATATABLE_NUM_PROCS_PERCENT unset (default 50)
R_DATATABLE_NUM_THREADS unset
omp_get_thread_limit() 2147483647
omp_get_max_threads() 40
OMP_THREAD_LIMIT unset
OMP_NUM_THREADS unset
RestoreAfterFork true
data.table is using 40 threads. See ?setDTthreads.
test.data.table() running: /usr/local/lib/R/site-library/data.table/tests/mergelist.Rraw
Running test id 251.04
Running test id 291.06
Sun May 10 18:12:34 2020 endian==little, sizeof(long double)==16, sizeof(pointer)==8, TZ=America/Los_Angeles, locale='LC_CTYPE=en_US.UTF-8;LC_NUMERI
C=C;LC_TIME=C.UTF-8;LC_COLLATE=en_US.UTF-8;LC_MONETARY=C.UTF-8;LC_MESSAGES=en_US.UTF-8;LC_PAPER=C.UTF-8;LC_NAME=C;LC_ADDRESS=C;LC_TELEPHONE=C;LC_MEAS
UREMENT=C.UTF-8;LC_IDENTIFICATION=C', l10n_info()='MBCS=TRUE; UTF-8=TRUE; Latin-1=FALSE', getDTthreads()='omp_get_num_procs()==40; R_DATATABLE_NUM_PR
OCS_PERCENT==unset (default 50); R_DATATABLE_NUM_THREADS==unset; omp_get_thread_limit()==2147483647; omp_get_max_threads()==40; OMP_THREAD_LIMIT==uns
et; OMP_NUM_THREADS==unset; RestoreAfterFork==true; data.table is using 40 threads. See ?setDTthreads.'
10 longest running tests took 1s (58% of 2s)
ID time nTest
1: 28 0.188 64
2: 291 0.181 6
3: 102 0.169 43
4: 121 0.134 54
5: 26 0.131 64
6: 103 0.129 48
7: 22 0.129 64
8: 25 0.124 64
9: 27 0.111 64
10: 21 0.103 64
All 957 tests in tests/mergelist.Rraw completed ok in 2.537s elapsed (00:01:26 cpu)
>
> set.seed(108)
> N = 1e8L
>
> ## medium cardinality
> region = data.table(region_id=seq_len(N/1e3), key="region_id")
> division = data.table(division_id=seq_len(N/1e2), region_id=sample(N/1e3, N/1e2, TRUE), key="division_id")
> setindexv(division, "region_id")
> state = data.table(state_id=seq_len(N/1e1), division_id=sample(N/1e2, N/1e1, TRUE), key="state_id")
> setindexv(state, "division_id")
> fact = data.table(state_id=sample(N/1e1, N, TRUE), population=1)
> setindexv(fact, "state_id")
> l = list(fact, state, division, region)
> sapply(l, nrow)
[1] 100000000 10000000 1000000 100000
> system.time(ans1<-mergelist(l, how="left"))
user system elapsed
66.078 10.530 19.845
> system.time(ans2<-mergelist(rev(l), how="right"))
user system elapsed
29.692 4.947 9.334
> all.equal(ans1, ans2, ignore.col.order=TRUE)
[1] TRUE
> rm(ans1, ans2)
>
> ## high cardinality
> region = data.table(region_id=seq_len(N/8), key="region_id")
> division = data.table(division_id=seq_len(N/4), region_id=sample(N/8, N/4, TRUE), key="division_id")
> setindexv(division, "region_id")
> state = data.table(state_id=seq_len(N/2), division_id=sample(N/4, N/2, TRUE), key="state_id")
> setindexv(state, "division_id")
> fact = data.table(state_id=sample(N/2, N, TRUE), population=1)
> setindexv(fact, "state_id")
> l = list(fact, state, division, region)
> sapply(l, nrow)
[1] 100000000 50000000 25000000 12500000
> system.time(ans1<-mergelist(l, how="left"))
user system elapsed
95.045 13.896 32.975
> system.time(ans2<-mergelist(rev(l), how="right"))
user system elapsed
69.442 10.305 22.856
> all.equal(ans1, ans2, ignore.col.order=TRUE)
[1] TRUE
> rm(ans1, ans2)
>
> ## low cardinality
> region = data.table(region_id=seq_len(N/1e6), key="region_id")
> division = data.table(division_id=seq_len(N/1e4), region_id=sample(N/1e6, N/1e4, TRUE), key="division_id")
> setindexv(division, "region_id")
> state = data.table(state_id=seq_len(N/1e2), division_id=sample(N/1e4, N/1e2, TRUE), key="state_id")
> setindexv(state, "division_id")
> fact = data.table(state_id=sample(N/1e2, N, TRUE), population=1)
> setindexv(fact, "state_id")
> l = list(fact, state, division, region)
> sapply(l, nrow)
[1] 100000000 1000000 10000 100
> system.time(ans1<-mergelist(l, how="left"))
user system elapsed
42.330 11.342 16.467
> system.time(ans2<-mergelist(rev(l), how="right"))
user system elapsed
23.495 5.346 7.861
> all.equal(ans1, ans2, ignore.col.order=TRUE)
[1] TRUE
> rm(ans1, ans2)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment