Skip to content

Instantly share code, notes, and snippets.

@aquasync
Created January 21, 2019 07:37
Show Gist options
  • Save aquasync/20006038a6d06b63ba4f3402e87e3024 to your computer and use it in GitHub Desktop.
Save aquasync/20006038a6d06b63ba4f3402e87e3024 to your computer and use it in GitHub Desktop.
library(data.table)
dyn.load('mmap_alloc.dll')
## Build a ~74GB test dataset, modelled on
## https://github.com/Rdatatable/data.table/wiki/Benchmarks-:-Grouping
N <- 2e9  # number of rows
K <- 100  # number of distinct values in the "large group" id columns
set.seed(1)
# Template value handed to mmap_vector as its first argument, one per column:
# 1L for the eight integer columns, 1.0 for the numeric column v3.
# NOTE(review): presumably the template only selects the element type of the
# mapped vector -- confirm against the mmap_alloc C source.
col_templates <- list(
  id1 = 1L, id2 = 1L, id3 = 1L,
  id4 = 1L, id5 = 1L, id6 = 1L,
  v1 = 1L, v2 = 1L,
  v3 = 1.0
)
# empty mmap columns, each of length N and tied to '<column name>.bin'
DT <- list()
for (col in names(col_templates)) {
  DT[[col]] <- .Call('mmap_vector', col_templates[[col]], paste0(col, '.bin'), N)
}
# convert the plain list of columns into a data.table by reference
setDT(DT)
# Populate each column, n rows at a time, so that only one batch of freshly
# sampled values has to be held in ordinary memory at once.
fac1 <- factor(sprintf("id%03d", 1:K))
fac2 <- factor(sprintf("id%03d", 1:K))
fac3 <- factor(sprintf("id%010d", 1:(N/K)))
n <- 1e7 # batch size
# The loop below writes exactly floor(N/n) full batches; insist that they tile
# the table so no trailing rows are silently left unpopulated.
stopifnot(N %% n == 0)
for (i in seq_len(N/n)) {
  print(i)  # progress indicator: index of the batch being written
  # id1..id3 are integer columns that only receive their factor class/levels
  # after this loop, so store the integer codes explicitly. Recent data.table
  # releases refuse to assign a factor into an integer column via `:=`;
  # as.integer() yields the same codes and consumes the same random draws.
  DT[((i-1)*n+1):(i*n), `:=`(
    id1 = as.integer(sample(fac1, n, TRUE)),           # large groups (char)
    id2 = as.integer(sample(fac2, n, TRUE)),           # large groups (char)
    id3 = as.integer(sample(fac3, n, TRUE)),           # small groups (char)
    id4 = sample(K, n, TRUE),                          # large groups (int)
    id5 = sample(K, n, TRUE),                          # large groups (int)
    id6 = sample(N/K, n, TRUE),                        # small groups (int)
    v1 = sample(5, n, TRUE),                           # int in range [1,5]
    v2 = sample(5, n, TRUE),                           # int in range [1,5]
    v3 = sample(round(runif(100,max=100),4), n, TRUE)  # numeric e.g. 23.5749
  )]
}
# Mark the three integer-code columns as factors by reference (setattr does
# not copy the column): first the class, then the matching levels.
factor_levels <- list(
  id1 = levels(fac1),
  id2 = levels(fac2),
  id3 = levels(fac3)
)
for (col in names(factor_levels)) {
  setattr(DT[[col]], 'class', 'factor')
  setattr(DT[[col]], 'levels', factor_levels[[col]])
}
# Save each factor column's attributes (class + levels) to '<column>.rds' so
# they can be re-applied when the .bin files are mapped again later.
for (col in names(factor_levels)) {
  saveRDS(attributes(DT[[col]]), paste0(col, '.rds'))
}
# dataset is now persisted, can quit R
q()
library(data.table)
dyn.load('mmap_alloc.dll')
## in a fresh session, map the test dataset
# N is not carried over from the session that created the files, so it has to
# be defined again here; it must equal the row count used at creation time.
N <- 2e9
# Template value passed to mmap_vector as its first argument, one per column:
# 1L for the eight integer columns, 1.0 for the numeric column v3.
col_templates <- list(
  id1 = 1L, id2 = 1L, id3 = 1L,
  id4 = 1L, id5 = 1L, id6 = 1L,
  v1 = 1L, v2 = 1L,
  v3 = 1.0
)
# populate mmap columns from the existing '<column name>.bin' files
DT <- list()
for (col in names(col_templates)) {
  DT[[col]] <- .Call('mmap_vector', col_templates[[col]], paste0(col, '.bin'), N)
}
# Re-apply the saved factor attributes (class + levels) to id1..id3 by
# reference, in the order they were stored.
for (col in c('id1', 'id2', 'id3')) {
  attrs <- readRDS(paste0(col, '.rds'))
  for (a in names(attrs)) setattr(DT[[col]], a, attrs[[a]])
}
rm(attrs)
# convert the plain list of columns into a data.table by reference
setDT(DT)
# size of the table in GiB
as.numeric(object.size(DT)) / (1024^3)
# => [1] 75.84692
print(DT)
# =>
# id1 id2 id3 id4 id5 id6 v1 v2 v3
# 1: id027 id041 id0019118478 18 7 11405826 1 1 66.5912
# 2: id038 id080 id0010451099 69 5 4004947 4 1 83.3882
# 3: id058 id061 id0001640694 24 79 5784478 5 2 24.3000
# 4: id091 id003 id0015476884 52 82 11509253 1 3 24.8819
# 5: id021 id072 id0012410384 16 24 428139 1 3 14.0597
# ---
# 1999999996: id082 id082 id0014122699 33 32 15834208 3 4 21.1701
# 1999999997: id087 id005 id0012759520 84 77 10500557 5 5 9.7848
# 1999999998: id025 id013 id0000760988 30 69 11662435 3 4 13.4854
# 1999999999: id061 id056 id0000080427 41 27 4990964 1 3 96.5136
# 2000000000: id044 id097 id0016173239 48 24 1969290 3 2 32.9924
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment