Skip to content

Instantly share code, notes, and snippets.

@xiaodaigh
Last active November 12, 2017 09:40
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save xiaodaigh/fbdbbfd32062e33a20b1895af24e1542 to your computer and use it in GitHub Desktop.
Save xiaodaigh/fbdbbfd32062e33a20b1895af24e1542 to your computer and use it in GitHub Desktop.
sumby code
# only need to be run once to install packages
#Pkg.clone("https://github.com/JuliaData/SplitApplyCombine.jl.git")
#Pkg.clone("https://github.com/xiaodaigh/FastGroupBy.jl.git")
using FastGroupBy, PooledArrays
import PooledArrays.PooledArray
# Benchmark size parameters.
# Earlier spelling of the same row count (2e9/8 == 250_000_000):
#const N = Int(2e9/8)
# N: number of elements generated for every id/value vector in the benchmarks below.
const N = 250_000_000
# K: divisor used to derive the number of distinct ids (ids are drawn from 1:round(N/K)).
const K = UInt(100)
using Base.Threads
"""
    bench_sumby_multi_rs()

Time a single call of `sumby_multi_rs` and return the `@elapsed` measurement.

The RNG is re-seeded with `srand(1)` on every call, then `N` `Int32` group ids
drawn from `1:round(N/K)` and `N` `Int32` values drawn from `1:5` are generated.
Only the `sumby_multi_rs` call itself is inside the timed expression.
"""
function bench_sumby_multi_rs()
    srand(1)
    ngroups = round(Int32, N / K)
    groupids = rand(Int32(1):ngroups, N)
    vals = rand(Int32(1):Int32(5), N)
    elapsed = @elapsed sumby_multi_rs(groupids, vals)
    return elapsed
end
"""
    bench_sumby_radixgroup()

Time a single call of `sumby_radixgroup` and return the `@elapsed` measurement.

The RNG is re-seeded with `srand(1)` on every call, then `N` `Int32` group ids
drawn from `1:round(N/K)` and `N` `Int32` values drawn from `1:5` are generated.
Only the `sumby_radixgroup` call itself is inside the timed expression.
"""
function bench_sumby_radixgroup()
    srand(1)
    ngroups = round(Int32, N / K)
    groupids = rand(Int32(1):ngroups, N)
    vals = rand(Int32(1):Int32(5), N)
    elapsed = @elapsed sumby_radixgroup(groupids, vals)
    return elapsed
end
"""
    bench_sumby_radixsort()

Time a single call of `sumby_radixsort` and return the `@elapsed` measurement.

The RNG is re-seeded with `srand(1)` on every call, then `N` `Int32` group ids
drawn from `1:round(N/K)` and `N` `Int32` values drawn from `1:5` are generated.
Only the `sumby_radixsort` call itself is inside the timed expression.
"""
function bench_sumby_radixsort()
    srand(1)
    ngroups = round(Int32, N / K)
    groupids = rand(Int32(1):ngroups, N)
    vals = rand(Int32(1):Int32(5), N)
    elapsed = @elapsed sumby_radixsort(groupids, vals)
    return elapsed
end
# Run each benchmark five times; every entry is one @elapsed timing (seconds).
bench_mrs = [bench_sumby_multi_rs() for i = 1:5]
bench_rg = [bench_sumby_radixgroup() for i = 1:5]
bench_rs = [bench_sumby_radixsort() for i = 1:5]
# Relative speed-up, computed as 1 - (mean time of A) / (mean time of B), over all five runs.
# The trailing numbers are the results recorded by the author.
1 - mean(bench_mrs)/mean(bench_rs) #49.6% faster
1 - mean(bench_mrs)/mean(bench_rg) #37.2% faster
1 - mean(bench_rg)/mean(bench_rs) #9.8% faster
# Same comparison with the first run of each series dropped
# (presumably to exclude first-call compilation time -- confirm).
1 - mean(bench_rg[2:end])/mean(bench_rs[2:end]) #19.2% faster
# Mean timings with the first run of each series dropped.
mean(bench_mrs[2:end]) #9.87
mean(bench_rg[2:end]) #15.5
mean(bench_rs[2:end]) #19.2
# generate string ids
"""
    randstrarray1(pool, N)

Build a `PooledArray` of length `N` over `pool` by drawing `N` random indices
from `1:length(pool)` and wrapping them in a `PooledArrays.RefArray`.
"""
function randstrarray1(pool, N)
    npool = length(pool)
    refs = PooledArrays.RefArray(rand(1:npool, N))
    return PooledArray(refs, pool)
end
# Seed the RNG once; the draws below (id3, v1, id3_str) happen in this order.
srand(1)
# Pool of distinct string ids of the form "id" followed by a zero-padded 10-digit number,
# one per value in 1:(N/K).
const pool1 = [@sprintf "id%010d" k for k in 1:(N/K)]
# N string ids held as a PooledArray over pool1.
const id3 = randstrarray1(pool1, N)
# N Int32 values in 1:5 to be summed per group.
v1 = rand(Int32(1):Int32(5), N)
# group by the PooledArray ids
@time sumby(id3, v1)
# group by plain strings sampled from pool1, using the dictionary method; REALLY SLOW
const id3_str = rand(pool1, N)
@time sumby_dict(id3_str, v1)
# parallelized sum
# @time addprocs() # create Julia workers
# @time using FastGroupBy
# @everywhere using FastGroupBy
# @everywhere using SplitApplyCombine
# @time psumby(id6,v1) # 35 seconds
# R data.table counterpart: build a table of N rows and time grouped sums of v1.
library(data.table)
N = 2e9/8
K = 100
DT <- data.table(
id6 = as.factor(sample(N/K, N, TRUE)), # small groups: ints sampled from 1..N/K, stored as a factor
id3 = sample(sprintf("id%010d",1:(N/K)), N, TRUE), # string ids "id" + zero-padded 10-digit number
v1 = sample(5, N, TRUE) # int in range [1,5]
)
# group by id6 (each timing is run twice)
system.time(DT[, sum(v1),keyby = id6])
system.time(DT[, sum(v1),keyby = id6])
# pre-index id6 (timing setkey itself), then repeat the grouped sum
system.time(setkey(DT, id6))
system.time(DT[, sum(v1),keyby = id6])
system.time(DT[, sum(v1),keyby = id6])
# group by id3 (strings)
system.time(DT[, sum(v1),keyby = id3])
system.time(DT[, sum(v1),keyby = id3])
# pre-index id3 (timing setkey itself), then repeat the grouped sum
# NOTE(review): these two calls pass id3 positionally rather than as keyby = id3,
# unlike the id6 section above -- confirm this is intended.
system.time(setkey(DT, id3))
system.time(DT[, sum(v1),id3])
system.time(DT[, sum(v1),id3])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment