-
-
Save xiaodaigh/fbdbbfd32062e33a20b1895af24e1542 to your computer and use it in GitHub Desktop.
sumby code
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# only need to be run once to install packages | |
#Pkg.clone("https://github.com/JuliaData/SplitApplyCombine.jl.git") | |
#Pkg.clone("https://github.com/xiaodaigh/FastGroupBy.jl.git") | |
using FastGroupBy, PooledArrays | |
import PooledArrays.PooledArray | |
# Number of rows in every benchmark vector; same value as Int(2e9/8).
const N = 250_000_000
# Divisor for the group count: ids are drawn from 1:round(N/K), so each id
# receives about K rows on average.
const K = UInt(100)
using Base.Threads | |
"""
    bench_sumby_multi_rs()

Return the time in seconds taken by one `sumby_multi_rs` call on freshly
generated data.

The RNG is reseeded with 1 on every call, so each run sees the same input.
Data generation happens outside the timed region; only the `sumby_multi_rs`
call is measured.
"""
function bench_sumby_multi_rs()
    srand(1)
    # Group ids are drawn uniformly from 1:round(N/K).
    ngroups = round(Int32, N / K)
    id6 = rand(Int32(1):ngroups, N)
    v1 = rand(Int32(1):Int32(5), N)
    return @elapsed sumby_multi_rs(id6, v1)
end
"""
    bench_sumby_radixgroup()

Return the time in seconds taken by one `sumby_radixgroup` call on freshly
generated data.

The RNG is reseeded with 1 on every call, so each run sees the same input.
Data generation happens outside the timed region; only the `sumby_radixgroup`
call is measured.
"""
function bench_sumby_radixgroup()
    srand(1)
    # Group ids are drawn uniformly from 1:round(N/K).
    ngroups = round(Int32, N / K)
    id6 = rand(Int32(1):ngroups, N)
    v1 = rand(Int32(1):Int32(5), N)
    return @elapsed sumby_radixgroup(id6, v1)
end
"""
    bench_sumby_radixsort()

Return the time in seconds taken by one `sumby_radixsort` call on freshly
generated data.

The RNG is reseeded with 1 on every call, so each run sees the same input.
Data generation happens outside the timed region; only the `sumby_radixsort`
call is measured.
"""
function bench_sumby_radixsort()
    srand(1)
    # Group ids are drawn uniformly from 1:round(N/K).
    ngroups = round(Int32, N / K)
    id6 = rand(Int32(1):ngroups, N)
    v1 = rand(Int32(1):Int32(5), N)
    return @elapsed sumby_radixsort(id6, v1)
end
# Five timings (in seconds) per method.
bench_mrs = [bench_sumby_multi_rs() for _ in 1:5]
bench_rg = [bench_sumby_radixgroup() for _ in 1:5]
bench_rs = [bench_sumby_radixsort() for _ in 1:5]

# Relative reduction in mean run time of the first method versus the second.
# The trailing figures are the results recorded by the author.
1 - mean(bench_mrs) / mean(bench_rs)  # 49.6% faster
1 - mean(bench_mrs) / mean(bench_rg)  # 37.2% faster
1 - mean(bench_rg) / mean(bench_rs)   # 9.8% faster

# Same comparison with the first run of each method dropped
# (presumably to exclude compilation warm-up -- confirm).
1 - mean(bench_rg[2:end]) / mean(bench_rs[2:end])  # 19.2% faster

# Mean seconds per method, first run dropped.
mean(bench_mrs[2:end])  # 9.87
mean(bench_rg[2:end])   # 15.5
mean(bench_rs[2:end])   # 19.2
# generate string ids | |
"""
    randstrarray1(pool, N)

Build a `PooledArray` of length `N` whose entries are drawn uniformly at random
(with replacement) from `pool`.

Only `N` random indices into `pool` are generated; they are wrapped in a
`PooledArrays.RefArray` and paired with `pool` itself.
"""
function randstrarray1(pool, N)
    # Named `npool` rather than `K` so it is not confused with the global constant K.
    npool = length(pool)
    refs = rand(1:npool, N)
    return PooledArray(PooledArrays.RefArray(refs), pool)
end
srand(1)

# Pool of round(N/K) distinct ids of the form "id0000000001". The range is built
# from an integer endpoint so that `%d` receives integers rather than Float64s.
const pool1 = [@sprintf("id%010d", k) for k in 1:round(Int, N / K)]
const id3 = randstrarray1(pool1, N)
v1 = rand(Int32(1):Int32(5), N)

# Group by the PooledArray ids.
@time sumby(id3, v1)

# Group by plain String ids using the dictionary-based method; REALLY SLOW.
const id3_str = rand(pool1, N)
@time sumby_dict(id3_str, v1)
# parallelized sum | |
# @time addprocs() # create Julia workers | |
# @time using FastGroupBy | |
# @everywhere using FastGroupBy | |
# @everywhere using SplitApplyCombine | |
# @time psumby(id6,v1) # 35 seconds |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(data.table)

N <- 2e9 / 8  # number of rows
K <- 100      # ids are drawn from N/K distinct values

DT <- data.table(
  id6 = as.factor(sample(N / K, N, TRUE)),               # small groups (integer ids stored as a factor)
  id3 = sample(sprintf("id%010d", 1:(N / K)), N, TRUE),  # small groups (string ids)
  v1 = sample(5, N, TRUE)                                # int in range [1,5]
)

# group by id6 (each timing is run twice)
system.time(DT[, sum(v1), keyby = id6])
system.time(DT[, sum(v1), keyby = id6])

# pre-index id6, then repeat the grouped sum
system.time(setkey(DT, id6))
system.time(DT[, sum(v1), keyby = id6])
system.time(DT[, sum(v1), keyby = id6])

# sort by id3 (strings)
system.time(DT[, sum(v1), keyby = id3])
system.time(DT[, sum(v1), keyby = id3])

# pre-index id3, then group with `by` (third positional argument) instead of `keyby`
system.time(setkey(DT, id3))
system.time(DT[, sum(v1), id3])
system.time(DT[, sum(v1), id3])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment