Skip to content

Instantly share code, notes, and snippets.

@xiaodaigh xiaodaigh/string_sort_perf.jl Secret
Last active May 9, 2018

Embed
What would you like to do?
CodeMentor.io: String sort performance
using SortingLab
using RCall, PyCall, DataFrames, BenchmarkTools, IterTools, CSV
using Plots, BenchmarkTools
gr()
function string_sort_timing_cmp(N, K, idnumlen = 10)
srand(1);
svec1 = rand(["id"*dec(i,idnumlen) for i in 1:N÷K], N);
julia_timing = @belapsed radixsort($svec1)
println(julia_timing)
julia_timing_base = @elapsed sort(svec1);
println(julia_timing_base)
R"""
# increase memory limit; only works on Windows
if (Sys.info()["sysname"] == "Windows") memory.limit(32653)
id3 = sample(sprintf("id%010d",1:($N/$K)), $N, TRUE) # small groups (char)
pt = proc.time()
sort(id3, method = "radix")
r_timing = proc.time() - pt
rm(id3)
gc()
"""
@rget r_timing
println(r_timing[3])
py"""
import numpy as np
import timeit
# randChar is workaround for MemoryError in mtrand.RandomState.choice
# http://stackoverflow.com/questions/25627161/how-to-solve-memory-error-in-mtrand-randomstate-choice
def randChar(f, numGrp, N) :
things = [f%x for x in range(numGrp)]
return [things[x] for x in np.random.choice(numGrp, N)]
id3 = np.array(randChar("id%010d", $N//$K, $N)) # small groups (char)
tt = timeit.Timer("id3.sort()" ,"from __main__ import id3").timeit(1) # 6.8 seconds
"""
python_timing = py"tt"
DataFrame( lang = ["Julia", "Julia", "R", "Python"],
alg = ["radix", "base", "radix", "base"],
timings = [julia_timing, julia_timing_base, r_timing[3], python_timing],
N = repeat([N], inner = 4),
K = repeat([K], inner = 4),
idnumlen = repeat([idnumlen], inner = 4),
grps=[N÷K for i=1:4])
end
# sorting a vector with no duplicates
res10m = string_sort_timing_cmp(10_000_000, 1, 10);
CSV.write("res10m.csv", res10m);
res10m = CSV.read("res10m.csv");
gb = bar(res10m[:lang].*"-".*res10m[:alg], res10m[:timings], label="Seconds")
title!(gb,"String sort performance: 10m unique id strings")
ylabel!(gb, "Seconds")
savefig(gb, "sort_perf_10m_u.png")
# sorting a vector with lots of duplicates
res10m100 = string_sort_timing_cmp(10_000_000, 100, 10);
CSV.write("res10m100.csv", res10m100);
res10m100 = CSV.read("res10m100.csv");
gb = bar(res10m100[:lang].*"-".*res10m100[:alg], res10m100[:timings], label="Seconds")
title!(gb,"String sort performance: 10m strings 100k unique values")
ylabel!(gb, "Seconds")
savefig(gb, "sort_perf_10m100_u.png")
# res100m100 = string_sort_timing_cmp(100_000_000, 100, 10);
# CSV.write("res100m100.csv", res100m100);
# res100m100 = CSV.read("res100m100.csv");
# gb = bar(res100m100[:lang].*"-".*res100m100[:alg], res100m100[:timings], label="Seconds")
# title!(gb,"String sort performance: 100m strings 1m unique values")
# ylabel!(gb, "Seconds")
# savefig(gb, "sort_perf_100m1m_u.png")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.