Skip to content

Instantly share code, notes, and snippets.

@binarybana
Last active August 29, 2015 14:01
Show Gist options
  • Save binarybana/60edb3c3a7f925f7bc2d to your computer and use it in GitHub Desktop.
Save binarybana/60edb3c3a7f925f7bc2d to your computer and use it in GitHub Desktop.
Benchmarking Sampling Code
using StatsBase
randsubseq2(A::AbstractVector, m::Integer) = randsubseq2!(Array(eltype(A),m), A)
function randsubseq2!(S::AbstractVector, A::AbstractArray)
m = length(S)
m == 0 && return S # needed for correctness: code below requires m > 0
n = length(A)
m == n && return copy!(S, A)
0 <= m <= n || throw(DomainError())
i = 0; j = 0
while true
if (n - i)*rand() <= m - j
S[j += 1] = A[i += 1]
j == m && return S
else
i += 1
end
end
end
vitter(A::AbstractArray, k::Integer) = vitter!(A,Array(eltype(A),k))
function vitter!(A::AbstractArray, x::AbstractArray)
N = length(A)
n = length(x)
V_prime = exp(log(rand())/n)
quant1 = N-n+1
quant2 = quant1/N
alpha_inverse = 15
threshold = alpha_inverse * n
ai=1
local S
while n>1 && threshold<N
while true
local X
while true
X = N * (1.0 - V_prime)
S = itrunc(X)
S < quant1 && break
V_prime = exp(log(rand())/n)
end
y = rand()/quant2
LHS = exp(log(y)/(n-1))
RHS = ((quant1 - S)/quant1) * (N/(N-X))
if LHS <= RHS # Accept S
V_prime = LHS/RHS
break
end
if n-1>S
bottom = N-n
limit = N-S
else
bottom = N-S-1
limit = quant1
end
for top=N-1:-1:limit
y *= top/bottom
bottom = bottom - 1
end
if exp(log(y)/(n-1)) <= N/(N-X)
# Accept S
V_prime = exp(log(rand())/(n-1))
break
end
V_prime = exp(log(rand())/n)
end
# Select (S+1)st record
N -= S+1
n -= 1
x[end-n] = A[ai+=S]
quant1 -= S
quant2 = quant1/N
threshold -= alpha_inverse
end
if n>1 # Algo A
top = N - n
for n = n-1:-1:2 # FIXME: this n-1 was originally n
V = rand()
S = 1
quot = top/N
while quot > V
S = S+1
top = top-1
N = N-1
quot *= top/N
end
x[end-n] = A[ai+=S]
N-=1
end
end
#One left:
S = itrunc(N*V_prime)
x[end] = A[ai+=S]
end
function rand_first_index(n, k)
r = rand()
p = k/n
i = 1
while p < r
i += 1
p += (1-p)k/(n-(i-1))
end
return i
end
ordered_sample_norep{T}(xs::AbstractArray{T}, k::Integer) = ordered_sample_norep!(xs, Array(T,k))
function ordered_sample_norep!(xs::AbstractArray, target::AbstractArray)
n = length(xs)
k = length(target)
i = 0
for j in 1:k
step = rand_first_index(n, k)
n -= step
i += step
target[j] = xs[i]
k -= 1
end
return target
end
function floyd1{T}(a::AbstractArray{T}, m::Integer)
n = length(a)
0 <= m <= n || throw(DomainError())
# Floyd's Õ(m) algorithm from J. Bentley and B. Floyd, "Programming pearls: A sample of brilliance,"
# Commun. ACM Magazine 30, no. 9, 754--757 (1987).
# Adapted from https://github.com/JuliaLang/julia/issues/6714#issuecomment-42057923
#s = IntSet()
s = Set{Int}()
x = Array(T, m)
for (xind,j) = enumerate(n-m+1:n)
i = rand(1:j)
ind = i in s ? j : i
x[xind] = a[ind]
push!(s, ind)
end
x
end
function floyd1p{T}(a::AbstractArray{T}, m::Integer)
n = length(a)
0 <= m <= n || throw(DomainError())
# Floyd's Õ(m) algorithm from J. Bentley and B. Floyd, "Programming pearls: A sample of brilliance,"
# Commun. ACM Magazine 30, no. 9, 754--757 (1987).
# Adapted from https://github.com/JuliaLang/julia/issues/6714#issuecomment-42057923
#s = IntSet()
s = Set{Int}()
x = T[]
sizehint(x, m)
for j = n-m+1:n
i = rand(1:j)
if i in s
#use j
push!(x,a[j])
push!(s,j)
else
unshift!(x,a[j])
push!(s,i)
end
end
x
end
# warmup
sample(1:1000, 100)
floyd1(1:1000, 100)
floyd1p(1:1000, 100)
ordered_sample_norep(1:1000, 100)
vitter(1:1000, 100)
N=10_000
krange = 5:5:200
navg=200
using PyPlot
close("all")
labels=["sample ordered",
"sample unordered",
"randsubseq2",
"vitter",
"floyd ordered",
"floyd unordered"]
#"one-more-min ordered"]
times = {Float64[] for x=1:length(labels)}
for i=krange
gc_disable()
meas = [median([@elapsed sample(1:N, i, replace=false, ordered=true) for x=1:navg]),
median([@elapsed sample(1:N, i, replace=false) for x=1:navg]),
median([@elapsed randsubseq2(1:N, i) for x=1:navg]),
median([@elapsed vitter(1:N, i) for x=1:navg]),
median([@elapsed floyd1(1:N, i) for x=1:navg]),
median([@elapsed floyd1p(1:N, i) for x=1:navg])]
#median([@elapsed ordered_sample_norep(1:N, i) for x=1:10])]
gc_enable()
gc()
[push!(x,t) for (x,t) in zip(times,meas)]
end
[plot([krange],t,label=l,linewidth=5, alpha=0.7) for (t,l) in zip(times,labels)]
legend(loc="best")
title("From $(1:N), sample ...")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment