Skip to content

Instantly share code, notes, and snippets.

@tinybike
Last active August 29, 2015 14:06
Show Gist options
  • Save tinybike/432bdbaa2aff38ee64ee to your computer and use it in GitHub Desktop.
Save tinybike/432bdbaa2aff38ee64ee to your computer and use it in GitHub Desktop.
weighted medians for julia
using Base.Test
function weighted_median{T<:Real,W<:Real}(data::Array{T,1}, weights::Array{W,1};
checknan::Bool=true)
isempty(data) && error("median of an empty array is undefined")
if length(data) != length(weights)
error("data and weight vectors must be the same size")
end
if checknan
for (i, x) in enumerate(data)
(isnan(x) || isnan(weights[i])) && return NaN
end
end
mask = weights .!= 0
if any(mask)
data = data[mask]
weights = weights[mask]
maxval, maxind = findmax(weights)
midpoint = 0.5 * sum(weights)
if maxval > midpoint
data[maxind]
else
permute = sortperm(data)
cumulative_weight = zero(eltype(weights))
i = 0
for (i, p) in enumerate(permute)
if cumulative_weight > midpoint
cumulative_weight -= weights[p]
break
end
cumulative_weight += weights[p]
end
if cumulative_weight == midpoint
middle(data[permute[i-2]], data[permute[i-1]])
else
middle(data[permute[i-1]])
end
end
else
error("No positive weights found")
end
end
# Tests
macro array(ex, num_sets)
data = (Expr)[]
for i = 1:eval(num_sets)
push!(data, eval(ex))
end
map(eval, data)
end
function timer(data, weights, trial)
num_tests = length(data)
println("Trial " * string(trial) * ":")
@time time_weighted_median(data, weights, num_tests)
end
function time_weighted_median(data, weights, num_tests)
for i = 1:num_tests
weighted_median(data[i], weights[i])
end
end
# Small data sets with known answers
data = (
[7, 1, 2, 4, 10],
[7, 1, 2, 4, 10],
[7, 1, 2, 4, 10, 15],
[1, 2, 4, 7, 10, 15],
[0, 10, 20, 30],
[1, 2, 3, 4, 5],
[1, 2, 3, 4, 5],
[30, 40, 50, 60, 35],
[2, 0.6, 1.3, 0.3, 0.3, 1.7, 0.7, 1.7, 0.4],
[3.7, 3.3, 3.5, 2.8],
[100, 125, 123, 60, 45, 56, 66],
[2, 2, 2, 2, 2, 2],
[2.3],
[-2, -3, 1, 2, -10],
[1, 2, 3, 4, 5],
)
weights = (
[1, 1/3, 1/3, 1/3, 1],
[1, 1, 1, 1, 1],
[1, 1/3, 1/3, 1/3, 1, 1],
[1/3, 1/3, 1/3, 1, 1, 1],
[30, 191, 9, 0],
[10, 1, 1, 1, 9],
[10, 1, 1, 1, 900],
[1, 3, 5, 4, 2],
[2, 2, 0, 1, 2, 2, 1, 6, 0],
[5, 5, 4, 1],
[30, 56, 144, 24, 55, 43, 67],
[0.1, 0.2, 0.3, 0.4, 0.5, 0.6],
[12],
[7, 1, 1, 1, 6],
[1, 0, 0, 0, 2],
)
median_answers = (7.0, 4.0, 8.5,
8.5, 10.0, 2.5,
5.0, 50.0, 1.7,
3.5, 100.0, 2.0,
2.3, -2.0, 5.0)
# Test for accuracy
num_tests = length(data)
for i = 1:num_tests
@test weighted_median(data[i], weights[i]) == median_answers[i]
end
@test_throws ErrorException weighted_median([4, 3, 2, 1], [0, 0, 0, 0])
@test_throws ErrorException weighted_median((Float64)[], (Float64)[])
v = [4, 3, 2, 1]
wt = [1, 2, 3, 4, 5]
@test_throws ErrorException weighted_median(v, wt)
@test_throws MethodError weighted_median([4 3 2 1 0], wt)
@test_throws MethodError weighted_median([[1 2];[4 5];[7 8];[10 11];[13 14]], wt)
v = [1, 3, 2, NaN, 2]
@test isnan(weighted_median(v, wt))
wt = [1, 2, NaN, 4, 5]
@test isnan(weighted_median(v, wt))
v = [5, 4, 3, 2, 1]
wt = [1, 2, -3, 4, -5]
@test weighted_median(v, wt) == 2.0
v = [-10, 1, 1, -10, -10]
wt = [-1, -1, -1, -1, -1]
@test weighted_median(v, wt) == -10
# Throwaway timings (compiling)
@time time_weighted_median(data, weights, num_tests)
# Performance tests
num_points = int(1e6)
num_sets = 10
data_range = 1000
wt_range = 100
println(num_sets, " data sets (", num_points, " data points each)")
# weights: [0, wt_range)
wt = @array(:($wt_range * rand($num_points)), num_sets)
# data:
# [-data_range/2, data_range/2),
# [0, data_range),
# [-data_range, 0)
data_expr = (:($data_range * (rand($num_points) - 0.5)),
:($data_range * rand($num_points)),
:(-$data_range * rand($num_points)))
timer(@array(data_expr[1], num_sets), wt, 1)
timer(@array(data_expr[2], num_sets), wt, 2)
timer(@array(data_expr[3], num_sets), wt, 3)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment