Skip to content

Instantly share code, notes, and snippets.

@robey
Last active April 30, 2018 07:12
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save robey/6941164 to your computer and use it in GitHub Desktop.
Save robey/6941164 to your computer and use it in GitHub Desktop.
poor man's implementation of a biased quantile distribution, from the paper "Effective Computation of Biased Quantiles over Data Streams".
util = require 'util'
class BiasedQuantileDistribution
constructor: (@percentiles = [ 0.5, 0.9, 0.95 ], @error = 0.01) ->
@buffer = []
@bufferSize = Math.floor(1 / (2 * @error))
@samples = []
@count = 0
record: (data) ->
@buffer.push data
if @buffer.length >= @bufferSize
@flush()
flush: ->
@buffer.sort((a, b) -> a - b)
rank = 0
# merge-sort the buffer into @samples, compacting @samples as we go.
bi = 0
si = 0
while bi < @buffer.length or si < @samples.length
if si > 0 and si + 1 < @samples.length
s = @samples[si]
s2 = @samples[si + 1]
if s.g + s2.g + s2.delta <= @maximumSlop(rank)
# compact!
@samples.splice(si, 2, { v: s2.v, g: s.g + s2.g, delta: s2.delta })
continue
if bi < @buffer.length
v = @buffer[bi]
if si == @samples.length or @samples[si].v >= v
# insert!
@count += 1
ss = { v: v, g: 1, delta: if si == 0 then 0 else Math.max(Math.floor(@maximumSlop(rank - @samples[si - 1].g)) - 1, 0) }
@samples.splice(si, 0, ss)
bi += 1
continue
if si < @samples.length
rank += @samples[si].g
si += 1
@buffer = []
return
get: (percentile) ->
desiredRank = @count * percentile
desiredError = @maximumSlop(desiredRank) / 2
rank = 0
sIndex = 0
sLength = @samples.length
while sIndex < sLength and rank + @samples[sIndex].g + @samples[sIndex].delta <= desiredRank + desiredError
rank += @samples[sIndex].g
sIndex += 1
if sIndex > 0 then sIndex -= 1
@samples[sIndex].v
maximumSlop: (rank) ->
slops = @percentiles.map (p) =>
if rank <= p * @count
2 * @error * rank / p
else
2 * @error * (@count - rank) / (1 - p)
Math.min.apply(Math, slops)
debug: (name) ->
console.log ">>> #{name}: samples (#{@count}, #{@samples.length}): " + (@samples.map (s) -> "#{s.v}(#{s.g}:#{s.delta})").join(", ")
class PseudoRandom
constructor: (@seed) ->
next: ->
@seed = (@seed * 9301 + 49297) % 233280
@seed / 233280
powerDistribution: (count, max, exponent) ->
[0 ... count].map (n) => Math.floor(Math.pow(@next(), exponent) * max)
actualPercentile = (samples, percentile) ->
s = samples[...].sort((a, b) -> a - b)
s[Math.floor(percentile * samples.length)]
checkResults = (percentile, dist, samples, desiredError) ->
index = Math.floor(percentile * samples.length)
actual = samples[index]
estimate = dist.get(percentile)
# figure out rank offset
i = 0
while (index + i < 0) or (index + i >= samples.length) or (samples[index + i] != estimate)
if i > 0 then i = -i else i = -i + 1
error = Math.abs((estimate - actual) / actual)
rankError = Math.abs(i / index)
console.log "#{percentile * 100}%: est=#{estimate} actual=#{actual}: e=#{error} re=#{rankError}"
if rankError > desiredError then throw new Error("BAD: rankError out of bounds")
runTests = (name, samples, percentiles, error) ->
dist = new BiasedQuantileDistribution(percentiles)
for d in samples then dist.record(d)
dist.flush()
sortedSamples = samples[...]
sortedSamples.sort((a, b) -> a - b)
sampleCount = dist.samples.length
sampleRate = Math.round(100 * sampleCount / samples.length)
console.log "--- #{name}: (samples=#{sampleCount}, coverage=#{sampleRate}%)"
for p in percentiles then checkResults(p, dist, sortedSamples, error)
exports.main = ->
percentiles = [ 0.5, 0.75, 0.9, 0.95 ]
error = 0.01
for seed in [ 1337 ... 1347 ]
for power in [ 1, 2, 3 ]
rg = new PseudoRandom(seed)
samples = rg.powerDistribution(1000, 50000, power)
name = "seed #{seed}: 1k of 50k of power-#{power}"
runTests("#{name} random", samples, percentiles, error)
samples.sort((a, b) -> a - b)
runTests("#{name} sorted", samples, percentiles, error)
samples.sort((a, b) -> b - a)
runTests("#{name} reverse sorted", samples, percentiles, error)
samples = rg.powerDistribution(5000, 100000, power)
name = "seed #{seed}: 5k of 100k of power-#{power}"
runTests("#{name} random", samples, percentiles, error)
samples.sort((a, b) -> a - b)
runTests("#{name} sorted", samples, percentiles, error)
samples.sort((a, b) -> b - a)
runTests("#{name} reverse sorted", samples, percentiles, error)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment