Skip to content

Instantly share code, notes, and snippets.

@wvanbergen
Created September 3, 2010 10:57
Show Gist options
  • Save wvanbergen/563750 to your computer and use it in GitHub Desktop.
Save wvanbergen/563750 to your computer and use it in GitHub Desktop.
# settings
@bucket_count = 1000 # Must be integer > 0
@min_value = 10 ** -4 # Must be > 0.0
@max_value = 10 ** +2 # Must be > @min_value
# precalculate some stuff
@min_value_log = Math.log(@min_value)
@max_value_log = Math.log(@max_value)
@log_delta = @max_value_log - @min_value_log
@bucket_size = @log_delta / @bucket_count.to_f
# Initialize bucket counters the be 0
@buckets = Array.new(@bucket_count, 0)
# Returns the bucket index for a value
def bucket_index(value)
return 0 if value < @min_value
return @bucket_count - 1 if value >= @max_value
((Math.log(value) - @min_value_log) / @bucket_size).floor
end
# Returns the lower value of a bucket given its index
def bucket_lower_bound(index)
Math.exp((index * @bucket_size) + @min_value_log)
end
# Returns the upper value of a bucket given its index
def bucket_upper_bound(index)
bucket_lower_bound(index + 1)
end
# Returns the average of the lower and upper bound of the bucket.
def bucket_average_value(index)
(bucket_lower_bound(index) + bucket_upper_bound(index)) / 2
end
# Returns a single value representing a bucket.
def bucket_value(index, type = nil)
case type
when :begin, :start, :lower, :lower_bound; bucket_lower_bound(index)
when :end, :finish, :upper, :upper_bound; bucket_upper_bound(index)
else bucket_average_value(index)
end
end
# Returns the range of values for a bucket.
def bucket_interval(index)
Range.new(bucket_lower_bound(index), bucket_upper_bound(index), true)
end
# Records a hit on a bucket that includes the given value.
def bucketize(value)
@buckets[bucket_index(value)] += 1
end
# Returns the total number of hits
def hits
@hits ||= @buckets.inject(0) { |carry, count| carry + count }
end
# Returns the upper bound value that would include x% of the hits.
def percentile_index(x, inclusive = false)
total_encountered = 0
@buckets.each_with_index do |count, index|
total_encountered += count
percentage = ((total_encountered.to_f / hits.to_f) * 100).floor
return index if (inclusive && percentage >= x) || (!inclusive && percentage > x)
end
end
def percentile_indices(start, finish)
result = [nil, nil]
total_encountered = 0
@buckets.each_with_index do |count, index|
total_encountered += count
percentage = ((total_encountered.to_f / hits.to_f) * 100).floor
if !result[0] && percentage > start
result[0] = index
elsif !result[1] && percentage >= finish
result[1] = index
return result
end
end
end
def percentile(x, type = nil)
bucket_value(percentile_index(x, type == :upper), type)
end
# Returns a percentile interval, i.e. the lower bound and the upper bound of the values
# that represent the x%-interval for the bucketized dataset.
#
# A 90% interval means that 5% of the values would have been lower than the lower bound and
# 5% would have been higher than the upper bound, leaving 90% of the values within the bounds.
# You can also provide a Range to specify the lower bound and upper bound percentages (e.g. 5..95).
def percentile_interval(x)
case x
when Range
indices = percentile_indices(x.begin, x.end)
Range.new(bucket_lower_bound(indices[0]), bucket_upper_bound(indices[1]))
when Numeric
percentile_interval(Range.new((100 - x) / 2, (100 - (100 - x) / 2)))
else
raise 'What does it mean?'
end
end
########
# TEST #
########
bucketize(0.2)
bucketize(0.3)
bucketize(0.3)
bucketize(0.3)
bucketize(0.3)
bucketize(0.3)
bucketize(0.3)
bucketize(0.3)
bucketize(0.3)
bucketize(0.4)
puts "Median: #{percentile(50, :average)}"
puts
puts "90% interval: #{percentile_interval(90)}"
puts "80% interval: #{percentile_interval(80)}"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment