Created
September 3, 2010 10:57
-
-
Save wvanbergen/563750 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# settings | |
@bucket_count = 1000 # Must be integer > 0 | |
@min_value = 10 ** -4 # Must be > 0.0 | |
@max_value = 10 ** +2 # Must be > @min_value | |
# precalculate some stuff | |
@min_value_log = Math.log(@min_value) | |
@max_value_log = Math.log(@max_value) | |
@log_delta = @max_value_log - @min_value_log | |
@bucket_size = @log_delta / @bucket_count.to_f | |
# Initialize bucket counters the be 0 | |
@buckets = Array.new(@bucket_count, 0) | |
# Returns the bucket index for a value | |
def bucket_index(value) | |
return 0 if value < @min_value | |
return @bucket_count - 1 if value >= @max_value | |
((Math.log(value) - @min_value_log) / @bucket_size).floor | |
end | |
# Returns the lower value of a bucket given its index | |
def bucket_lower_bound(index) | |
Math.exp((index * @bucket_size) + @min_value_log) | |
end | |
# Returns the upper value of a bucket given its index | |
def bucket_upper_bound(index) | |
bucket_lower_bound(index + 1) | |
end | |
# Returns the average of the lower and upper bound of the bucket. | |
def bucket_average_value(index) | |
(bucket_lower_bound(index) + bucket_upper_bound(index)) / 2 | |
end | |
# Returns a single value representing a bucket. | |
def bucket_value(index, type = nil) | |
case type | |
when :begin, :start, :lower, :lower_bound; bucket_lower_bound(index) | |
when :end, :finish, :upper, :upper_bound; bucket_upper_bound(index) | |
else bucket_average_value(index) | |
end | |
end | |
# Returns the range of values for a bucket. | |
def bucket_interval(index) | |
Range.new(bucket_lower_bound(index), bucket_upper_bound(index), true) | |
end | |
# Records a hit on a bucket that includes the given value. | |
def bucketize(value) | |
@buckets[bucket_index(value)] += 1 | |
end | |
# Returns the total number of hits | |
def hits | |
@hits ||= @buckets.inject(0) { |carry, count| carry + count } | |
end | |
# Returns the upper bound value that would include x% of the hits. | |
def percentile_index(x, inclusive = false) | |
total_encountered = 0 | |
@buckets.each_with_index do |count, index| | |
total_encountered += count | |
percentage = ((total_encountered.to_f / hits.to_f) * 100).floor | |
return index if (inclusive && percentage >= x) || (!inclusive && percentage > x) | |
end | |
end | |
def percentile_indices(start, finish) | |
result = [nil, nil] | |
total_encountered = 0 | |
@buckets.each_with_index do |count, index| | |
total_encountered += count | |
percentage = ((total_encountered.to_f / hits.to_f) * 100).floor | |
if !result[0] && percentage > start | |
result[0] = index | |
elsif !result[1] && percentage >= finish | |
result[1] = index | |
return result | |
end | |
end | |
end | |
def percentile(x, type = nil) | |
bucket_value(percentile_index(x, type == :upper), type) | |
end | |
# Returns a percentile interval, i.e. the lower bound and the upper bound of the values | |
# that represent the x%-interval for the bucketized dataset. | |
# | |
# A 90% interval means that 5% of the values would have been lower than the lower bound and | |
# 5% would have been higher than the upper bound, leaving 90% of the values within the bounds. | |
# You can also provide a Range to specify the lower bound and upper bound percentages (e.g. 5..95). | |
def percentile_interval(x) | |
case x | |
when Range | |
indices = percentile_indices(x.begin, x.end) | |
Range.new(bucket_lower_bound(indices[0]), bucket_upper_bound(indices[1])) | |
when Numeric | |
percentile_interval(Range.new((100 - x) / 2, (100 - (100 - x) / 2))) | |
else | |
raise 'What does it mean?' | |
end | |
end | |
######## | |
# TEST # | |
######## | |
bucketize(0.2) | |
bucketize(0.3) | |
bucketize(0.3) | |
bucketize(0.3) | |
bucketize(0.3) | |
bucketize(0.3) | |
bucketize(0.3) | |
bucketize(0.3) | |
bucketize(0.3) | |
bucketize(0.4) | |
puts "Median: #{percentile(50, :average)}" | |
puts | |
puts "90% interval: #{percentile_interval(90)}" | |
puts "80% interval: #{percentile_interval(80)}" |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment