Skip to content

Instantly share code, notes, and snippets.

@obrok
Created June 26, 2018 13:53
Show Gist options
  • Save obrok/1c299c46c250e7612b79924f2b082c86 to your computer and use it in GitHub Desktop.
Save obrok/1c299c46c250e7612b79924f2b082c86 to your computer and use it in GitHub Desktop.
require "rubygems"
require "set"
require "distribution"
# Callable random-number source; the script invokes it as `gaussian.call`.
# NOTE(review): presumably yields draws from a standard normal distribution
# (mean 0, sd 1) -- confirm against the `distribution` gem documentation.
gaussian = Distribution::Normal.rngu
# Draws a random sample of rows from +data+, with replacement.
#
# @param data [Array] rows to draw from
# @param sample_rate [Numeric] fraction of +data.length+ to draw; the product
#   is truncated to an integer
# @return [Array] that many elements, each picked independently with
#   Array#sample, so the same row may appear more than once
def sample_rows(data, sample_rate)
  draws = (data.length * sample_rate).to_i
  (1..draws).map { data.sample }
end
# Measures how many of the distinct values in a sample are "isolated",
# i.e. occur on exactly one sampled row.
#
# @param data [Array<Hash>] rows; each row is read through its +:value+ key
# @param sample_fun [#call] receives +data+ and returns the sampled rows
# @return [Float] number of values seen exactly once divided by the number of
#   distinct values in the sample; 0.0 when the sample is empty
def run(data, sample_fun)
  counts = sample_fun.call(data).
    group_by { |row| row[:value] }.
    map { |_, rows| rows.length }

  # An empty sample has no distinct values at all; report 0.0 instead of
  # letting 0 / 0.0 produce NaN.
  return 0.0 if counts.empty?

  counts.count(1) / counts.length.to_f
end
# Estimates how often the isolated-value fraction reported by +run+ reaches
# +cutoff+, by repeating the experiment on freshly generated data.
#
# @param cutoff [Numeric] threshold compared against the result of +run+ (>=)
# @param data_fun [#call] called with no arguments once per trial to build the data
# @param sample_fun [#call] passed through to +run+ to sample that data
# @param trials [Integer] number of repetitions (defaults to 100)
# @return [Float] fraction of trials whose result was at least +cutoff+
# @raise [ArgumentError] if +trials+ is not positive
def prob_isolating(cutoff, data_fun, sample_fun, trials: 100)
  raise ArgumentError, "trials must be positive" unless trials > 0

  # Count successful trials directly rather than building an array of booleans.
  hits = trials.times.count do
    run(data_fun.call, sample_fun) >= cutoff
  end
  hits / trials.to_f
end
# Row generators: each call builds 10000 fresh rows shaped {user_id:, value:}.

# Value is the user id integer-divided by 3, so consecutive ids share values.
flat_data = -> { 1.upto(10000).map { |id| {user_id: id, value: id / 3} } }
# Value is the magnitude of one `gaussian` draw, scaled by 400 and truncated.
gaussian_data = -> { 1.upto(10000).map { |id| {user_id: id, value: (gaussian.call.abs * 400).to_i} } }
# Ids up to 5000 use their own id as value; every later id shares the value 0.
power_data = -> { 1.upto(10000).map { |id| {user_id: id, value: id > 5000 ? 0 : id} } }
# Report: for every data shape, sample rate and cutoff, print the estimated
# probability that the isolated-value fraction reaches the cutoff.
[
  ["FLAT", flat_data],
  ["GAUSSIAN", gaussian_data],
  ["POWER LAW", power_data]
].each do |label, data_fun|
  puts label
  [0.001, 0.01, 0.1].each do |sample_rate|
    sampler = ->(data) { sample_rows(data, sample_rate) }
    [0.5, 0.9].each do |cutoff|
      puts "sample_rate: #{sample_rate}, cutoff: #{cutoff}"
      p prob_isolating(cutoff, data_fun, sampler)
    end
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment