Skip to content

Instantly share code, notes, and snippets.

@maliabadi
Created March 1, 2014 04:06
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save maliabadi/9284951 to your computer and use it in GitHub Desktop.
Save maliabadi/9284951 to your computer and use it in GitHub Desktop.
include Math
# Tallies occurrences of each distinct sample value and answers
# frequency / probability queries over the collected counts.
class FrequencyDistribution
  # samples: the enumerable passed to #initialize (now actually stored;
  #          the original declared the reader but never assigned @samples).
  # data:    Hash mapping sample value => occurrence count.
  attr_reader :samples, :data

  # Build a distribution from an initial enumerable of sample values.
  def initialize(samples = [])
    @samples = samples
    @data = {}
    samples.each(&method(:add))
  end

  # Record one occurrence of +sample+.
  def add(sample)
    data[sample] = (data[sample] || 0) + 1
  end

  # Occurrence count for +key+, or nil if the key was never seen.
  def get(key)
    data[key]
  end

  # Total number of observations across all sample values.
  # Seeded reduce returns 0 for an empty distribution (the original
  # returned nil, which only worked by accident via nil.to_f in #freq).
  def total_outcomes
    data.values.reduce(0, :+)
  end

  # Number of distinct sample values observed.
  def total_sample_values
    data.size
  end

  # Sample values occurring exactly once (hapax legomena).
  # BUG FIX: the original compared counts to 0, which can never match
  # because every recorded sample has a count of at least 1.
  def hapaxes
    data.keys.select { |key| data[key] == 1 }
  end

  # Inverts the distribution: maps each occurrence count to the number
  # of distinct sample values having that count.
  def frequency_to_sample_count
    data.values.each_with_object({}) do |count, reverse_hash|
      reverse_hash[count] = (reverse_hash[count] || 0) + 1
    end
  end

  # Yields the running cumulative count after each of +samples+
  # (defaults to every observed key). Samples never seen contribute 0
  # (the original crashed on `cf += nil` for unseen samples).
  def cumulative_frequencies(samples = nil)
    cf = 0.0
    (samples || data.keys).each do |sample|
      cf += data.fetch(sample, 0)
      yield cf
    end
  end

  # Maximum-likelihood probability of +sample+; 0.0 when the
  # distribution is empty or the sample was never observed.
  def freq(sample)
    total = total_outcomes
    return 0.0 if total.zero?
    data[sample].to_f / total
  end

  # The sample value with the highest count (first one, on ties).
  # The max count is computed once rather than on every iteration.
  def max
    top = data.values.max
    data.keys.find { |key| data[key] == top }
  end
end
# Shannon entropy of a list of labels, computed from their
# empirical frequencies in a FrequencyDistribution.
def entropy(labels)
  dist = FrequencyDistribution.new(labels)
  probabilities = dist.data.keys.map { |label| dist.freq(label) }
  entropy_reduction(probabilities)
end
# Shannon entropy (in bits) of a probability list: -SUM p * log2(p).
# Zero probabilities contribute nothing (lim p->0 of p*log2(p) is 0).
# BUG FIX: the original used `reduce(:+)`, which returns nil for an
# empty list and then raised on `nil * -1`; the seeded reduce below
# makes entropy_reduction([]) return 0.0. Math.log2 is called
# explicitly rather than relying on a top-level `include Math`.
def entropy_reduction(probs)
  total = probs.reduce(0.0) do |acc, p|
    p.to_f == 0.0 ? acc : acc + p * Math.log2(p)
  end
  total * -1
end
# Let F(w) be the fraction of the
# documents containing the word w
# Let F(w) be the fraction of the documents containing the word w.
# Each document is a Hash with a :features array.
# BUG FIX: the original called `.to_f` on the filtered Array
# (NoMethodError) instead of on its element count; it also divided
# 0 by 0 for an empty corpus, which now returns 0.0.
def f(objects = [], w)
  return 0.0 if objects.empty?
  containing = objects.count { |obj| obj[:features].include?(w) }
  containing.to_f / objects.count
end
# Let P(i) be the global
# probability of class i
# Let P(i) be the global probability of class i across the corpus.
# NOTE(review): defining a top-level `p` shadows Kernel#p in this script.
def p(objects = [], i)
  labels = objects.map { |obj| obj[:type] }
  FrequencyDistribution.new(labels).freq(i)
end
# let pi(w) be the probability of class i, given that
# the document contains the word w.
# Let pi(w) be the probability of class i, given that the
# document contains the word w.
def pi(objects = [], i, w)
  dist = FrequencyDistribution.new
  objects.each do |doc|
    dist.add(doc[:type]) if doc[:features].include?(w)
  end
  dist.freq(i)
end
# Entropy of the class-label distribution over the whole corpus.
def entropy_of_classes(objects = [])
  class_labels = objects.map { |obj| obj[:type] }.flatten.uniq
  class_probs = class_labels.map { |label| p(objects, label) }
  entropy_reduction(class_probs)
end
# Entropy of the class labels restricted to documents containing word w.
def entropy_of_classes_given_feature(objects = [], w)
  class_labels = objects.map { |obj| obj[:type] }.flatten.uniq
  conditional_probs = class_labels.map { |label| pi(objects, label, w) }
  entropy_reduction(conditional_probs)
end
# Information gain of word w: how much knowing that a document
# contains w reduces the entropy of its class label.
def information_gain(objects = [], w)
  base = entropy_of_classes(objects)
  conditional = entropy_of_classes_given_feature(objects, w)
  base - conditional
end
# Demo corpus: each document carries a class label and a bag of
# numeric features. Print the information gain of each feature 1..9.
a = { :type => 'a', :features => [3, 9, 9, 9] }
b = { :type => 'a', :features => [1] }
c = { :type => 'b', :features => [2, 3, 4, 5, 6, 7, 8, 9] }
d = { :type => 'c', :features => [3, 4, 5, 6, 7, 8] }
sample_objects = [a, b, c, d]
(1..9).each do |num|
  puts "NUM = #{num}"
  puts information_gain(sample_objects, num)
  puts ""
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment