Created
March 1, 2014 04:06
-
-
Save maliabadi/9284951 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
include Math | |
# Counts how many times each distinct sample value has been observed.
#
# Counts are kept in the +data+ hash, keyed by sample value, with the number
# of occurrences as the value.
class FrequencyDistribution
  # +data+ is the sample => count hash.
  # +samples+ is retained for interface compatibility; nothing in this class
  # assigns it, so it reads as nil.
  attr_reader :samples, :data

  # samples - optional collection of initial observations; each element is
  #           recorded through #add.
  def initialize(samples = [])
    @data = {}
    samples.each { |sample| add(sample) }
  end

  # Records one more occurrence of +sample+.
  #
  # Returns the updated count for that sample.
  def add(sample)
    data[sample] = (data[sample] || 0) + 1
  end

  # Returns the stored count for +key+, or nil when it was never added.
  def get(key)
    data[key]
  end

  # Returns the total number of observations recorded (0 when empty).
  def total_outcomes
    # The explicit 0 seed keeps the result numeric for an empty distribution.
    data.values.reduce(0, :+)
  end

  # Returns the number of distinct sample values.
  def total_sample_values
    data.size
  end

  # Returns the sample values that were observed exactly once.
  def hapaxes
    data.select { |_sample, count| count == 1 }.keys
  end

  # Returns a hash mapping each occurrence count to the number of distinct
  # samples that have that count.
  def frequency_to_sample_count
    data.values.each_with_object({}) do |count, histogram|
      histogram[count] = (histogram[count] || 0) + 1
    end
  end

  # Yields the running total of counts (as a Float) after each sample.
  #
  # samples - samples to walk, in order; defaults to every key in +data+.
  #           Samples that were never added contribute 0.
  #
  # Returns +samples+ when a block is given, otherwise an Enumerator over the
  # running totals.
  def cumulative_frequencies(samples = nil, &block)
    samples ||= data.keys
    return enum_for(:cumulative_frequencies, samples) unless block_given?

    running_total = 0.0
    samples.each do |sample|
      running_total += data.fetch(sample, 0)
      yield running_total
    end
  end

  # Returns the relative frequency of +sample+ as a Float: its count divided
  # by the total number of observations. Returns 0.0 for an empty
  # distribution or a sample that was never added.
  def freq(sample)
    total = total_outcomes
    return 0.0 if total.to_f == 0.0

    data.fetch(sample, 0).to_f / total
  end

  # Returns the first sample (in +data+ key order) holding the highest count,
  # or nil when the distribution is empty.
  def max
    highest = data.values.max # computed once rather than once per key
    data.keys.find { |sample| data[sample] == highest }
  end
end
# Computes the entropy of a list of labels.
#
# Builds a FrequencyDistribution from +labels+, takes the relative frequency
# of every distinct label, and passes those frequencies to entropy_reduction.
def entropy(labels)
  distribution = FrequencyDistribution.new(labels)
  probabilities = distribution.data.keys.map { |label| distribution.freq(label) }
  entropy_reduction(probabilities)
end
# Reduces a list of probabilities to the base-2 entropy -sum(p * log2(p)).
#
# probs - collection of numeric probabilities. A probability equal to zero
#         contributes 0 to the sum (its logarithm is never taken).
#
# Returns the negated sum; 0 for an empty list.
def entropy_reduction(probs)
  terms = probs.map do |prob|
    # Skip the logarithm for zero probabilities: log2(0) is -Infinity.
    prob.to_f == 0.0 ? 0 : prob * Math.log2(prob)
  end
  # Seeding with 0 keeps the result numeric when +probs+ is empty.
  terms.reduce(0, :+) * -1
end
# Let F(w) be the fraction of the documents containing the word w.
#
# objects - array of hashes, each with a :features collection.
# w       - the feature (word) to look for.
#
# Returns the fraction as a Float; 0.0 when +objects+ is empty.
def f(objects = [], w)
  return 0.0 if objects.empty? # avoid 0 / 0

  containing = objects.count { |obj| obj[:features].include?(w) }
  containing.to_f / objects.count
end
# Let P(i) be the global probability of class i: the relative frequency of
# +i+ among the :type values of all +objects+.
#
# NOTE(review): defined at the top level under the same name as Kernel#p.
def p(objects = [], i)
  class_labels = objects.map { |obj| obj[:type] }
  FrequencyDistribution.new(class_labels).freq(i)
end
# Let pi(w) be the probability of class i, given that the document contains
# the word w: the relative frequency of +i+ among the :type values of only
# those +objects+ whose :features include +w+.
def pi(objects = [], i, w)
  matching = objects.select { |obj| obj[:features].include?(w) }
  matching_types = matching.map { |obj| obj[:type] }
  FrequencyDistribution.new(matching_types).freq(i)
end
# Entropy of the class distribution over +objects+: collects the distinct
# :type values, looks up the global probability of each with p, and feeds
# those probabilities to entropy_reduction.
def entropy_of_classes(objects = [])
  distinct_types = objects.map { |obj| obj[:type] }.flatten.uniq
  class_probs = distinct_types.map { |type| p(objects, type) }
  entropy_reduction(class_probs)
end
# Entropy of the class distribution restricted to objects containing the
# feature +w+: for each distinct :type value, takes pi(objects, type, w) and
# feeds the resulting probabilities to entropy_reduction.
def entropy_of_classes_given_feature(objects = [], w)
  distinct_types = objects.map { |obj| obj[:type] }.flatten.uniq
  entropy_reduction(distinct_types.map { |type| pi(objects, type, w) })
end
# Difference between the entropy of the classes over all +objects+ and the
# entropy of the classes among objects that contain the feature +w+.
def information_gain(objects = [], w)
  overall = entropy_of_classes(objects)
  given_feature = entropy_of_classes_given_feature(objects, w)
  overall - given_feature
end
# Demo: print the information gain of each feature 1..9 over a small
# hand-written set of typed objects.
sample_objects = [
  { :type => 'a', :features => [3, 9, 9, 9] },
  { :type => 'a', :features => [1] },
  { :type => 'b', :features => [2, 3, 4, 5, 6, 7, 8, 9] },
  { :type => 'c', :features => [3, 4, 5, 6, 7, 8] }
]

1.upto(9) do |num|
  puts "NUM = #{num}"
  puts information_gain(sample_objects, num)
  puts ""
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment