Created
April 13, 2009 13:53
-
-
Save mumoshu/94452 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require "forwardable"
# A toy "naive Bayes"-style text classifier together with a small demo.
#
# NOTE: the nested classes are literally named Class and Object, so inside
# this module the bare constants Class and Object resolve to
# NaiveBayes::Class and NaiveBayes::Object rather than Ruby's core classes.
module NaiveBayes
  # One label known to the classifier plus the statistics gathered for it
  # while training.
  class Class
    extend Forwardable

    # The label this instance stands for (the key that was passed to train).
    attr_reader :name
    # How many training documents have been recorded for this label.
    attr_accessor :frequency

    # cls[term] and cls[term] = n read and write the accumulated term counts.
    def_delegators :@likelihood, :[], :[]=

    # name - the label for this class; stored as-is.
    def initialize(name)
      @name = name
      @frequency = 0
      # term => accumulated count; a term never seen in training reads as 0.
      @likelihood = Hash.new(0)
    end
  end

  # Collects per-label term counts and picks the best-scoring label for a
  # document.
  class Classifier
    # Hash of label => NaiveBayes::Class. Looking up an unknown label creates
    # and stores a fresh NaiveBayes::Class for it.
    attr_reader :classes

    def initialize
      @classes = Hash.new { |h, k| h[k] = NaiveBayes::Class.new(k) }
    end

    # Records one training document for label c.
    #
    # c   - the label (any hash key; the demo uses symbols).
    # doc - enumerable of [term, count] pairs, e.g. the hash returned by
    #       NaiveBayes::Object#freq.
    #
    # Returns the NaiveBayes::Class that was updated.
    def train(c, doc)
      cls = @classes[c]
      cls.frequency += 1
      doc.each { |term, freq| cls[term] += freq }
      cls
    end

    # Returns the name of the label with the highest score for doc, or nil
    # when no label has been trained yet.
    #
    # doc - enumerable of [term, count] pairs.
    #
    # When several labels share the top score (for instance every score is 0
    # because no term of doc was seen in training) the winner is whatever
    # Enumerable#max_by picks; callers should not rely on which one it is.
    def classify(doc)
      best = @classes.values.max_by { |c| score(c, doc) }
      best && best.name
    end

    private

    # Score of doc against cls: the label's document count multiplied by the
    # sum, over every [term, count] pair of doc, of
    # (accumulated training count of term) ** count.
    # This is a simple heuristic, not a normalised probability.
    def score(cls, doc)
      cls.frequency * doc.inject(0) { |sum, (term, freq)| sum + cls[term]**freq }
    end
  end

  # Wraps a string so it can be turned into a term-frequency hash.
  class Object
    # obj - the text to analyse; it must respond to split.
    def initialize(obj)
      @obj = obj
    end

    # Returns a Hash of term => number of occurrences. Terms are the
    # whitespace-separated pieces of the text, so punctuation stays attached
    # ("iPod?" and "iPod" are different terms). Missing terms read as 0 and
    # reading them does not add a key.
    def freq
      @obj.split(" ").each_with_object(Hash.new(0)) { |term, counts| counts[term] += 1 }
    end
  end

  # Shorthand for NaiveBayes::Object.new(obj).
  def self._(obj)
    Object.new(obj)
  end

  # Demo: trains a classifier on six short sentences labelled :mac or
  # :windows, then prints the predicted label for two unseen sentences
  # ("mac" followed by "windows"). Returns the array of test sentences.
  def self.test
    # data set
    data_set = [
      { :class => :mac,     :body => "How do I customize my iPod settings?" },
      { :class => :mac,     :body => "How do I set up an AirPort wireless network?" },
      { :class => :mac,     :body => "How do I set up Mac OS X Mail?" },
      { :class => :windows, :body => "You must be running Microsoft Internet Explorer 5 or later." },
      { :class => :windows, :body => "You can obtain updates from the Microsoft Download Center." },
      { :class => :windows, :body => "Get Office Live Basics for your business today." }
    ]

    # test set
    test_set = [
      "How do I sync audio and video to my iPod?",
      "We empower your business now!"
    ]

    # training
    classifier = NaiveBayes::Classifier.new
    data_set.each { |data| classifier.train(data[:class], _(data[:body]).freq) }

    # testing
    test_set.each { |data| puts classifier.classify(_(data).freq) }
  end
end
# Run the demo as soon as this file is loaded; the lines below record the
# output it prints.
NaiveBayes.test
# >> mac
# >> windows
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment