Skip to content

Instantly share code, notes, and snippets.

@yuskesuzki
Created November 26, 2011 13:40
Show Gist options
  • Save yuskesuzki/1395683 to your computer and use it in GitHub Desktop.
Save yuskesuzki/1395683 to your computer and use it in GitHub Desktop.
$KCODE="u"
require "rubygems"
require "open-uri"
require "nkf"
require "kconv"
require "jcode"
require "rexml/document"
# Accumulates per-category feature counts and derives feature probabilities
# from them.
#
# Feature strings are kana-folded with String#tr('ァ-ン', 'ぁ-ん') before they
# are counted or looked up, so a katakana spelling and its hiragana spelling
# share one counter.
class Classifier
  def initialize
    @fc = {} # folded feature => { category => occurrence count }
    @cc = {} # category => number of trained items
  end

  # Returns the features of +item+. This base implementation returns +item+
  # unchanged, so #train expects +item+ to respond to #each.
  def getfeatures(item)
    item
  end

  # Increments the count of feature +f+ in +category+ and returns the new
  # count.
  def incf(f, category)
    per_category = (@fc[fold_kana(f)] ||= {})
    per_category[category] = per_category.fetch(category, 0) + 1
  end

  # Increments the number of items trained for +category+ and returns the new
  # count.
  def incc(category)
    @cc[category] = @cc.fetch(category, 0) + 1
  end

  # Returns, as a Float, how many times feature +f+ was counted in +category+
  # (0.0 when the pair was never seen).
  def fcount(f, category)
    per_category = @fc[fold_kana(f)]
    per_category ? per_category.fetch(category, 0).to_f : 0.0
  end

  # Returns, as a Float, the number of items trained for +category+.
  def catcount(category)
    @cc.fetch(category, 0).to_f
  end

  # Returns the number of items trained across all categories.
  def totalcount
    @cc.values.inject(0) { |sum, count| sum + count }
  end

  # Returns every category that has been trained at least once.
  def categories
    @cc.keys
  end

  # Returns every (kana-folded) feature that has been counted.
  def features
    @fc.keys
  end

  # Counts each non-nil feature of +item+ under +category+, then counts the
  # item itself. Always returns nil.
  def train(item, category)
    getfeatures(item).each do |feature|
      incf(feature, category) unless feature.nil?
    end
    incc(category)
    nil
  end

  # Returns fcount(f, category) / catcount(category) as a Float.
  # An untrained category yields 0.0 rather than a division by zero, so the
  # return type is a Float on every path.
  def fprob(f, category)
    trained = catcount(category)
    return 0.0 if trained.zero?

    fcount(f, category) / trained
  end

  # Blends fprob(f, category) with the assumed probability +ap+:
  #
  #   (weight * ap + totals * fprob) / (weight + totals)
  #
  # where +totals+ is the count of +f+ summed over all categories. A feature
  # that was never seen therefore evaluates to +ap+.
  def weightedprob(f, category, weight=1.0, ap=0.5)
    basicprob = fprob(f, category)
    totals = categories.inject(0) { |sum, c| sum + fcount(f, c) }
    ((weight * ap) + (totals * basicprob)) / (weight + totals)
  end

  private

  # Maps the characters ァ..ン onto ぁ..ん; shared by #incf and #fcount so that
  # counting and lookup always use the same key.
  def fold_kana(feature)
    feature.tr('ァ-ン', 'ぁ-ん')
  end
end
# Feature extraction for Classifier backed by the word-segmentation web
# service at YAHOO_MA_URI.
class Classifier
  # Application id sent with every request; replace the placeholder with a
  # real id before use.
  YAHOO_APPID = "(replace your yahoo appid)"

  # Endpoint that receives the sentence to be split into words.
  YAHOO_MA_URI = "http://jlp.yahooapis.jp/MAService/V1/parse"

  # Surface forms that are discarded: the long-vowel mark and small kana.
  YAHOO_STOPWORDS = %w[ー ぁ ぃ ぅ ぇ ぉ っ ゃ ゅ ょ ァ ィ ゥ ェ ォ ッ ャ ュ ョ].freeze

  # Splits +item+ into words via the web service and returns the surface form
  # of each word as an Array of Strings, without the entries listed in
  # YAHOO_STOPWORDS.
  #
  # Returns [] for an empty sentence (no request is made) and for a response
  # that contains no ResultSet/ma_result/word_list element. Network and HTTP
  # errors raised by open-uri propagate to the caller.
  def yahoo_text_analyze(item)
    # Convert to UTF-8 first: the tr pattern is UTF-8, so folding kana before
    # the conversion would operate on the wrong byte sequences for input in
    # any other encoding.
    sentence = item.toutf8.tr('ァ-ン', 'ぁ-ん')
    return [] if sentence.empty?

    xml = read_uri(ma_request_uri(sentence))
    word_list = REXML::Document.new(xml).elements['ResultSet/ma_result/word_list/']
    return [] if word_list.nil?

    surfaces = []
    word_list.elements.each('word') do |word|
      surface = word.elements['surface']
      text = surface && surface.text
      surfaces << text unless text.nil? || YAHOO_STOPWORDS.include?(text)
    end
    surfaces
  end
  alias_method :getfeatures, :yahoo_text_analyze

  private

  # Builds the request URL for +sentence+. Both query values are escaped so
  # that characters such as '&', '=', '+' or spaces cannot alter the query
  # string.
  def ma_request_uri(sentence)
    "#{YAHOO_MA_URI}?appid=#{escape_query_value(YAHOO_APPID)}" \
      "&results=ma&sentence=#{escape_query_value(sentence)}"
  end

  # Fetches +uri+ and returns the response body as a String. OpenURI's #read
  # closes the underlying handle itself, unlike a bare open(url).
  def read_uri(uri)
    URI.parse(uri).read
  end

  # Percent-encodes a single query-string value.
  def escape_query_value(value)
    if URI.respond_to?(:encode_www_form_component)
      URI.encode_www_form_component(value)
    else
      # Interpreters that predate encode_www_form_component: escape every
      # byte outside the RFC 3986 unreserved set.
      URI.escape(value, /[^a-zA-Z\d\-._~]/n)
    end
  end
end
# Classifier variant that scores a whole item against a category.
class Naivebayes < Classifier
  # Multiplies weightedprob(feature, category) over every feature that
  # getfeatures extracts from +item+ and returns the product. An item with no
  # features yields 1.
  def docprob(item, category)
    getfeatures(item).inject(1) do |product, feature|
      product * weightedprob(feature, category)
    end
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment