Created
February 26, 2014 07:49
-
-
Save YoshihitoAso/9225377 to your computer and use it in GitHub Desktop.
[MeCab][Python]mecabを利用して日本語文章をnaive bayes分類するサンプル
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: utf-8 | |
import MeCab | |
import math | |
class Bayesian(object):
    """Naive Bayes text classifier over MeCab word-count vectors.

    State:
      term_count: {term: {category: occurrence count}}
      cat_count:  {category: number of term observations seen for it}

    Scores are log-probabilities; unseen terms get a small floor
    probability instead of -inf.
    """

    def __init__(self):
        # Per-instance tables. (The original declared these as shared
        # class-level dicts, so counts leaked between instances.)
        self.term_count = {}
        self.cat_count = {}

    def MecabSpliter(self, text):
        """Tokenize *text* with MeCab and return {lowercased surface: count}.

        Keeps only content words by posid range (assumes the IPA
        dictionary's posid numbering -- verify if another dictionary
        is configured).
        """
        tagger = MeCab.Tagger("-Ochasen")
        node = tagger.parseToNode(text)
        words = {}
        # NOTE(review): `while node.next is not None` skips the final
        # node; kept as-is since the trailing node is the EOS marker.
        while node.next is not None:
            if (node.posid >= 36
                    or 10 <= node.posid <= 12
                    or 31 <= node.posid <= 33):
                key = node.surface.lower()
                words[key] = words.get(key, 0) + 1
            node = node.next
        return words

    def text2vec(self, text, spliter=None):
        """Convert *text* into a word-count vector.

        *spliter* is unused; retained for interface compatibility.
        """
        return self.MecabSpliter(text)

    def train(self, wv, cat):
        """Accumulate the word vector *wv* ({term: count}) under *cat*.

        Bug fixes vs. the original: the original compared `term` where
        `cat` was intended in both lookups, so per-category term counts
        were overwritten instead of summed, and cat_count was reset to 1
        on almost every term.
        """
        for term, cnt in wv.items():
            per_cat = self.term_count.setdefault(term, {})
            per_cat[cat] = per_cat.get(cat, 0) + cnt
            if cat in self.cat_count:
                self.cat_count[cat] += 1
            else:
                self.cat_count[cat] = 1

    def catProb(self, cat):
        """Return P(cat): fraction of all observations belonging to *cat*."""
        total = sum(self.cat_count.values())
        return 1.0 * self.cat_count[cat] / total

    def termCnt(self, term, cat):
        """Return the stored count of *term* under *cat* (0 if unseen)."""
        return self.term_count.get(term, {}).get(cat, 0)

    def termProb(self, term, cat):
        """Return P(term|cat) as a float.

        The leading 1.0 forces float division (under Python 2 the
        original truncated to 0 for any count below cat_count).
        """
        return 1.0 * self.termCnt(term, cat) / self.cat_count[cat]

    def predict(self, vec):
        """Return {category: log-score} for word vector *vec*."""
        return dict((cat, self.score(vec, cat)) for cat in self.cat_count)

    def score(self, vec, cat):
        """Return log P(cat) + sum of count-weighted log P(term|cat).

        Terms with zero probability fall back to a small floor
        (`not_likely`) so the log never sees 0.
        """
        cat_prob = math.log(self.catProb(cat))
        not_likely = 1.0 / (self.total_term_count() * 10)
        doc_prob = 0.0
        for term, count in vec.items():
            doc_prob += math.log(self.termProb(term, cat) or not_likely) * count
        return cat_prob + doc_prob

    def total_term_count(self):
        """Return the total number of observations across all categories."""
        return sum(self.cat_count.values())
if __name__ == '__main__':
    # Demo: train a ramen-vs-soba classifier on a few labelled
    # sentences, then classify unseen ones.
    # Renamed `filter` -> `classifier` and `input` -> `sample`
    # (both shadowed Python builtins); fixed the `porb` typo.
    classifier = Bayesian()

    training_data = [
        {"text":"宅麺を食べた。", "judge":"ramen"},
        {"text":"カップラーメンが美味しかった。", "judge":"ramen"},
        {"text":"味噌ラーメン美味しい。", "judge":"ramen"},
        {"text":"家で袋ラーメンを作った。", "judge":"ramen"},
        {"text":"宅麺について考えている。", "judge":"ramen"},
        {"text":"富士そばは美味しい。", "judge":"soba"},
        {"text":"富士そばでカレー蕎麦食べた。", "judge":"soba"},
        {"text":"今日も富士そばである。", "judge":"soba"},
        {"text":"富士そばは演歌がかかっている。", "judge":"soba"},
        {"text":"あなたのおそばに、富士そば", "judge":"soba"},
        {"text":"富士そばはダイタングループだ。", "judge":"soba"},
        {"text":"名代富士そば。", "judge":"soba"},
        {"text":"浅草で蕎麦を食べた。", "judge":"soba"},
        {"text":"名代 富士そば 大宮東口店6月5日(火)新規開店!", "judge":"soba"},
    ]
    for sample in training_data:
        words = classifier.text2vec(sample['text'])
        classifier.train(words, sample['judge'])

    # print(single_string) behaves identically on Python 2 and 3.
    print("============")

    srcs = [
        '味噌ラーメンを食べたいなぁ。',
        'カップラーメンの工場見学に行った。',
        '富士そばについて考えている。',
        '蕎麦食いたい。',
        'ラーメン食いたい。',
        '関西にも富士そばを出店すべき。',
        '富士そばのfacebookページを見ている。',
        '海外生活が続くと富士そばが食べたくなる。',
        '富士そばの出汁について考えている。',
    ]
    for src in srcs:
        words = classifier.text2vec(src)
        probs = classifier.predict(words)
        # Highest-scoring category wins (replaces the is_first flag
        # over a reverse-sorted list).
        best_tag, best_score = max(probs.items(), key=lambda kv: kv[1])
        print(str(best_tag) + ", " + str(best_score) + ", " + src)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment