Skip to content

Instantly share code, notes, and snippets.

@YoshihitoAso
Created February 26, 2014 07:49
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save YoshihitoAso/9225377 to your computer and use it in GitHub Desktop.
Save YoshihitoAso/9225377 to your computer and use it in GitHub Desktop.
[MeCab][Python]mecabを利用して日本語文章をnaive bayes分類するサンプル
# coding: utf-8
import MeCab
import math
class Bayesian(object):
    """Naive-Bayes-style text classifier backed by MeCab tokenization.

    Accumulates per-category term frequencies during training and scores
    new term-frequency vectors with log-probabilities, substituting a small
    floor probability for terms never seen in a category.
    """

    def __init__(self):
        # BUGFIX: these were class attributes, so every Bayesian instance
        # shared (and polluted) the same counters. Make them per-instance.
        self.term_count = {}  # term -> {category -> occurrence count}
        self.cat_count = {}   # category -> number of trained term entries

    def MecabSpliter(self, text):
        """Tokenize *text* with MeCab and return a {surface: count} bag.

        Keeps only tokens whose part-of-speech id is >= 36, in 10..12, or
        in 31..33 (noun/verb/adjective ranges of the IPADIC tagset —
        presumably; confirm against the installed dictionary).
        """
        tagger = MeCab.Tagger("-Ochasen")
        node = tagger.parseToNode(text)
        words = {}
        # Stop before the trailing node whose .next is None (the EOS node).
        while node.next is not None:
            if (node.posid >= 36
                    or 10 <= node.posid <= 12
                    or 31 <= node.posid <= 33):
                key = node.surface.lower()
                words[key] = words.get(key, 0) + 1
            node = node.next
        return words

    def text2vec(self, text, spliter=None):
        """Convert *text* to a term-frequency dict (*spliter* is unused)."""
        return self.MecabSpliter(text)

    def train(self, wv, cat):
        """Accumulate the term-frequency vector *wv* under category *cat*."""
        for term, cnt in wv.items():
            if term not in self.term_count:
                self.term_count[term] = {}
            # BUGFIX: original tested `term in self.term_count[term]`, which
            # is never true, so repeated terms overwrote their counts.
            if cat in self.term_count[term]:
                self.term_count[term][cat] += cnt
            else:
                self.term_count[term][cat] = cnt
            # BUGFIX: original tested `term in self.cat_count`, which reset
            # the category counter back to 1 on nearly every term.
            if cat in self.cat_count:
                self.cat_count[cat] += 1
            else:
                self.cat_count[cat] = 1

    def catProb(self, cat):
        """Prior probability of *cat*: its share of all trained entries."""
        total = sum(self.cat_count.values())
        return 1.0 * self.cat_count[cat] / total

    def termCnt(self, term, cat):
        """Return how often *term* was seen under *cat* (0 if never)."""
        if term in self.term_count and cat in self.term_count[term]:
            return self.term_count[term][cat]
        return 0

    def termProb(self, term, cat):
        """Frequency of *term* within *cat* (may exceed 1.0 by design)."""
        # BUGFIX: force float division; under Python 2 the original used
        # integer division, truncating most probabilities to 0.
        return 1.0 * self.termCnt(term, cat) / self.cat_count[cat]

    def predict(self, vec):
        """Score *vec* against every known category; returns {cat: score}."""
        return dict((cat, self.score(vec, cat)) for cat in self.cat_count)

    def score(self, vec, cat):
        """Log-probability-style score of term vector *vec* under *cat*."""
        cat_prob = math.log(self.catProb(cat))
        # Floor probability used for terms unseen in this category
        # (termProb(...) == 0 is falsy, so `or` substitutes the floor).
        not_likely = 1.0 / (self.total_term_count() * 10)
        doc_prob = 0.0
        for term, count in vec.items():
            doc_prob += math.log(self.termProb(term, cat) or not_likely) * count
        return cat_prob + doc_prob

    def total_term_count(self):
        """Total number of trained term entries across all categories."""
        return sum(self.cat_count.values())
if __name__ == '__main__':
    # Demo: train a ramen-vs-soba classifier on labelled Japanese sentences,
    # then classify unlabelled ones.
    # (Renamed `filter` -> `classifier` and `input` -> `sample`: the
    # originals shadowed Python builtins. `print` statements converted to
    # the single-argument function form, valid on both Python 2 and 3.)
    classifier = Bayesian()

    training_samples = [
        {"text": "宅麺を食べた。", "judge": "ramen"},
        {"text": "カップラーメンが美味しかった。", "judge": "ramen"},
        {"text": "味噌ラーメン美味しい。", "judge": "ramen"},
        {"text": "家で袋ラーメンを作った。", "judge": "ramen"},
        {"text": "宅麺について考えている。", "judge": "ramen"},
        {"text": "富士そばは美味しい。", "judge": "soba"},
        {"text": "富士そばでカレー蕎麦食べた。", "judge": "soba"},
        {"text": "今日も富士そばである。", "judge": "soba"},
        {"text": "富士そばは演歌がかかっている。", "judge": "soba"},
        {"text": "あなたのおそばに、富士そば", "judge": "soba"},
        {"text": "富士そばはダイタングループだ。", "judge": "soba"},
        {"text": "名代富士そば。", "judge": "soba"},
        {"text": "浅草で蕎麦を食べた。", "judge": "soba"},
        {"text": "名代 富士そば 大宮東口店6月5日(火)新規開店!", "judge": "soba"},
    ]
    for sample in training_samples:
        words = classifier.text2vec(sample['text'])
        classifier.train(words, sample['judge'])

    print("============")

    srcs = [
        '味噌ラーメンを食べたいなぁ。',
        'カップラーメンの工場見学に行った。',
        '富士そばについて考えている。',
        '蕎麦食いたい。',
        'ラーメン食いたい。',
        '関西にも富士そばを出店すべき。',
        '富士そばのfacebookページを見ている。',
        '海外生活が続くと富士そばが食べたくなる。',
        '富士そばの出汁について考えている。',
    ]
    for src in srcs:
        words = classifier.text2vec(src)
        prob = classifier.predict(words)  # fixed `porb` typo
        # Pick the best-scoring category directly instead of sorting the
        # whole dict and keeping only the first entry via an is_first flag.
        best_tag, best_score = max(prob.items(), key=lambda kv: kv[1])
        print(str(best_tag) + ", " + str(best_score) + ", " + src)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment